In [713]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns

## Load Data

In [884]:
path = '/Users/ridleyleisy/Documents/lambda/unit_two/DS-Unit-2-Classification-1/ds4-predictive-modeling-challenge/'

In [885]:
train = pd.read_csv(path + 'train_features.csv')
test = pd.read_csv(path + 'test_features.csv')
labels = pd.read_csv(path + 'train_labels.csv')

## clean data

### numeric

In [886]:
train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [887]:
def drop_cols_rows(df):
    df.drop('num_private',axis=1,inplace=True)
    df.drop('recorded_by',axis=1,inplace=True)
    df = df.loc[df['longitude'] != 0]
    return df

In [888]:
train, test = drop_cols_rows(train), drop_cols_rows(test)

In [889]:
labels = labels.merge(train,on='id')[['id','status_group']]

## Train construction year data

In [890]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [891]:
def transform_construction(df):
    df['construction_year'] = df['construction_year'].replace(0,np.nan)
    df = df.select_dtypes(include=np.number)
    X = df.loc[~df['construction_year'].isna()]
    
    # can only use these featuers since they differ 
    features = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
       'region_code', 'district_code', 'population']
    target = 'construction_year'
    
    X_train = X[features]
    y_train = X[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train)
    m = DecisionTreeRegressor(max_depth=20)
    m.fit(X_train,y_train)
    
    vals = m.predict(df.loc[df['construction_year'].isna()][features])
    
    return vals

In [892]:
train.loc[train['construction_year'].isna(),'construction_year'] = transform_construction(train)
test.loc[test['construction_year'].isna(),'construction_year'] = transform_construction(test)

In [893]:
train['construction_year'] = round(train['construction_year'])
test['construction_year'] = round(test['construction_year'])

### feature engineering

In [894]:
def add_construction_diff(df):
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['construction_year'] = pd.to_datetime(df['construction_year'],format="%Y")
    df['time_since_construction'] = (df['date_recorded'] - df['construction_year']).dt.days
    return df

In [895]:
test = add_construction_diff(test)
train = add_construction_diff(train)

## Encoding Categorical Data

In [896]:
train.describe(exclude=np.number).T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq,first,last
public_meeting,54612,2,True,49737,,
permit,54532,2,True,38100,,
source_class,57588,3,groundwater,44204,,
quantity,57588,5,enough,32260,,
quantity_group,57588,5,enough,32260,,
management_group,57588,5,user-group,50767,,
quality_group,57588,6,good,49431,,
waterpoint_type_group,57588,6,communal standpipe,34334,,
payment_type,57588,7,never pay,24380,,
source_type,57588,7,spring,17006,,


Let's encode
1. source_class
2. quantity
3. management_group
4. payment type
5. source type

In [897]:
features = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
       'region_code', 'district_code', 'population',
       'source_class','quantity','management_group','payment','source']

In [898]:
encode_features = ['source_class','quantity','management_group','payment','source']

In [899]:
X_train = train[features]
y_train = labels['status_group']

In [900]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=0.80, test_size=0.20, 
    stratify=y_train, random_state=42)

In [901]:
import category_encoders as ce

In [902]:
encoded_features = ['source_class','quantity','management_group','payment','source']

In [903]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [904]:
from sklearn.preprocessing import StandardScaler

In [905]:
X_train_sub = X_train[features]
X_test_sub = X_test[features]

In [906]:
X_train_sub_encoded = encoder.fit_transform(X_train_sub)

In [907]:
X_test_sub_encoded = encoder.transform(X_test_sub)

In [908]:
scaler = StandardScaler()

In [909]:
X_train_scaled = scaler.fit_transform(X_train_sub_encoded)
X_test_scaled = scaler.transform(X_test_sub_encoded)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [910]:
X_train_sub_encoded

Unnamed: 0,source_class_surface,source_class_groundwater,source_class_unknown,source_class_-1,quantity_enough,quantity_seasonal,quantity_insufficient,quantity_dry,quantity_unknown,quantity_-1,...,source_other,source_unknown,source_-1,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population
51775,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0.0,0,35.701431,-6.414274,1,6,0
38852,0,1,0,0,1,0,0,0,0,0,...,0,0,0,6000.0,1290,34.960272,-9.869398,11,5,150
34004,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0.0,-12,38.896355,-7.927481,60,53,1
7913,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0.0,1565,34.697952,-1.900715,20,2,2000
7932,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0.0,0,33.058693,-8.573507,12,1,0
49064,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1000.0,379,36.861861,-9.020665,5,4,350
5229,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0.0,-13,38.851719,-5.782898,4,5,80
28296,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0.0,0,32.358642,-8.639323,12,6,0
10096,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0.0,0,35.754027,-6.352250,1,5,0
25363,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0.0,280,39.124346,-9.900209,80,53,620


In [911]:
X_train_scaled

array([[ 1.84396069, -1.81968359, -0.06863383, ..., -0.79561912,
         0.02912231, -0.38317407],
       [-0.54231091,  0.54954609, -0.06863383, ..., -0.23659891,
        -0.07354565, -0.07425573],
       [-0.54231091,  0.54954609, -0.06863383, ...,  2.50260015,
         4.8545165 , -0.38111462],
       ...,
       [ 1.84396069, -1.81968359, -0.06863383, ..., -0.29250093,
        -0.38154953,  0.85249929],
       [-0.54231091,  0.54954609, -0.06863383, ..., -0.12479486,
        -0.27888157,  0.13168983],
       [ 1.84396069, -1.81968359, -0.06863383, ...,  0.26651929,
        -0.38154953,  0.81131017]])

In [912]:
from sklearn.linear_model import LogisticRegression

In [913]:
model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
model.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [914]:
model.score(X_test_sub_encoded,y_test)

0.511373502344157

In [915]:
X_train_scaled

array([[ 1.84396069, -1.81968359, -0.06863383, ..., -0.79561912,
         0.02912231, -0.38317407],
       [-0.54231091,  0.54954609, -0.06863383, ..., -0.23659891,
        -0.07354565, -0.07425573],
       [-0.54231091,  0.54954609, -0.06863383, ...,  2.50260015,
         4.8545165 , -0.38111462],
       ...,
       [ 1.84396069, -1.81968359, -0.06863383, ..., -0.29250093,
        -0.38154953,  0.85249929],
       [-0.54231091,  0.54954609, -0.06863383, ..., -0.12479486,
        -0.27888157,  0.13168983],
       [ 1.84396069, -1.81968359, -0.06863383, ...,  0.26651929,
        -0.38154953,  0.81131017]])