In [36]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

## Load Data

In [360]:
# Data directory. Set the DS4_DATA_DIR environment variable to point the
# notebook at another location; otherwise the original local folder is used.
# os.path.join(..., '') guarantees the trailing separator that the
# `path + '<file>.csv'` concatenations in the loading cell rely on.
path = os.path.join(
    os.environ.get(
        'DS4_DATA_DIR',
        '/Users/ridleyleisy/Documents/lambda/unit_two/DS-Unit-2-Classification-1/ds4-predictive-modeling-challenge/',
    ),
    '',
)

In [559]:
# Read the three challenge files from the local data directory.
train, test, labels = (
    pd.read_csv(path + filename)
    for filename in ('train_features.csv', 'test_features.csv', 'train_labels.csv')
)
# Example submission file, downloaded from Google Drive (needs network access).
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

## clean data

### numeric

In [560]:
def drop_cols_rows(df):
    """Return a copy of ``df`` without the ``num_private`` and ``recorded_by`` columns.

    The input frame is not modified, and columns that are already absent are
    skipped, so the calling cell can be re-run without raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the two columns removed; all rows are kept.
    """
    return df.drop(columns=['num_private', 'recorded_by'], errors='ignore')

In [561]:
# Apply the same column clean-up to both frames so train and test keep
# matching schemas.
train = drop_cols_rows(train)
test = drop_cols_rows(test)

#### replace zero longitude with the training mean

In [562]:
# Experiment option: keep every row and treat a longitude of 0 as a placeholder,
# replacing it with the mean of the non-zero training longitudes (the training
# mean is reused for the test frame).
long_mean = train.loc[train['longitude'] != 0, 'longitude'].mean()
# Assign the result back rather than calling replace(..., inplace=True) on the
# column selection: the in-place call operates on a temporary object and, under
# pandas Copy-on-Write, never updates the parent frame.
train['longitude'] = train['longitude'].replace(0, long_mean)
test['longitude'] = test['longitude'].replace(0, long_mean)

#### drop rows with zero longitude

In [563]:
# Experiment option: discard every row whose longitude is exactly 0.
# NOTE(review): this filters `test` as well as `train` - confirm that dropping
# test rows is intended.
train = train[train['longitude'].ne(0)]
test = test[test['longitude'].ne(0)]

#### reduce label size

In [564]:
# Keep exactly one label per remaining training row, in the same row order as
# `train` (features and labels are later paired positionally when X/y are built).
# Only the `id` column of `train` is needed for the join, and validate= fails
# fast if an id is duplicated on either side instead of silently misaligning.
labels = train[['id']].merge(
    labels[['id', 'status_group']],
    on='id',
    validate='one_to_one',
)

## Train construction year data

In [565]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import seaborn as sns

#### construction year predicted by random forest

In [566]:
# A construction year of 0 is recoded as NaN in both frames.
train['construction_year'] = train['construction_year'].mask(train['construction_year'].eq(0))
test['construction_year'] = test['construction_year'].mask(test['construction_year'].eq(0))

In [567]:
def transform_construction(df, random_state=42):
    """Predict the missing ``construction_year`` values of ``df`` with a random forest.

    A ``RandomForestRegressor`` is fitted on the rows whose ``construction_year``
    is known, using the numeric columns listed in ``features``, and is then used
    to predict the year for the rows where it is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``construction_year`` column (NaN = missing) and
        the feature columns listed below.
    random_state : int, optional
        Seed passed to the forest so repeated runs impute the same values.

    Returns
    -------
    numpy.ndarray
        One (unrounded) predicted year per row with a missing year, in row
        order. Empty when no year is missing.
    """
    numeric = df.select_dtypes(include=np.number)
    missing = numeric['construction_year'].isna()

    # Nothing to impute: return early rather than asking the model to predict
    # on zero rows, which raises.
    if not missing.any():
        return np.array([], dtype=float)

    # can only use these features since they differ
    features = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
                'region_code', 'district_code', 'population']
    target = 'construction_year'

    # Fit on every row with a known year. An earlier version split off a
    # hold-out set that was never evaluated, which only shrank the training data.
    known = numeric.loc[~missing]
    m = RandomForestRegressor(n_estimators=20, max_depth=25, random_state=random_state)
    m.fit(known[features], known[target])

    return m.predict(numeric.loc[missing, features])

In [569]:
# Fill the NaN years with the model predictions, then round to whole years;
# train first, then test.
train.loc[train['construction_year'].isna(), 'construction_year'] = transform_construction(train)
train['construction_year'] = train['construction_year'].round()

test.loc[test['construction_year'].isna(), 'construction_year'] = transform_construction(test)
test['construction_year'] = test['construction_year'].round()

#### construction year based on distribution

In [371]:
# Relative frequency of each observed construction year, computed separately
# for the train and test frames.
dist, dist_test = (
    frame['construction_year'].value_counts(normalize=True)
    for frame in (train, test)
)

In [372]:
# Fill the NaN construction years in `train` with years drawn at random
# according to the observed frequencies in `dist`.
train.loc[train['construction_year'].isna(), 'construction_year'] = np.random.choice(
    a=dist.index,
    size=train['construction_year'].isna().sum(),
    p=dist.values,
)

In [373]:
# Same random fill for `test`, using the frequencies observed in the test frame.
test.loc[test['construction_year'].isna(), 'construction_year'] = np.random.choice(
    a=dist_test.index,
    size=test['construction_year'].isna().sum(),
    p=dist_test.values,
)

### feature engineering

In [570]:
def add_construction_diff(df):
    """Add ``time_since_construction`` (in days) and normalise the date columns.

    Works on a copy, so the caller's frame is not modified. On the returned frame:

    * ``date_recorded`` is parsed to datetimes;
    * ``time_since_construction`` is the number of days between 1 January of
      ``construction_year`` and ``date_recorded``, with negative gaps set to 0;
    * ``construction_year`` holds the year taken from the parsed date
      (rows with a missing year stay missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_recorded`` and a numeric ``construction_year``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns described above.
    """
    out = df.copy()
    out['date_recorded'] = pd.to_datetime(out['date_recorded'])

    # Years can arrive as floats (e.g. 1999.0 once NaNs were introduced and
    # filled upstream). str(1999.0) is '1999.0', which a '%Y' format cannot
    # parse, so render each known year as a 4-digit integer string first.
    # Missing years are left as NaN and become NaT.
    years = pd.to_numeric(out['construction_year'])
    year_text = years.map(lambda y: '{:04d}'.format(int(round(y))), na_action='ignore')
    built = pd.to_datetime(year_text, format='%Y')

    elapsed_days = (out['date_recorded'] - built).dt.days
    # A record dated before the construction year would give a negative age.
    out['time_since_construction'] = elapsed_days.clip(lower=0)
    out['construction_year'] = built.dt.year
    return out

In [571]:
# Add the construction-age feature to both frames.
train, test = add_construction_diff(train), add_construction_diff(test)

## Encoding and Scaling Categorical Data

In [616]:
import category_encoders as ce
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [661]:
# Summary of the non-numeric columns (one row per column), ordered by the
# number of distinct values.
unique = train.describe(exclude=np.number).transpose().sort_values('unique')

In [573]:
# Non-numeric columns with fewer than 130 distinct values are kept for encoding.
cat_features = unique.index[(unique['unique'] < 130).values].tolist()

In [575]:
# Numeric inputs are used as-is; the low-cardinality categoricals get encoded.
numeric_features = [
    'amount_tsh',
    'gps_height',
    'longitude',
    'latitude',
    'time_since_construction',
    'region_code',
    'district_code',
    'population',
]
encode_features = cat_features
# Full model input: numeric columns first, then the categorical ones.
features = [*numeric_features, *encode_features]

### Pipeline

In [663]:
# One-hot encoder for the categorical columns; use_cat_names=True asks for
# indicator columns named after the category values.
encoder = ce.OneHotEncoder(use_cat_names=True)
# NOTE(review): with_scaling=False turns off the scaler's rescaling step, so
# presumably only its centering is applied - confirm this is intended.
scaler = RobustScaler(with_scaling=False)

In [664]:
# Model inputs and target; rows of the two objects are matched by position.
X_train, y_train = train[features], labels['status_group']

In [665]:
# Stratified 80/20 split into training and validation sets with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    train_size=0.80,
    random_state=42,
    stratify=y_train,
)

In [666]:
# Restrict both splits to the modelling columns.
X_train_sub, X_val_sub = X_train[features], X_val[features]

In [667]:
# Chain the encoder and the scaler so both steps are fitted and applied with a
# single call.
pipeline1 = Pipeline([('encoder',encoder),('scaler',scaler)])

In [668]:
# Fit the encoder + scaler on the training split and keep the results instead
# of discarding them: the model cells train on `X_train_scaled`, and the
# feature-importance tables read column names from `X_train_sub_encoded`.
X_train_scaled = pipeline1.fit_transform(X_train_sub)
# Encoded-but-unscaled frame from the fitted encoder step (keeps column names).
X_train_sub_encoded = pipeline1.named_steps['encoder'].transform(X_train_sub)
X_train_scaled

array([[ 1.000e+00, -1.000e+00,  0.000e+00, ...,  0.000e+00,  1.000e+00,
        -2.500e+01],
       [ 0.000e+00,  0.000e+00,  0.000e+00, ..., -1.000e+00,  1.000e+00,
         1.500e+02],
       [ 0.000e+00, -1.000e+00,  1.000e+00, ..., -6.000e+00, -2.000e+00,
         2.275e+03],
       ...,
       [ 0.000e+00, -1.000e+00,  1.000e+00, ...,  7.800e+01,  3.000e+01,
        -2.400e+01],
       [ 0.000e+00, -1.000e+00,  1.000e+00, ...,  7.800e+01,  3.000e+01,
         2.950e+02],
       [ 0.000e+00,  0.000e+00,  0.000e+00, ..., -7.000e+00,  3.000e+00,
         6.500e+01]])

In [669]:
# Apply the already-fitted pipeline to the validation split and keep the result
# for the `.score(X_val_scaled, y_val)` cells.
X_val_scaled = pipeline1.transform(X_val_sub)
X_val_scaled

array([[  0.,   0.,   0., ...,   6.,   4., -25.],
       [  0.,  -1.,   1., ...,   4.,  -1., -24.],
       [  1.,  -1.,   0., ...,  -9.,   0., 395.],
       ...,
       [  0.,   0.,   0., ...,   5.,   3., -25.],
       [  0.,   0.,   0., ...,  -8.,  -2., -24.],
       [  0.,   0.,   0., ...,  87.,  -2., 775.]])

## Logistic Regression

In [298]:
# Logistic regression baseline (lbfgs solver, at most 1000 iterations) fitted
# on the transformed training features.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# fit shown here was stopped before it finished.
model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
model.fit(X_train_scaled, y_train)

KeyboardInterrupt: 

In [None]:
# Score of the logistic regression on the validation split (no output recorded).
model.score(X_val_scaled,y_val)

## Decision Tree

In [389]:
from sklearn.tree import DecisionTreeClassifier

In [390]:
# Single decision tree, limited to a depth of 25.
m = DecisionTreeClassifier(max_depth=25)

In [391]:
# Fit the tree on the transformed training features.
m.fit(X_train_scaled,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=25,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [392]:
# Score on the held-out validation split.
m.score(X_val_scaled,y_val)

0.7585858585858586

In [393]:
# Score on the training split, for comparison with the validation score above
# (a large gap indicates overfitting).
m.score(X_train_scaled,y_train)

0.9402356902356902

In [394]:
# Ten largest feature importances of the fitted model, in ascending order,
# labelled with the column names of the encoded training frame.
pd.DataFrame(m.feature_importances_,X_train_sub_encoded.columns).sort_values(by=0).tail(10)

Unnamed: 0,0
district_code,0.010797
waterpoint_type_communal standpipe multiple,0.021741
amount_tsh,0.025102
population,0.040552
gps_height,0.053927
waterpoint_type_group_other,0.082631
time_since_construction,0.095104
latitude,0.115097
longitude,0.12842
quantity_group_dry,0.151526


## Random Forest

In [670]:
from sklearn.ensemble import RandomForestClassifier

In [688]:
# Random forest with 20 trees, each limited to a depth of 27.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated and later removed from scikit-learn, so the explicit spelling
# keeps this cell runnable on current versions without changing behaviour.
m = RandomForestClassifier(n_estimators=20, max_depth=27, max_features='sqrt')

In [689]:
# Fit the forest on the transformed training features.
m.fit(X_train_scaled,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=27, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [690]:
# Score on the held-out validation split.
m.score(X_val_scaled,y_val)

0.8024410774410774

In [691]:
# Score on the training split, for comparison with the validation score above.
m.score(X_train_scaled,y_train)

0.9598905723905724

In [699]:
# Feature importances of the fitted model as a one-column frame, sorted from
# least to most important and labelled with the encoded column names.
pd.DataFrame(m.feature_importances_,X_train_sub_encoded.columns).sort_values(by=0)

Unnamed: 0,0
extraction_type_-1,0.000000
quality_group_-1,0.000000
quantity_-1,0.000000
scheme_management_-1,0.000000
management_-1,0.000000
waterpoint_type_group_-1,0.000000
quantity_group_-1,0.000000
extraction_type_class_-1,0.000000
region_-1,0.000000
payment_type_-1,0.000000


In [692]:
# Predict the Kaggle test set with the fitted model `m`. The test features are
# transformed here with the fitted pipeline because `test_sub_scaled` is only
# created in a later cell, so a top-to-bottom run would otherwise fail here.
preds = m.predict(pipeline1.transform(test[features]))

In [611]:
# Display the predicted status labels.
preds

array(['non functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [404]:
# Feature importances as a Series indexed by encoded column name, ascending.
(pd.Series(m.feature_importances_,X_train_sub_encoded.columns)).sort_values()

payment_-1                                     0.000000
region_-1                                      0.000000
extraction_type_class_-1                       0.000000
extraction_type_-1                             0.000000
waterpoint_type_group_-1                       0.000000
basin_-1                                       0.000000
management_group_-1                            0.000000
waterpoint_type_-1                             0.000000
quantity_-1                                    0.000000
source_-1                                      0.000000
extraction_type_group_-1                       0.000000
quantity_group_-1                              0.000000
water_quality_-1                               0.000000
management_-1                                  0.000000
source_type_-1                                 0.000000
scheme_management_-1                           0.000000
public_meeting_-1                              0.000000
payment_type_-1                                0

In [323]:
# Display the encoded (indicator + numeric columns) training frame.
X_train_sub_encoded

Unnamed: 0,permit_nan,permit_True,permit_False,permit_-1,public_meeting_True,public_meeting_False,public_meeting_nan,public_meeting_-1,source_class_groundwater,source_class_surface,...,lga_Arusha Urban,lga_-1,amount_tsh,gps_height,longitude,latitude,time_since_construction,region_code,district_code,population
43360,1,0,0,0,1,0,0,0,1,0,...,0,0,0.0,0,33.542898,-9.174777,7147,12,4,0
7263,0,1,0,0,1,0,0,0,1,0,...,0,0,500.0,2049,34.665760,-9.308548,1177,11,4,175
2486,0,0,1,0,1,0,0,0,1,0,...,0,0,25.0,290,38.238568,-6.179919,430,6,1,2300
313,0,1,0,0,1,0,0,0,1,0,...,0,0,0.0,0,30.716727,-1.289055,941,18,1,0
52726,0,1,0,0,1,0,0,0,1,0,...,0,0,0.0,0,35.389331,-6.399942,433,1,6,0
8558,0,1,0,0,1,0,0,0,0,1,...,0,0,0.0,1295,31.214583,-8.431428,9349,15,2,200
2559,0,1,0,0,1,0,0,0,1,0,...,0,0,20000.0,1515,36.696700,-3.337926,6820,2,2,150
54735,0,0,1,0,1,0,0,0,1,0,...,0,0,0.0,0,36.292724,-5.177333,836,1,1,0
25763,0,0,1,0,0,1,0,0,1,0,...,0,0,0.0,0,32.877248,-8.925921,14459,12,6,0
44540,0,1,0,0,1,0,0,0,1,0,...,0,0,0.0,0,33.014412,-3.115869,5327,19,7,0


## Predicting for Kaggle

In [466]:
# Push the Kaggle test features through the same encoder and scaler objects
# that were used for the training data.
test_sub = test.loc[:, features]
test_sub_encoded = encoder.transform(X=test_sub)
test_sub_scaled = scaler.transform(X=test_sub_encoded)

In [1310]:
# Predict with `m`, the tree-based model that was fitted and scored on the
# validation split. `model` is the logistic regression whose fit was
# interrupted and whose validation score was never recorded, so it should not
# produce the submission.
preds = m.predict(test_sub_scaled)

In [612]:
# Start the submission table from the test ids.
submission = test['id'].to_frame()

In [613]:
# Attach the predicted label to each id (predictions are matched by position).
submission = submission.assign(status_group=preds)

In [614]:
# Display the submission table (id + predicted status_group).
submission

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
5,52449,functional
6,24806,functional
7,28965,non functional
8,36301,non functional
9,54122,functional


In [615]:
# Write the submission file to the working directory without the index column.
submission.to_csv(path_or_buf='test_submission.csv', index=False)