In [26]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

from ydata_profiling import ProfileReport
import numpy as np
import re
import requests
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
import types

In [2]:
df_train = pd.read_csv('clean_train_dataset.csv')
df_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,gps_height,longitude,latitude,wpt_name,num_private,basin,region,...,management_group,payment,payment_type,water_quality_group,quantity,source,source_type,source_class,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,1390,34.938093,-9.856322,none,0,Lake Nyasa,Iringa,...,user-group,pay annually,annually,good,enough,spring,spring,groundwater,communal standpipe,functional
1,8776,0.0,2013-03-06,1399,34.698766,-2.147466,Zahanati,0,Lake Victoria,Mara,...,user-group,never pay,never pay,good,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,functional
2,34310,25.0,2013-02-25,686,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Manyara,...,user-group,pay per bucket,per bucket,good,enough,dam,dam,surface,communal standpipe,functional
3,67743,0.0,2013-01-28,263,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mtwara,...,user-group,never pay,never pay,good,dry,machine dbh,borehole,groundwater,communal standpipe,non functional
4,19728,0.0,2011-07-13,0,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kagera,...,other,never pay,never pay,good,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,functional


In [3]:
df_test = pd.read_csv('clean_test_dataset.csv')
df_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,gps_height,longitude,latitude,wpt_name,num_private,basin,region,...,management,management_group,payment,payment_type,water_quality_group,quantity,source,source_type,source_class,waterpoint_type_group
0,50785,0.0,2013-02-04,1996,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Manyara,...,parastatal,parastatal,never pay,never pay,good,seasonal,rainwater harvesting,rainwater harvesting,surface,other
1,51630,0.0,2013-02-04,1569,36.656709,-3.309214,Kimnyak,0,Pangani,Arusha,...,vwc,user-group,never pay,never pay,good,insufficient,spring,spring,groundwater,communal standpipe
2,17168,0.0,2013-02-01,1567,34.767863,-5.004344,Puma Secondary,0,Internal,Singida,...,vwc,user-group,never pay,never pay,good,insufficient,rainwater harvesting,rainwater harvesting,surface,other
3,45559,0.0,2013-01-22,267,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Lindi,...,vwc,user-group,unknown,unknown,good,dry,shallow well,shallow well,groundwater,other
4,49871,500.0,2013-03-27,1260,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Ruvuma,...,water board,user-group,pay monthly,monthly,good,enough,spring,spring,groundwater,communal standpipe


In [4]:
df_train.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
gps_height                 int64
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
recorded_by               object
construction_year         object
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_type              object
water_quality_group       object
quantity                  object
source                    object
source_type               object
source_class              object
waterpoint_type_group     object
status_gro

In [6]:
# Find all columns that are objects.
cols = df_train.select_dtypes(exclude=[np.number])
list(cols)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'recorded_by',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type_group',
 'status_group']

In [8]:
# Create dataframe of 'best parameters' to be converted into dummy variables. 
cols2 = df_train[['waterpoint_type_group', 'construction_year', 'extraction_type_class','management',
                'management_group', 'payment', 'water_quality_group', 'quantity', 'basin',
                'region', 'source', 'source_class']]
list(cols2)

['waterpoint_type_group',
 'construction_year',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'water_quality_group',
 'quantity',
 'basin',
 'region',
 'source',
 'source_class']

In [11]:
# create dummy variables for columns in cols2
dummy_columns = pd.get_dummies(cols2)
dummy_columns.head()

Unnamed: 0,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other,construction_year_1960s,construction_year_1970s,construction_year_1980s,construction_year_1990s,...,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_groundwater,source_class_surface,source_class_unknown
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [10]:
df_train['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [12]:
y = df_train['status_group'].values
X = dummy_columns

In [17]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [18]:
from sklearn.discriminant_analysis import StandardScaler


ss = StandardScaler()
Xs = ss.fit_transform(X)

Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression


lor = LogisticRegression()
lor.fit(X_train, y_train)

lor.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7313131313131314

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = lor.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), 
             columns=['Pred +', 'Pred Fix', 'Pred -'], 
             index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.71      0.89      0.79      9682
functional needs repair       0.45      0.04      0.07      1342
         non functional       0.77      0.64      0.70      6796

               accuracy                           0.73     17820
              macro avg       0.65      0.52      0.52     17820
           weighted avg       0.72      0.73      0.70     17820



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,8633,33,1016
Act Fix,1042,51,249
Act -,2418,30,4348


In [23]:
preds = pd.DataFrame(y_preds)
n_test = pd.read_csv('clean_test_dataset.csv')
predict = pd.concat((n_test['id'], preds), axis=1)
predict.columns=['id', 'status_group']
predict.head()

Unnamed: 0,id,status_group
0,50785.0,non functional
1,51630.0,functional
2,17168.0,functional
3,45559.0,non functional
4,49871.0,non functional


Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


dt = RandomForestClassifier(class_weight = 'balanced')
s = cross_val_score(dt, X, y, n_jobs=-1)
print("{} Score:\t{:0.3} ± {:0.3}".format("Random Forest with Balanced Classes", s.mean().round(3), s.std().round(3)))

Random Forest with Balanced Classes Score:	0.702 ± 0.004


In [35]:
rfc_params = {'n_estimators':[2,5,10,20,50,75,150],
             'criterion':['gini', 'entropy'],
             'max_depth':[2,5,10,20,50,None],
             'min_samples_split':[2,5,10,20]}


grid_rfc = GridSearchCV(RandomForestClassifier(), rfc_params, cv=5, scoring='accuracy')
grid_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 5, 10, 20, 50, None],
                         'min_samples_split': [2, 5, 10, 20],
                         'n_estimators': [2, 5, 10, 20, 50, 75, 150]},
             scoring='accuracy')

In [36]:
grid_rfc.score(X_test, y_test)

0.7726711560044893

In [37]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = grid_rfc.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'], 
                                                         index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.75      0.91      0.82      9682
functional needs repair       0.55      0.15      0.24      1342
         non functional       0.83      0.70      0.76      6796

               accuracy                           0.77     17820
              macro avg       0.71      0.59      0.61     17820
           weighted avg       0.77      0.77      0.75     17820



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,8809,111,762
Act Fix,921,207,214
Act -,1986,57,4753


In [38]:

n_estimator = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [39]:
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=10)

In [40]:
grid_rfc.score(X_test, y_test)


0.802962962962963

In [41]:
y_preds = grid_rfc.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'], index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.78      0.93      0.85      8079
functional needs repair       0.65      0.20      0.31      1092
         non functional       0.87      0.73      0.80      5679

               accuracy                           0.80     14850
              macro avg       0.77      0.62      0.65     14850
           weighted avg       0.80      0.80      0.79     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7538,68,473
Act Fix,715,220,157
Act -,1464,49,4166


In [42]:
preds = pd.DataFrame(y_preds)
preds.head()

Unnamed: 0,0
0,functional
1,functional
2,functional
3,functional
4,functional


In [48]:
n_test = pd.read_csv('TestSetValues.csv')
n_test.head()

(14850, 40)

In [45]:

predict = pd.concat((n_test['id'], preds), axis=1)
predict.columns=['id', 'status_group']
predict['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [46]:
predict.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,functional
4,49871,functional
