In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md

In [45]:
def cval(X, y, X_resampled, y_resampled, cval, estimator):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, cval)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
        
        strategy = {'functional needs repair': int(10000*((cval-1) / (cval)))}
        smote = SMOTE(sampling_strategy=strategy)
        train_x_resampled, train_y_resampled = smote.fit_resample(train_x, train_y)
        
        model = estimator
        model.fit(train_x_resampled, train_y_resampled)
        preds = model.predict(test_x)
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    return reports, matrices, numpy_report, numpy_matrix

In [3]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [4]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [5]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

In [60]:
len(X_train)

44550

# Defining the train and test sets

In [6]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [7]:
strategy = {'functional needs repair': 10000}
smote = SMOTE(sampling_strategy=strategy)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Base Model – Logistic Regression, No Regularization

In [8]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, estimator)

In [9]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.783544,0.200335,0.83437,0.651448,0.606083,0.760832
recall,0.700097,0.687364,0.576056,0.651448,0.654505,0.651448
f1-score,0.739417,0.310133,0.681511,0.651448,0.57702,0.686006
support,4835.8,647.0,3427.2,0.651448,8910.0,8910.0


In [10]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3385.2,1100.2,350.4
functional needs repair,160.8,444.8,41.4
non functional,774.4,678.4,1974.4


# Second Model – Decision Tree

In [11]:
dtc = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20],
    'splitter': ['best', 'random']
}

gs_tree = GridSearchCV(dtc, param_grid, cv=3, verbose=10)
gs_tree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'splitter': 'best'}

In [3]:
dtc = DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_split= 5, min_samples_leaf=10, splitter='best')

NameError: name 'DecisionTreeClassifier' is not defined

In [13]:
dtc_reports, dtc_matrices, dtc_numpy_report, dtc_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, dtc)

In [14]:
dtc_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4306.6,147.2,382.0
functional needs repair,438.4,149.0,59.6
non functional,1419.4,59.0,1948.8


In [15]:
dtc_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.699036,0.426286,0.818812,0.718788,0.648045,0.725354
recall,0.89058,0.230744,0.568735,0.718788,0.563353,0.718788
f1-score,0.782882,0.295576,0.669567,0.718788,0.582675,0.703879
support,4835.8,647.0,3427.2,0.718788,8910.0,8910.0


# Third Model - K Nearest Neighbors

In [16]:
knn = KNeighborsClassifier(n_neighbors=3)

In [17]:
knn_reports, knn_matrices, knn_numpy_report, knn_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, knn)

In [20]:
knn_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3957.6,292.0,586.2
functional needs repair,305.0,250.8,91.2
non functional,859.2,151.2,2416.8


In [21]:
knn_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.772766,0.361001,0.781109,0.743569,0.638292,0.74615
recall,0.818426,0.38741,0.705188,0.743569,0.637008,0.743569
f1-score,0.794898,0.373612,0.741123,0.743569,0.636544,0.743658
support,4835.8,647.0,3427.2,0.743569,8910.0,8910.0


# Fourth Model – Bagging Classifier

In [22]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [24]:
bagged_tree_reports, bagged_tree_matrices, bagged_tree_numpy_report, bagged_tree_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, bagged_tree)

In [29]:
bagged_tree_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4435.8,88.4,311.6
functional needs repair,412.2,157.8,77.0
non functional,1106.0,45.8,2275.4


In [30]:
bagged_tree_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.745073,0.540483,0.854064,0.770932,0.713207,0.77219
recall,0.917305,0.243837,0.663988,0.770932,0.608377,0.770932
f1-score,0.822235,0.335935,0.74705,0.770932,0.635073,0.758005
support,4835.8,647.0,3427.2,0.770932,8910.0,8910.0


# Fifth Model – Random Forest

In [25]:
forest = RandomForestClassifier()

In [26]:
forest_reports, forest_matrices, forest_numpy_report, forest_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, forest)

In [27]:
forest_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4091.4,274.6,469.8
functional needs repair,282.0,280.4,84.6
non functional,741.0,124.0,2562.2


In [28]:
forest_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.799943,0.412676,0.822103,0.778227,0.67824,0.780475
recall,0.846044,0.432957,0.747569,0.778227,0.675524,0.778227
f1-score,0.822331,0.422461,0.783052,0.778227,0.675948,0.778281
support,4835.8,647.0,3427.2,0.778227,8910.0,8910.0


# Sixth Model – XGBoost

In [41]:
xgboost_y_train_resampled = y_train_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

xgb = XGBClassifier()

In [42]:
xgboost_reports, xgboost_matrices, xgboost_numpy_report, xgboost_numpy_matrix = cval(X_train, xgboost_y_train, X_train_resampled, xgboost_y_train_resampled, 5, xgb)

In [43]:
xgboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2387.0,123.0,917.2
functional needs repair,78.4,261.0,307.6
non functional,374.8,249.4,4211.6


In [44]:
xgboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.84041,0.412803,0.77471,0.769877,0.675975,0.773815
recall,0.69648,0.403649,0.870918,0.769877,0.657016,0.769877
f1-score,0.761679,0.407692,0.819973,0.769877,0.663115,0.767661
support,3427.2,647.0,4835.8,0.769877,8910.0,8910.0


# Eigth Model – Adaboost Classifier

In [46]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)

In [47]:
adaboost_reports, adaboost_matrices, adaboost_numpy_report, adaboost_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, adaboost_clf)

In [48]:
adaboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3876.0,346.8,613.0
functional needs repair,271.4,272.6,103.0
non functional,768.2,159.4,2499.6


In [49]:
adaboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.788517,0.350196,0.777344,0.74615,0.638686,0.7524
recall,0.801524,0.421628,0.729349,0.74615,0.650834,0.74615
f1-score,0.794958,0.382474,0.752574,0.74615,0.643335,0.748697
support,4835.8,647.0,3427.2,0.74615,8910.0,8910.0


# Ninth Model – Gradient Boosting Classifier

In [50]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [51]:
gbt_reports, gbt_matrices, gbt_numpy_report, gbt_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, gbt_clf)

In [52]:
gbt_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4239.8,238.8,357.2
functional needs repair,345.0,226.2,75.8
non functional,1124.0,136.4,2166.8


In [53]:
gbt_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.742677,0.376828,0.833602,0.744422,0.651035,0.751159
recall,0.876731,0.349592,0.632231,0.744422,0.619518,0.744422
f1-score,0.804149,0.362162,0.719006,0.744422,0.628439,0.739347
support,4835.8,647.0,3427.2,0.744422,8910.0,8910.0


# Eleventh Model – Extra Randomized Trees

In [54]:
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [55]:
extra_trees_reports, extra_trees_matrices, extra_trees_numpy_report, extra_trees_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, extra_trees)

In [56]:
extra_trees_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4039.0,282.8,514.0
functional needs repair,287.0,271.6,88.4
non functional,757.2,139.4,2530.6


In [57]:
extra_trees_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.794583,0.391604,0.807693,0.767811,0.664627,0.770462
recall,0.835242,0.420007,0.738373,0.767811,0.664541,0.767811
f1-score,0.814397,0.404954,0.77147,0.767811,0.663607,0.768196
support,4835.8,647.0,3427.2,0.767811,8910.0,8910.0


In [58]:
# XGboost for non functional
# BaggingClassifier for functional
# LogisticRegression for functional needs repair

# Voting Classifier

In [148]:
# X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train)

In [149]:
strategy = {'functional needs repair': int(len(X_train_2)/4)}
smote = SMOTE(sampling_strategy=strategy)

# X_train_2_resampled, y_train_2_resampled = smote.fit_resample(X_train, y_train)

In [150]:
vc_1 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50).fit(X_train_resampled, y_train_resampled)
vc_2 = XGBClassifier().fit(X_train_resampled, y_train_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2}))
vc_3 = LogisticRegression(solver='liblinear', fit_intercept=False).fit(X_train_resampled, y_train_resampled)

vc_preds_1 = vc_1.predict(X_test)
vc_preds_2 = vc_2.predict(X_test)
vc_preds_3 = vc_3.predict(X_test)

predictions_df = pd.DataFrame({'BaggingClassifier': vc_preds_1, 
                               'LogisticRegression': vc_preds_3, 
                               'XGBoost': vc_preds_2, 
                               'True Values': y_test})

In [151]:
predictions_df['XGBoost'].replace({0: 'functional', 1: 'functional needs repair', 2: 'non functional'}, inplace=True)

In [152]:
modes = []

for i in range(len(predictions_df)):
    arr = [predictions_df.BaggingClassifier.iloc[i], 
           predictions_df.LogisticRegression.iloc[i], 
           predictions_df.XGBoost.iloc[i]]
    mode = md(arr)
    modes.append(mode)

In [153]:
predictions_df['mode'] = modes

In [154]:
predictions_df

Unnamed: 0,BaggingClassifier,LogisticRegression,XGBoost,True Values,mode
41646,functional,functional,non functional,non functional,functional
18556,functional,functional needs repair,non functional,functional,functional
5861,functional,functional needs repair,non functional,functional,functional
36242,functional,functional,non functional,functional,functional
17685,functional,non functional,functional,non functional,functional
...,...,...,...,...,...
7663,non functional,non functional,functional,non functional,non functional
40863,functional,functional,non functional,functional,functional
52733,functional,functional needs repair,non functional,functional needs repair,functional
28391,functional,functional,non functional,functional,functional


In [155]:
pd.DataFrame(classification_report(y_test, predictions_df['mode'], output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.743919,0.384547,0.846463,0.745859,0.65831,0.757012
recall,0.874381,0.400185,0.629044,0.745859,0.634537,0.745859
f1-score,0.803891,0.39221,0.721735,0.745859,0.639279,0.742427
support,8080.0,1082.0,5688.0,0.745859,14850.0,14850.0


In [156]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test, predictions_df['mode']), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,7065,477,538
functional needs repair,538,433,111
non functional,1894,216,3578


# Stacking Classifier

In [101]:
stck = StackingClassifier(estimators = [('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False)), 
                                       ('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50))], 
                          final_estimator = XGBClassifier())

In [102]:
stck.fit(X_train_resampled, y_train_resampled)

In [103]:
stck_preds = stck.predict(X_test)

In [104]:
pd.DataFrame(classification_report(y_test, stck_preds, output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.885877,0.60479,0.93961,0.878165,0.810093,0.885587
recall,0.926696,0.756554,0.828814,0.878165,0.837355,0.878165
f1-score,0.905827,0.672213,0.880741,0.878165,0.819594,0.879724
support,6207.0,801.0,4130.0,0.878165,11138.0,11138.0


In [105]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test, stck_preds), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,5752,273,182
functional needs repair,157,606,38
non functional,584,123,3423
