In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md

In [42]:
def cval(X, y, X_resampled, y_resampled, cval, estimator):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, cval)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
        
        strategy = {'functional needs repair': int((len(train_x))/3)}
        smote = SMOTE(sampling_strategy=strategy)
        train_x_resampled, train_y_resampled = smote.fit_resample(train_x, train_y)
        
        model = estimator
        model.fit(train_x_resampled, train_y_resampled)
        preds = model.predict(test_x)
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    return reports, matrices, numpy_report, numpy_matrix

In [4]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [5]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [6]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

# Defining the train and test sets

In [7]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [8]:
X_train_resampled = []
y_train_resampled = []

# Base Model – Logistic Regression, No Regularization

In [9]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, estimator)

In [10]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.789665,0.187338,0.842939,0.629136,0.606647,0.766292
recall,0.661616,0.74182,0.561557,0.629136,0.654998,0.629136
f1-score,0.719756,0.298833,0.674037,0.629136,0.564209,0.671701
support,4859.6,647.6,3402.8,0.629136,8910.0,8910.0


In [13]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3214.4,1324.8,320.4
functional needs repair,131.6,480.2,35.8
non functional,725.4,766.4,1911.0


# Second Model – Decision Tree

In [14]:
dtc = DecisionTreeClassifier()

In [15]:
dtc_reports, dtc_matrices, dtc_numpy_report, dtc_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, dtc)

In [16]:
dtc_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3811.6,430.0,618.0
functional needs repair,267.6,289.4,90.6
non functional,763.2,186.8,2452.8


In [17]:
dtc_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.787114,0.319449,0.775981,0.735556,0.627515,0.748924
recall,0.784309,0.447118,0.720804,0.735556,0.650743,0.735556
f1-score,0.785696,0.372403,0.747331,0.735556,0.635144,0.741044
support,4859.6,647.6,3402.8,0.735556,8910.0,8910.0


# Third Model - K Nearest Neighbors

In [18]:
knn = KNeighborsClassifier(n_neighbors=3)

In [19]:
knn_reports, knn_matrices, knn_numpy_report, knn_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, knn)

In [20]:
knn_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3930.4,370.6,558.6
functional needs repair,297.2,262.4,88.0
non functional,855.8,170.6,2376.4


In [21]:
knn_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.773254,0.326704,0.78626,0.737284,0.628739,0.74588
recall,0.808875,0.405531,0.698379,0.737284,0.637595,0.737284
f1-score,0.790573,0.361493,0.73961,0.737284,0.630559,0.73995
support,4859.6,647.6,3402.8,0.737284,8910.0,8910.0


# Fourth Model – Bagging Classifier

In [22]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [23]:
bagged_tree_reports, bagged_tree_matrices, bagged_tree_numpy_report, bagged_tree_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, bagged_tree)

In [24]:
bagged_tree_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4334.2,241.2,284.2
functional needs repair,328.6,247.4,71.6
non functional,1063.6,122.0,2217.2


In [25]:
bagged_tree_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.75693,0.404199,0.861808,0.763053,0.674312,0.771453
recall,0.891883,0.380825,0.651728,0.763053,0.641479,0.763053
f1-score,0.818837,0.391848,0.742096,0.763053,0.650927,0.758558
support,4859.6,647.6,3402.8,0.763053,8910.0,8910.0


# Fifth Model – Random Forest

In [26]:
forest = RandomForestClassifier()

In [27]:
forest_reports, forest_matrices, forest_numpy_report, forest_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, forest)

In [28]:
forest_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4033.8,363.2,462.6
functional needs repair,270.2,297.2,80.2
non functional,715.4,153.2,2534.2


In [29]:
forest_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.803636,0.365338,0.823569,0.770505,0.664181,0.77943
recall,0.83006,0.459293,0.744736,0.770505,0.67803,0.770505
f1-score,0.816622,0.406821,0.782167,0.770505,0.668537,0.77369
support,4859.6,647.6,3402.8,0.770505,8910.0,8910.0


# Sixth Model – XGBoost

In [34]:
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

xgb = XGBClassifier()

In [35]:
xgboost_reports, xgboost_matrices, xgboost_numpy_report, xgboost_numpy_matrix = cval(X_train, xgboost_y_train, X_train_resampled, y_train_resampled, 5, xgb)

In [36]:
xgboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2359.2,179.0,864.6
functional needs repair,76.8,302.2,268.6
non functional,367.8,383.2,4108.6


In [37]:
xgboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.841519,0.349979,0.783769,0.75982,0.658423,0.774304
recall,0.693376,0.46663,0.845409,0.75982,0.668472,0.75982
f1-score,0.760277,0.39987,0.813404,0.75982,0.657851,0.763062
support,3402.8,647.6,4859.6,0.75982,8910.0,8910.0


# Eigth Model – Adaboost Classifier

In [43]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)

In [44]:
adaboost_reports, adaboost_matrices, adaboost_numpy_report, adaboost_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, adaboost_clf)

In [45]:
adaboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3807.4,433.8,618.4
functional needs repair,257.8,294.2,95.6
non functional,740.6,189.6,2472.6


In [46]:
adaboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.792243,0.320716,0.775983,0.737845,0.629648,0.75179
recall,0.783483,0.454601,0.726635,0.737845,0.654906,0.737845
f1-score,0.787829,0.375802,0.750487,0.737845,0.638039,0.743633
support,4859.6,647.6,3402.8,0.737845,8910.0,8910.0


# Ninth Model – Gradient Boosting Classifier

In [47]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [48]:
gbt_reports, gbt_matrices, gbt_numpy_report, gbt_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, gbt_clf)

In [49]:
gbt_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4140.2,399.2,320.2
functional needs repair,292.2,288.8,66.6
non functional,1040.4,236.6,2125.8


In [50]:
gbt_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.756531,0.312384,0.845936,0.735668,0.638284,0.75843
recall,0.851969,0.446538,0.624631,0.735668,0.641046,0.735668
f1-score,0.801404,0.367422,0.718613,0.735668,0.629146,0.738265
support,4859.6,647.6,3402.8,0.735668,8910.0,8910.0


# Eleventh Model – Extra Randomized Trees

In [51]:
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [52]:
extra_trees_reports, extra_trees_matrices, extra_trees_numpy_report, extra_trees_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, extra_trees)

In [53]:
extra_trees_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3963.8,383.2,512.6
functional needs repair,269.2,295.0,83.4
non functional,734.0,160.4,2508.4


In [54]:
extra_trees_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.798044,0.352109,0.807992,0.759506,0.652715,0.769587
recall,0.815765,0.456422,0.737147,0.759506,0.669778,0.759506
f1-score,0.806746,0.396737,0.770935,0.759506,0.658139,0.763306
support,4859.6,647.6,3402.8,0.759506,8910.0,8910.0


In [55]:
# XGboost for non functional
# BaggingClassifier for functional
# LogisticRegression for functional needs repair

# Voting Classifier

In [56]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train)

In [57]:
strategy = {'functional needs repair': int(len(X_train_2)/3)}
smote = SMOTE(sampling_strategy=strategy)

X_train_2_resampled, y_train_2_resampled = smote.fit_resample(X_train, y_train)

In [58]:
vc_1 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50).fit(X_train_2_resampled, y_train_2_resampled)
vc_2 = XGBClassifier().fit(X_train_2_resampled, y_train_2_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2}))
vc_3 = LogisticRegression(solver='liblinear', fit_intercept=False).fit(X_train_2_resampled, y_train_2_resampled)

vc_preds_1 = vc_1.predict(X_test_2)
vc_preds_2 = vc_2.predict(X_test_2)
vc_preds_3 = vc_3.predict(X_test_2)

predictions_df = pd.DataFrame({'BaggingClassifier': vc_preds_1, 
                               'LogisticRegression': vc_preds_3, 
                               'XGBoost': vc_preds_2, 
                               'True Values': y_test_2})

In [59]:
predictions_df['XGBoost'].replace({0: 'functional', 1: 'functional needs repair', 2: 'non functional'}, inplace=True)

In [60]:
modes = []

for i in range(len(predictions_df)):
    arr = [predictions_df.BaggingClassifier.iloc[i], 
           predictions_df.LogisticRegression.iloc[i], 
           predictions_df.XGBoost.iloc[i]]
    mode = md(arr)
    modes.append(mode)

In [61]:
predictions_df['mode'] = modes

In [62]:
predictions_df

Unnamed: 0,BaggingClassifier,LogisticRegression,XGBoost,True Values,mode
8811,non functional,functional needs repair,functional needs repair,non functional,functional needs repair
6744,non functional,non functional,functional,non functional,non functional
35985,functional,functional needs repair,non functional,functional,functional
36763,functional,functional,non functional,functional,functional
30615,functional,functional needs repair,non functional,functional,functional
...,...,...,...,...,...
23082,functional,functional,non functional,functional,functional
40630,non functional,functional needs repair,functional,non functional,non functional
39632,non functional,functional needs repair,functional,non functional,non functional
38334,functional,functional,non functional,functional,functional


In [63]:
pd.DataFrame(classification_report(y_test_2, predictions_df['mode'], output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.828593,0.508358,0.910159,0.824295,0.749036,0.835565
recall,0.900178,0.635916,0.748069,0.824295,0.761388,0.824295
f1-score,0.862903,0.565027,0.821192,0.824295,0.749708,0.825641
support,6181.0,813.0,4144.0,0.824295,11138.0,11138.0


In [64]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, predictions_df['mode']), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,5564,349,268
functional needs repair,258,517,38
non functional,893,151,3100
