In [89]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from sklearn.ensemble import StackingClassifier

In [10]:
def cval(X, y, X_resampled, y_resampled, cval, estimator):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, cval)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
        
        smote = SMOTE()
        train_x_resampled, train_y_resampled = smote.fit_resample(train_x, train_y)
        
        model = estimator
        model.fit(train_x_resampled, train_y_resampled)
        preds = model.predict(test_x)
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    return reports, matrices, numpy_report, numpy_matrix

In [11]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [12]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [13]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

# Defining the train and test sets

In [14]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [15]:
X_train_resampled = []
y_train_resampled = []

# Base Model – Logistic Regression, No Regularization

In [16]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, estimator)

In [17]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.8217,0.161865,0.796776,0.571448,0.593447,0.764292
recall,0.51963,0.821183,0.597427,0.571448,0.64608,0.571448
f1-score,0.636487,0.270353,0.68273,0.571448,0.529857,0.627751
support,4836.0,646.4,3427.6,0.571448,8910.0,8910.0


In [20]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2513.2,1845.8,477.0
functional needs repair,69.6,530.8,46.0
non functional,475.8,904.2,2047.6


# Second Model – Decision Tree

In [21]:
dtc = DecisionTreeClassifier()

In [22]:
dtc_reports, dtc_matrices, dtc_numpy_report, dtc_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, dtc)

In [23]:
dtc_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3648.0,499.4,688.6
functional needs repair,237.4,303.0,106.0
non functional,684.0,197.2,2546.4


In [24]:
dtc_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.798358,0.303333,0.762156,0.729226,0.621282,0.748551
recall,0.75433,0.468551,0.742833,0.729226,0.655238,0.729226
f1-score,0.775715,0.368077,0.752351,0.729226,0.632048,0.737201
support,4836.0,646.4,3427.6,0.729226,8910.0,8910.0


# Third Model - K Nearest Neighbors

In [25]:
knn = KNeighborsClassifier(n_neighbors=3)

In [26]:
knn_reports, knn_matrices, knn_numpy_report, knn_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, knn)

In [27]:
knn_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3763.0,413.6,659.4
functional needs repair,269.4,292.6,84.4
non functional,789.0,189.0,2449.6


In [28]:
knn_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.780507,0.326653,0.767054,0.730101,0.624738,0.742466
recall,0.778111,0.452669,0.714707,0.730101,0.648496,0.730101
f1-score,0.779281,0.379192,0.739941,0.730101,0.632805,0.735151
support,4836.0,646.4,3427.6,0.730101,8910.0,8910.0


# Fourth Model – Bagging Classifier

In [29]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [30]:
bagged_tree_reports, bagged_tree_matrices, bagged_tree_numpy_report, bagged_tree_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, bagged_tree)

In [31]:
bagged_tree_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3795.2,520.2,520.6
functional needs repair,225.8,325.6,95.0
non functional,720.6,199.0,2508.0


In [32]:
bagged_tree_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.800437,0.311584,0.802959,0.743973,0.638327,0.765997
recall,0.784772,0.503821,0.731707,0.743973,0.673433,0.743973
f1-score,0.792504,0.384954,0.765616,0.743973,0.647691,0.752626
support,4836.0,646.4,3427.6,0.743973,8910.0,8910.0


# Fifth Model – Random Forest

In [33]:
forest = RandomForestClassifier()

In [34]:
forest_reports, forest_matrices, forest_numpy_report, forest_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, forest)

In [35]:
forest_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3850.4,449.6,536.0
functional needs repair,229.0,322.6,94.8
non functional,647.8,178.8,2601.0


In [36]:
forest_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.814577,0.338619,0.804829,0.760269,0.652675,0.776311
recall,0.796175,0.498211,0.758835,0.760269,0.684407,0.760269
f1-score,0.805251,0.403011,0.781143,0.760269,0.663135,0.766833
support,4836.0,646.4,3427.6,0.760269,8910.0,8910.0


# Sixth Model – XGBoost

In [39]:
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

xgb = XGBClassifier()

In [40]:
xgboost_reports, xgboost_matrices, xgboost_numpy_report, xgboost_numpy_matrix = cval(X_train, xgboost_y_train, X_train_resampled, y_train_resampled, 5, xgb)

In [41]:
xgboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2478.8,233.6,715.2
functional needs repair,79.6,367.4,199.4
non functional,472.2,623.0,3740.8


In [42]:
xgboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.818,0.300408,0.803569,0.739282,0.640659,0.772608
recall,0.723146,0.569017,0.773513,0.739282,0.688559,0.739282
f1-score,0.767629,0.393057,0.788235,0.739282,0.649641,0.751639
support,3427.6,646.4,4836.0,0.739282,8910.0,8910.0


# Eigth Model – Adaboost Classifier

In [45]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)

In [46]:
adaboost_reports, adaboost_matrices, adaboost_numpy_report, adaboost_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, adaboost_clf)

In [47]:
adaboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3705.0,485.4,645.6
functional needs repair,234.2,311.0,101.2
non functional,681.6,198.0,2548.0


In [48]:
adaboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.80181,0.312654,0.773408,0.7367,0.629291,0.755441
recall,0.766121,0.480593,0.743372,0.7367,0.663362,0.7367
f1-score,0.783524,0.378655,0.758024,0.7367,0.640068,0.744386
support,4836.0,646.4,3427.6,0.7367,8910.0,8910.0


# Ninth Model – Gradient Boosting Classifier

In [49]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [50]:
gbt_reports, gbt_matrices, gbt_numpy_report, gbt_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, gbt_clf)

In [51]:
gbt_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3539.0,809.0,488.0
functional needs repair,184.8,384.6,77.0
non functional,768.0,371.4,2288.2


In [52]:
gbt_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.787877,0.245773,0.801978,0.697172,0.611876,0.754015
recall,0.731794,0.594799,0.667625,0.697172,0.664739,0.697172
f1-score,0.758796,0.347725,0.728615,0.697172,0.611712,0.717391
support,4836.0,646.4,3427.6,0.697172,8910.0,8910.0


# Eleventh Model – Extra Randomized Trees

In [53]:
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [54]:
extra_trees_reports, extra_trees_matrices, extra_trees_numpy_report, extra_trees_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, extra_trees)

In [55]:
extra_trees_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3825.2,441.2,569.6
functional needs repair,230.4,322.6,93.4
non functional,659.8,179.4,2588.4


In [56]:
extra_trees_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.811226,0.342054,0.796102,0.756027,0.649794,0.771415
recall,0.791003,0.499332,0.755213,0.756027,0.681849,0.756027
f1-score,0.80096,0.405773,0.775094,0.756027,0.660609,0.762342
support,4836.0,646.4,3427.6,0.756027,8910.0,8910.0


In [57]:
# XGboost for non functional
# RandomForest for functional
# LogisticRegression for functional needs repair

# Voting Classifier

In [78]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train)

In [79]:
smote = SMOTE()
X_train_2_resampled, y_train_2_resampled = smote.fit_resample(X_train, y_train)

In [80]:
vc_1 = RandomForestClassifier().fit(X_train_2_resampled, y_train_2_resampled)
vc_2 = XGBClassifier().fit(X_train_2_resampled, y_train_2_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2}))
vc_3 = LogisticRegression(solver='liblinear', fit_intercept=False).fit(X_train_2_resampled, y_train_2_resampled)

vc_preds_1 = vc_1.predict(X_test_2)
vc_preds_2 = vc_2.predict(X_test_2)
vc_preds_3 = vc_3.predict(X_test_2)

predictions_df = pd.DataFrame({'BaggingClassifier': vc_preds_1, 
                               'LogisticRegression': vc_preds_3, 
                               'XGBoost': vc_preds_2, 
                               'True Values': y_test_2})

In [81]:
predictions_df['XGBoost'].replace({0: 'functional', 1: 'functional needs repair', 2: 'non functional'}, inplace=True)

In [82]:
modes = []

for i in range(len(predictions_df)):
    arr = [predictions_df.BaggingClassifier.iloc[i], 
           predictions_df.LogisticRegression.iloc[i], 
           predictions_df.XGBoost.iloc[i]]
    mode = md(arr)
    modes.append(mode)

In [83]:
predictions_df['mode'] = modes

In [84]:
predictions_df

Unnamed: 0,BaggingClassifier,LogisticRegression,XGBoost,True Values,mode
42688,non functional,non functional,functional,non functional,non functional
34624,non functional,non functional,functional,non functional,non functional
35506,non functional,functional,non functional,non functional,non functional
12391,functional needs repair,functional needs repair,functional needs repair,functional needs repair,functional needs repair
18382,functional needs repair,functional needs repair,non functional,functional needs repair,functional needs repair
...,...,...,...,...,...
39628,non functional,non functional,functional,non functional,non functional
33005,functional needs repair,functional needs repair,non functional,functional needs repair,functional needs repair
25304,functional,functional,non functional,functional,functional
19308,non functional,non functional,functional,non functional,non functional


In [85]:
pd.DataFrame(classification_report(y_test_2, predictions_df['mode'], output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.924898,0.399468,0.893788,0.824744,0.739385,0.874887
recall,0.793276,0.93408,0.848128,0.824744,0.858494,0.824744
f1-score,0.854045,0.559613,0.870359,0.824744,0.761339,0.839128
support,6008.0,804.0,4326.0,0.824744,11138.0,11138.0


In [86]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, predictions_df['mode']), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,4766,822,420
functional needs repair,37,751,16
non functional,350,307,3669


# Stacking Combo 3 (Winner!)

In [90]:
stck = StackingClassifier(estimators = [('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False)), 
                                       ('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50))], 
                          final_estimator = XGBClassifier())

In [91]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [92]:
stck_preds = stck.predict(X_test_2)

In [93]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.903637,0.530864,0.918792,0.868468,0.784431,0.882614
recall,0.884987,0.802239,0.857836,0.868468,0.848354,0.868468
f1-score,0.894215,0.63893,0.887268,0.868468,0.806804,0.873089
support,6008.0,804.0,4326.0,0.868468,11138.0,11138.0


In [94]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,5317,410,281
functional needs repair,112,645,47
non functional,455,160,3711
