In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md

In [41]:
def cval(X, y, X_resampled, y_resampled, cval, estimator):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, cval)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
        
        strategy = {'functional needs repair': int((len(train_x))/2)}
        smote = SMOTE(sampling_strategy=strategy)
        train_x_resampled, train_y_resampled = smote.fit_resample(train_x, train_y)
        
        model = estimator
        model.fit(train_x_resampled, train_y_resampled)
        preds = model.predict(test_x)
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    return reports, matrices, numpy_report, numpy_matrix

In [3]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [4]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [5]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

# Defining the train and test sets

In [6]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [7]:
X_train_resampled = []
y_train_resampled = []

# Base Model – Logistic Regression, No Regularization

In [8]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, estimator)

In [9]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.801955,0.164558,0.839798,0.580516,0.602104,0.770494
recall,0.568956,0.824279,0.551151,0.580516,0.648129,0.580516
f1-score,0.665577,0.27428,0.665347,0.580516,0.535068,0.63718
support,4840.2,644.2,3425.6,0.580516,8910.0,8910.0


In [12]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2754.0,1755.4,330.8
functional needs repair,83.2,531.2,29.8
non functional,596.8,941.6,1887.2


# Second Model – Decision Tree

In [13]:
dtc = DecisionTreeClassifier()

In [14]:
dtc_reports, dtc_matrices, dtc_numpy_report, dtc_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, dtc)

In [15]:
dtc_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3722.0,494.0,624.2
functional needs repair,234.2,311.8,98.2
non functional,716.6,222.2,2486.8


In [16]:
dtc_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.796558,0.303122,0.774917,0.731829,0.624866,0.752602
recall,0.768986,0.483773,0.725983,0.731829,0.65958,0.731829
f1-score,0.782519,0.372653,0.749624,0.731829,0.634932,0.740261
support,4840.2,644.2,3425.6,0.731829,8910.0,8910.0


# Third Model - K Nearest Neighbors

In [17]:
knn = KNeighborsClassifier(n_neighbors=3)

In [18]:
knn_reports, knn_matrices, knn_numpy_report, knn_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, knn)

In [19]:
knn_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3797.4,431.0,611.8
functional needs repair,274.8,290.2,79.2
non functional,838.0,219.0,2368.6


In [20]:
knn_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.773382,0.308702,0.774294,0.724602,0.618793,0.740172
recall,0.784566,0.450574,0.691456,0.724602,0.642198,0.724602
f1-score,0.778895,0.366271,0.73047,0.724602,0.625212,0.730458
support,4840.2,644.2,3425.6,0.724602,8910.0,8910.0


# Fourth Model – Bagging Classifier

In [21]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [22]:
bagged_tree_reports, bagged_tree_matrices, bagged_tree_numpy_report, bagged_tree_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, bagged_tree)

In [23]:
bagged_tree_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4069.4,478.2,292.6
functional needs repair,261.8,326.0,56.4
non functional,951.4,201.0,2273.2


In [24]:
bagged_tree_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.770363,0.325231,0.866906,0.74844,0.654167,0.775399
recall,0.840807,0.506297,0.663596,0.74844,0.670233,0.74844
f1-score,0.803999,0.39547,0.751703,0.74844,0.650391,0.754392
support,4840.2,644.2,3425.6,0.74844,8910.0,8910.0


# Fifth Model – Random Forest

In [25]:
forest = RandomForestClassifier()

In [26]:
forest_reports, forest_matrices, forest_numpy_report, forest_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, forest)

In [27]:
forest_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3957.8,417.2,465.2
functional needs repair,243.8,320.0,80.4
non functional,694.2,178.8,2552.6


In [28]:
forest_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.808414,0.349499,0.823803,0.766599,0.660572,0.781256
recall,0.81769,0.496819,0.745146,0.766599,0.686551,0.766599
f1-score,0.812996,0.409931,0.782466,0.766599,0.668464,0.772175
support,4840.2,644.2,3425.6,0.766599,8910.0,8910.0


# Sixth Model – XGBoost

In [33]:
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

xgb = XGBClassifier()

In [34]:
xgboost_reports, xgboost_matrices, xgboost_numpy_report, xgboost_numpy_matrix = cval(X_train, xgboost_y_train, X_train_resampled, y_train_resampled, 5, xgb)

In [35]:
xgboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2366.0,249.6,810.0
functional needs repair,62.4,361.8,220.0
non functional,362.2,596.2,3881.8


In [36]:
xgboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.84784,0.299398,0.790266,0.741818,0.645835,0.777004
recall,0.690701,0.560977,0.801957,0.741818,0.684545,0.741818
f1-score,0.761222,0.390382,0.796065,0.741818,0.649223,0.753406
support,3425.6,644.2,4840.2,0.741818,8910.0,8910.0


# Eigth Model – Adaboost Classifier

In [42]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)

In [43]:
adaboost_reports, adaboost_matrices, adaboost_numpy_report, adaboost_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, adaboost_clf)

In [44]:
adaboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3771.6,469.6,599.0
functional needs repair,252.4,297.6,94.2
non functional,728.0,196.6,2501.0


In [45]:
adaboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.793688,0.308683,0.782956,0.737396,0.628442,0.75458
recall,0.779284,0.461958,0.730088,0.737396,0.65711,0.737396
f1-score,0.786389,0.370044,0.755555,0.737396,0.637329,0.744457
support,4840.2,644.2,3425.6,0.737396,8910.0,8910.0


# Ninth Model – Gradient Boosting Classifier

In [46]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [47]:
gbt_reports, gbt_matrices, gbt_numpy_report, gbt_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, gbt_clf)

In [48]:
gbt_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3752.6,762.2,325.4
functional needs repair,211.2,378.6,54.4
non functional,939.2,371.4,2115.0


In [49]:
gbt_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.765495,0.250535,0.847781,0.701033,0.62127,0.759956
recall,0.775329,0.587589,0.617454,0.701033,0.660124,0.701033
f1-score,0.770297,0.351051,0.71448,0.701033,0.611943,0.718545
support,4840.2,644.2,3425.6,0.701033,8910.0,8910.0


# Eleventh Model – Extra Randomized Trees

In [50]:
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [51]:
extra_trees_reports, extra_trees_matrices, extra_trees_numpy_report, extra_trees_numpy_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, extra_trees)

In [52]:
extra_trees_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3904.4,435.8,500.0
functional needs repair,246.0,314.4,83.8
non functional,687.8,192.8,2545.0


In [53]:
extra_trees_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.807139,0.333209,0.813377,0.759125,0.651242,0.77532
recall,0.806668,0.48789,0.742954,0.759125,0.679171,0.759125
f1-score,0.806836,0.395754,0.776523,0.759125,0.659704,0.765487
support,4840.2,644.2,3425.6,0.759125,8910.0,8910.0


In [54]:
# XGboost for non functional
# BaggingClassifier for functional
# LogisticRegression for functional needs repair

# Voting Classifier

In [55]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train)

In [56]:
strategy = {'functional needs repair': int(len(X_train_2)/2)}
smote = SMOTE(sampling_strategy=strategy)
X_train_2_resampled, y_train_2_resampled = smote.fit_resample(X_train, y_train)

In [57]:
vc_1 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50).fit(X_train_2_resampled, y_train_2_resampled)
vc_2 = XGBClassifier().fit(X_train_2_resampled, y_train_2_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2}))
vc_3 = LogisticRegression(solver='liblinear', fit_intercept=False).fit(X_train_2_resampled, y_train_2_resampled)

vc_preds_1 = vc_1.predict(X_test_2)
vc_preds_2 = vc_2.predict(X_test_2)
vc_preds_3 = vc_3.predict(X_test_2)

predictions_df = pd.DataFrame({'BaggingClassifier': vc_preds_1, 
                               'LogisticRegression': vc_preds_3, 
                               'XGBoost': vc_preds_2, 
                               'True Values': y_test_2})

In [58]:
predictions_df['XGBoost'].replace({0: 'functional', 1: 'functional needs repair', 2: 'non functional'}, inplace=True)

In [59]:
modes = []

for i in range(len(predictions_df)):
    arr = [predictions_df.BaggingClassifier.iloc[i], 
           predictions_df.LogisticRegression.iloc[i], 
           predictions_df.XGBoost.iloc[i]]
    mode = md(arr)
    modes.append(mode)

In [60]:
predictions_df['mode'] = modes

In [61]:
predictions_df

Unnamed: 0,BaggingClassifier,LogisticRegression,XGBoost,True Values,mode
23757,functional,functional needs repair,non functional,functional,functional
28643,non functional,non functional,functional,non functional,non functional
3203,non functional,non functional,functional,non functional,non functional
7529,functional,functional,non functional,functional,functional
42547,functional,functional needs repair,non functional,functional,functional
...,...,...,...,...,...
712,functional,functional,non functional,functional,functional
42897,functional,functional needs repair,functional,functional,functional
35051,non functional,non functional,functional,non functional,non functional
42715,functional,functional,non functional,non functional,functional


In [62]:
pd.DataFrame(classification_report(y_test_2, predictions_df['mode'], output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.838575,0.449548,0.902277,0.808224,0.730133,0.833074
recall,0.862519,0.76,0.740128,0.808224,0.787549,0.808224
f1-score,0.850378,0.564932,0.813198,0.808224,0.742836,0.814477
support,6059.0,850.0,4229.0,0.808224,11138.0,11138.0


In [63]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, predictions_df['mode']), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,5226,528,305
functional needs repair,170,646,34
non functional,836,263,3130
