In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from IPython.display import clear_output
from sklearn.ensemble import StackingClassifier

In [3]:
strategies = ['auto', 2, 3, 4]
resampling = [False, True]

strategy = strategies[0]
resampling = resampling[0]

In [4]:
def cval(X, y, cval, estimator, xgboost = False, resample = False):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, cval)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
            
        if resample:
            if xgboost:
                strategy = {1: int((len(train_x))/2)}
            else:
                strategy = {'functional needs repair': int((len(train_x))/2)}
            
            smote = SMOTE(sampling_strategy=strategy)
            
            train_x_resampled, train_y_resampled = smote.fit_resample(train_x, train_y)
            model = estimator
            model.fit(train_x_resampled, train_y_resampled)
            
        else:
            if xgboost:
                test_y = test_y.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})
                train_y = train_y.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})
            model = estimator
            model.fit(train_x, train_y)
            
        preds = model.predict(test_x)
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
        
        clear_output(wait=True)
        print(f"Fold #{i+1} out of {cval} done.")
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    print("Analyis complete.")
    
    return reports, matrices, numpy_report, numpy_matrix

In [5]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [6]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [7]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

# Defining the train and test sets

In [8]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

# Base Model – Logistic Regression, No Regularization

In [9]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix = cval(X_train, y_train, 5, estimator)

Fold #5 out of 5 done.
Analyis complete.


In [10]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.766942,0.232447,0.808717,0.700561,0.602702,0.744061
recall,0.786003,0.470264,0.623073,0.700561,0.626447,0.700561
f1-score,0.776336,0.311054,0.703779,0.700561,0.597056,0.714663
support,4845.8,648.6,3415.6,0.700561,8910.0,8910.0


In [11]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3809.0,611.8,425.0
functional needs repair,265.0,304.8,78.8
non functional,892.4,395.0,2128.2


# Second Model – Decision Tree

In [12]:
dtc = DecisionTreeClassifier()

In [13]:
dtc_reports, dtc_matrices, dtc_numpy_report, dtc_numpy_matrix = cval(X_train, y_train, 5, dtc)

Fold #5 out of 5 done.
Analyis complete.


In [14]:
dtc_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3954.0,238.2,653.6
functional needs repair,320.0,217.0,111.6
non functional,782.2,114.2,2519.2


In [15]:
dtc_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.781993,0.380701,0.767039,0.750864,0.643245,0.747124
recall,0.815941,0.334469,0.737624,0.750864,0.629345,0.750864
f1-score,0.798585,0.355978,0.752008,0.750864,0.635523,0.748544
support,4845.8,648.6,3415.6,0.750864,8910.0,8910.0


# Third Model - K Nearest Neighbors

In [16]:
knn = KNeighborsClassifier(n_neighbors=3)

In [17]:
knn_reports, knn_matrices, knn_numpy_report, knn_numpy_matrix = cval(X_train, y_train, 5, knn)

Fold #5 out of 5 done.
Analyis complete.


In [18]:
knn_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4075.6,140.0,630.2
functional needs repair,350.6,191.2,106.8
non functional,878.8,75.2,2461.6


In [19]:
knn_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.768267,0.4713,0.769749,0.755152,0.669772,0.747301
recall,0.841108,0.295283,0.720681,0.755152,0.619024,0.755152
f1-score,0.802976,0.362013,0.744329,0.755152,0.63644,0.7484
support,4845.8,648.6,3415.6,0.755152,8910.0,8910.0


# Fourth Model – Bagging Classifier

In [20]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [21]:
bagged_tree_reports, bagged_tree_matrices, bagged_tree_numpy_report, bagged_tree_numpy_matrix = cval(X_train, y_train, 5, bagged_tree)

Fold #5 out of 5 done.
Analyis complete.


In [22]:
bagged_tree_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4539.4,11.0,295.4
functional needs repair,523.4,40.4,84.8
non functional,1140.2,6.2,2269.2


In [23]:
bagged_tree_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.731811,0.707088,0.856543,0.768687,0.765148,0.777872
recall,0.936778,0.06235,0.664438,0.768687,0.554522,0.768687
f1-score,0.821681,0.114252,0.748317,0.768687,0.561417,0.742056
support,4845.8,648.6,3415.6,0.768687,8910.0,8910.0


# Fifth Model – Random Forest

In [24]:
forest = RandomForestClassifier()

In [25]:
forest_reports, forest_matrices, forest_numpy_report, forest_numpy_matrix = cval(X_train, y_train, 5, forest)

Fold #5 out of 5 done.
Analyis complete.


In [26]:
forest_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4220.6,142.2,483.0
functional needs repair,330.0,208.6,110.0
non functional,772.8,64.8,2578.0


In [27]:
forest_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.792849,0.502156,0.813082,0.786442,0.702695,0.77953
recall,0.870999,0.321535,0.754789,0.786442,0.649108,0.786442
f1-score,0.830073,0.391471,0.782775,0.786442,0.668107,0.780043
support,4845.8,648.6,3415.6,0.786442,8910.0,8910.0


# Sixth Model – XGBoost

In [28]:
xgb = XGBClassifier()

In [29]:
xgboost_reports, xgboost_matrices, xgboost_numpy_report, xgboost_numpy_matrix = cval(X_train, y_train, 5, xgb, xgboost=True)

Fold #5 out of 5 done.
Analyis complete.


In [30]:
xgboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2426.4,28.8,960.4
functional needs repair,101.6,126.4,420.6
non functional,375.2,51.4,4419.2


In [31]:
xgboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.835802,0.612297,0.761894,0.782492,0.736665,0.779353
recall,0.710395,0.195037,0.911952,0.782492,0.605795,0.782492
f1-score,0.767952,0.295149,0.830184,0.782492,0.631095,0.767381
support,3415.6,648.6,4845.8,0.782492,8910.0,8910.0


# Eigth Model – Adaboost Classifier

In [32]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)

In [33]:
adaboost_reports, adaboost_matrices, adaboost_numpy_report, adaboost_numpy_matrix = cval(X_train, y_train, 5, adaboost_clf)

Fold #5 out of 5 done.
Analyis complete.


In [34]:
adaboost_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,3977.0,227.6,641.2
functional needs repair,313.6,218.8,116.2
non functional,803.6,101.8,2510.2


In [35]:
adaboost_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.780688,0.398964,0.76821,0.752637,0.649287,0.748155
recall,0.820715,0.337628,0.734952,0.752637,0.631098,0.752637
f1-score,0.800194,0.365531,0.751194,0.752637,0.638973,0.749768
support,4845.8,648.6,3415.6,0.752637,8910.0,8910.0


# Ninth Model – Gradient Boosting Classifier

In [36]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [37]:
gbt_reports, gbt_matrices, gbt_numpy_report, gbt_numpy_matrix = cval(X_train, y_train, 5, gbt_clf)

Fold #5 out of 5 done.
Analyis complete.


In [38]:
gbt_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4467.0,31.6,347.2
functional needs repair,471.0,81.8,95.8
non functional,1187.4,19.6,2208.6


In [39]:
gbt_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.72919,0.613745,0.8331,0.758406,0.725345,0.760738
recall,0.921807,0.125956,0.646596,0.758406,0.564786,0.758406
f1-score,0.814233,0.208705,0.728053,0.758406,0.583664,0.737194
support,4845.8,648.6,3415.6,0.758406,8910.0,8910.0


# Eleventh Model – Extra Randomized Trees

In [40]:
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [41]:
extra_trees_reports, extra_trees_matrices, extra_trees_numpy_report, extra_trees_numpy_matrix = cval(X_train, y_train, 5, extra_trees)

Fold #5 out of 5 done.
Analyis complete.


In [42]:
extra_trees_numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,4141.8,174.2,529.8
functional needs repair,325.8,215.8,107.0
non functional,778.8,78.0,2558.8


In [43]:
extra_trees_numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.78945,0.461643,0.80076,0.776251,0.683951,0.769967
recall,0.854723,0.332793,0.749137,0.776251,0.645551,0.776251
f1-score,0.820785,0.386571,0.774065,0.776251,0.660474,0.771284
support,4845.8,648.6,3415.6,0.776251,8910.0,8910.0


In [44]:
# XGboost for non functional
# BaggingClassifier for functional
# LogisticRegression for functional needs repair

# Voting Classifier

In [45]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train)

In [46]:
strategy = {'functional needs repair': int(len(X_train_2)/2)}
smote = SMOTE(sampling_strategy=strategy)
X_train_2_resampled, y_train_2_resampled = smote.fit_resample(X_train, y_train)

In [47]:
vc_1 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50).fit(X_train_2_resampled, y_train_2_resampled)
vc_2 = XGBClassifier().fit(X_train_2_resampled, y_train_2_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2}))
vc_3 = LogisticRegression(solver='liblinear', fit_intercept=False).fit(X_train_2_resampled, y_train_2_resampled)

vc_preds_1 = vc_1.predict(X_test_2)
vc_preds_2 = vc_2.predict(X_test_2)
vc_preds_3 = vc_3.predict(X_test_2)

predictions_df = pd.DataFrame({'BaggingClassifier': vc_preds_1, 
                               'LogisticRegression': vc_preds_3, 
                               'XGBoost': vc_preds_2, 
                               'True Values': y_test_2})

In [48]:
predictions_df['XGBoost'].replace({0: 'functional', 1: 'functional needs repair', 2: 'non functional'}, inplace=True)

In [49]:
modes = []

for i in range(len(predictions_df)):
    arr = [predictions_df.BaggingClassifier.iloc[i], 
           predictions_df.LogisticRegression.iloc[i], 
           predictions_df.XGBoost.iloc[i]]
    mode = md(arr)
    modes.append(mode)

In [50]:
predictions_df['mode'] = modes

In [51]:
predictions_df

Unnamed: 0,BaggingClassifier,LogisticRegression,XGBoost,True Values,mode
19343,functional,functional,non functional,functional,functional
30823,non functional,non functional,functional,non functional,non functional
8213,functional,functional,non functional,functional,functional
31516,functional,functional,non functional,functional,functional
35054,functional,functional,non functional,functional,functional
...,...,...,...,...,...
25496,non functional,functional,functional,non functional,functional
1399,functional,functional,non functional,functional,functional
5022,functional needs repair,functional needs repair,functional needs repair,functional needs repair,functional needs repair
12140,non functional,non functional,functional,non functional,non functional


In [52]:
pd.DataFrame(classification_report(y_test_2, predictions_df['mode'], output_dict=True))

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.841211,0.43422,0.902494,0.81002,0.725975,0.835751
recall,0.859292,0.758794,0.750116,0.81002,0.789401,0.81002
f1-score,0.850155,0.552355,0.81928,0.81002,0.740597,0.816969
support,6048.0,796.0,4294.0,0.81002,11138.0,11138.0


In [53]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, predictions_df['mode']), columns=labels, index=labels)

Unnamed: 0,functional,functional needs repair,non functional
functional,5197,533,318
functional needs repair,162,604,30
non functional,819,254,3221


# Stacking Combo 1

In [54]:
stck = StackingClassifier(estimators = [('bagging_classifier', 
                                         BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)), 
                                        ('xgboost', XGBClassifier())], 
                          final_estimator = LogisticRegression(solver='liblinear', fit_intercept=False))

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

# Stacking Combo 2

In [None]:
stck = StackingClassifier(estimators = [('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)), 
                                        ('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False))], 
                          final_estimator = XGBClassifier())

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

# Stacking Combo 3 (Winner!)

In [None]:
stck = StackingClassifier(estimators = [('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False)), 
                                       ('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50))], 
                          final_estimator = XGBClassifier())

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

# Stacking Combo 4

In [None]:
stck = StackingClassifier(estimators = [('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False)), 
                                       ('xgboost', XGBClassifier())], 
                          final_estimator = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50))

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

# Stacking Combo 5

In [None]:
stck = StackingClassifier(estimators = [('xgboost', XGBClassifier()),
                                       ('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False))], 
                          final_estimator = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50))

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)

# Stacking Combo 6

In [None]:
stck = StackingClassifier(estimators = [('xgboost', XGBClassifier()),
                                       ('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50))], 
                          final_estimator = LogisticRegression(solver='liblinear', fit_intercept=False))

In [None]:
stck.fit(X_train_2_resampled, y_train_2_resampled)

In [None]:
stck_preds = stck.predict(X_test_2)

In [None]:
pd.DataFrame(classification_report(y_test_2, stck_preds, output_dict=True))

In [None]:
labels = ['functional', 'functional needs repair', 'non functional']
pd.DataFrame(confusion_matrix(y_test_2, stck_preds), columns=labels, index=labels)