In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output, display_html 
from collections import Counter

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [3]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group'].map({'functional': 2, 'functional needs repair': 0, 'non functional': 1})

In [7]:
for col in X.columns:
    print(col, X[col].nunique())

id 59400
amount_tsh 98
date_recorded 356
funder 1896
gps_height 2428
installer 2145
longitude 57516
latitude 57517
wpt_name 37399
num_private 65
basin 9
subvillage 19287
region 21
region_code 27
district_code 20
lga 125
ward 2092
population 1049
public_meeting 2
recorded_by 1
scheme_management 11
scheme_name 2695
permit 2
construction_year 55
extraction_type 18
extraction_type_group 13
extraction_type_class 7
management 12
management_group 5
payment 7
payment_type 7
water_quality 8
quality_group 6
quantity 5
quantity_group 5
source 10
source_type 7
source_class 3
waterpoint_type 7
waterpoint_type_group 6


In [None]:
#Dropping unecessary columns
X = X.drop(['id', 'wpt_name', 'date_recorded', 'funder'], axis=1)

testing = testing.drop(['id', 'wpt_name', 'date_recorded', 'funder'], axis=1)

# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
# X.permit.fillna('Unknown', inplace=True)
# X.public_meeting.fillna('Unknown', inplace=True)

# Turning certain dtypes into others
# X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['district_code'] = X['district_code'].astype('str')

#Defining X_cat
X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_cat = pd.get_dummies(X_cat)

#Defining X_numeric
X_numeric = X.select_dtypes(['float64', 'int64'])
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric = pd.DataFrame(scaler.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

#Defining X
X = pd.concat([X_numeric, X_cat], axis=1)

#Defining df
df = pd.concat([X, y], axis=1)

#Creating a train-test-split for X and y
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Resampling dataframes for model creation
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train_resampled = X_train_resampled.reset_index(drop=True)
y_train_resampled = y_train_resampled.reset_index(drop=True)
    
df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)

In [45]:
estimators = { 
              'Random Forest': RandomForestClassifier(), 
            }

# Approach #1 – Single Model with Original Data

Here we use a single estimator on a training dataset and make predictions on a test dataset.

In [46]:
ap1_dict = {}

for name, estimator in estimators.items():
    
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap1_dict[name] = [predictions, matrix, report]
    
    print(f"Estimator {name} completed.")

Estimator Random Forest completed.


# Approach #2 – Single Model with Resampled Data

Here we use a single estimator on a resampled training dataset and make predictions on a test dataset.

In [47]:
smote = SMOTE()

In [48]:
ap2_dict = {}

for name, estimator in estimators.items():
    
    estimator.fit(X_train_resampled, y_train_resampled)
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap2_dict[name] = [predictions, matrix, report]
    
    print(f"Estimator {name} completed.")

Estimator Random Forest completed.


# Approach #3 – Single Model with Resampled Data (Different Ratio)

In [49]:
strategy = {0: 10000} #optional strategy
smote = SMOTE(sampling_strategy=strategy)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [50]:
ap3_dict = {}

for name, estimator in estimators.items():
    
    estimator.fit(X_train_resampled, y_train_resampled)
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap3_dict[name] = [predictions, matrix, report]
    
    print(f"Estimator {name} completed.")

Estimator Random Forest completed.


# Approach #4 – Single Model with Resampled Data (Different Ratio)

In [51]:
strategy = {0: 15000} #optional strategy
smote = SMOTE(sampling_strategy=strategy)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [52]:
ap4_dict = {}

for name, estimator in estimators.items():
    
    estimator.fit(X_train_resampled, y_train_resampled)
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap4_dict[name] = [predictions, matrix, report]
    
    print(f"Estimator {name} completed.")

Estimator Random Forest completed.


# Approach #5 – Single Model with Resampled Data (Different Ratio)

In [53]:
strategy = {0: 20000} #optional strategy
smote = SMOTE(sampling_strategy=strategy)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [54]:
ap5_dict = {}

for name, estimator in estimators.items():
    
    estimator.fit(X_train_resampled, y_train_resampled)
    predictions = estimator.predict(X_test)
    matrix = pd.DataFrame(confusion_matrix(y_test, predictions))
    report = pd.DataFrame(classification_report(y_test, predictions, output_dict=True))
    ap5_dict[name] = [predictions, matrix, report]
    
    print(f"Estimator {name} completed.")

Estimator Random Forest completed.


# Looking at All of our Approaches

In [55]:
master_dict = {}
for name in estimators.keys():
    ap1 = ap1_dict[name][2]
    ap2 = ap2_dict[name][2]
    ap3 = ap3_dict[name][2]
    ap4 = ap4_dict[name][2]
    ap5 = ap5_dict[name][2]
#     ap6 = ap6_dict[name][3]
    master_dict[name]=[ap1,ap2,ap3,ap4,ap5]
    
print("\n")
for name, results in master_dict.items():
    print(color.BOLD + name + color.END)
    display(results[0].style.set_caption("original data"))
    display(results[1].style.set_caption("resampled data"))
    display(results[2].style.set_caption("resampled data (n = 10000)"))
    display(results[3].style.set_caption("resampled data (n = 15000)"))
    display(results[4].style.set_caption("resampled data (n = 20000)"))
#     display(results[5].style.set_caption("resampled samples"))
    print("\n\n")



[1mRandom Forest[0m


Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.48814,0.814897,0.808715,0.79367,0.703917,0.787368
recall,0.356102,0.776895,0.864788,0.79367,0.665928,0.79367
f1-score,0.411796,0.795442,0.835812,0.79367,0.681017,0.789071
support,1098.0,5661.0,8091.0,0.79367,14850.0,14850.0


Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.389675,0.808589,0.823088,0.776498,0.673784,0.785514
recall,0.501821,0.778308,0.812508,0.776498,0.697546,0.776498
f1-score,0.438694,0.793159,0.817763,0.776498,0.683206,0.780356
support,1098.0,5661.0,8091.0,0.776498,14850.0,14850.0


Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.413502,0.8211,0.814344,0.784781,0.682982,0.787281
recall,0.446266,0.772655,0.839204,0.784781,0.686042,0.784781
f1-score,0.42926,0.796141,0.826587,0.784781,0.683996,0.785603
support,1098.0,5661.0,8091.0,0.784781,14850.0,14850.0


Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.401991,0.82182,0.816396,0.781886,0.680069,0.787823
recall,0.478142,0.769122,0.832036,0.781886,0.6931,0.781886
f1-score,0.436772,0.794598,0.824142,0.781886,0.685171,0.784237
support,1098.0,5661.0,8091.0,0.781886,14850.0,14850.0


Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.387755,0.824558,0.815991,0.779461,0.676101,0.787593
recall,0.484517,0.766296,0.828699,0.779461,0.693171,0.779461
f1-score,0.430769,0.79436,0.822296,0.779461,0.682475,0.782697
support,1098.0,5661.0,8091.0,0.779461,14850.0,14850.0





