In [542]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output
from collections import Counter

In [543]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group'].map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})

In [544]:
X = X.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)
testing = testing.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)

In [545]:
# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
X.permit.fillna('Unknown', inplace=True)
X.public_meeting.fillna('Unknown', inplace=True)

In [546]:
# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['region_code'] = X['region_code'].astype('str')
X['district_code'] = X['district_code'].astype('str')

X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

In [547]:
X_cat = X_cat.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

testing = testing.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

In [548]:
X_cat = pd.get_dummies(X_cat)

In [549]:
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric = pd.DataFrame(scaler.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

In [550]:
X = pd.concat([X_numeric, X_cat], axis=1)

In [551]:
df = pd.concat([X, y], axis=1)

In [552]:
strategy = {1: 20000}
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [553]:
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resampled, y_resampled)

In [554]:
X_train_resampled = X_train_resampled.reset_index(drop=True)
y_train_resampled = y_train_resampled.reset_index(drop=True)
y_test_resampled = y_test_resampled.reset_index(drop=True)

In [555]:
df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)

In [556]:
f = df_resampled[df_resampled.status_group == 0].reset_index(drop=True).copy()
nf = df_resampled[df_resampled.status_group == 2].reset_index(drop=True).copy()
fnr = df_resampled[df_resampled.status_group == 1].reset_index(drop=True).copy()

In [557]:
f = f.sample(frac=1)
nf = nf.sample(frac=1)
fnr = fnr.sample(frac=1)

In [558]:
fs = [pd.DataFrame(i) for i in np.array_split(f, len(f)//1000)]
nfs = [pd.DataFrame(i) for i in np.array_split(nf, len(nf)//1000)]
fnrs = [pd.DataFrame(i) for i in np.array_split(fnr, len(fnr)//1000)]

In [559]:
resamples = []

for i in range(len(fnrs)):
    resample = pd.concat([fs[i], nfs[i], fnrs[i]])
    resamples.append(resample)

In [560]:
models = []
preds = []
modes=[]
counters=[]

for resample in resamples:
    estimator =  ExtraTreesClassifier(n_estimators=100, random_state=42)
    X = resample.drop(['status_group'], axis=1)
    y = resample['status_group']
    estimator.fit(X, y)
    pred = estimator.predict(X_test_resampled)
    preds.append(pred)
    models.append(estimator)
    
    
for i in range(len(preds[0])):
    mode = md([pred[i] for pred in preds])
    counter = Counter([pred[i] for pred in preds])
    counters.append(counter)
    modes.append(mode)

In [561]:
pred_df = pd.DataFrame({n: preds[n] for n in range(len(preds))})
pred_df['mode'] = modes
pred_df['original'] = y_test_resampled

In [562]:
matrix_labels = ['functional', 'functional needs repair', 
                 'non functional']

matrix = pd.DataFrame(confusion_matrix(pred_df['mode'], y_test_resampled), columns=matrix_labels, index=matrix_labels)
report = pd.DataFrame(classification_report(pred_df['mode'], y_test_resampled, output_dict=True))

In [563]:
matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,5826,746,1429
functional needs repair,1353,6987,744
non functional,880,370,5860


In [564]:
report

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.722918,0.862273,0.729491,0.771771,0.771561,0.777171
recall,0.728159,0.769155,0.824191,0.771771,0.773835,0.771771
f1-score,0.725529,0.813056,0.773955,0.771771,0.770847,0.772622
support,8001.0,9084.0,7110.0,0.771771,24195.0,24195.0
