In [409]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output
from collections import Counter

In [331]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group'].map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})

In [332]:
X = X.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)
testing = testing.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)

In [333]:
# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
X.permit.fillna('Unknown', inplace=True)
X.public_meeting.fillna('Unknown', inplace=True)

In [334]:
# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['region_code'] = X['region_code'].astype('str')
X['district_code'] = X['district_code'].astype('str')

X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

In [335]:
X_cat = X_cat.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

testing = testing.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

In [336]:
X_cat = pd.get_dummies(X_cat)

In [337]:
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric = pd.DataFrame(scaler.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

In [338]:
df = pd.concat([X_numeric, X_cat, y], axis=1)

In [339]:
X = pd.concat([X_numeric, X_cat], axis=1)

In [347]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [349]:
df_train, df_test = train_test_split(df)

In [353]:
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [355]:
df_train

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,construction_year,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,...,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,status_group
0,-0.10597,-0.964200,-5.188895,1.936878,-0.038749,-0.381587,-1.366788,False,False,False,...,False,False,False,True,False,False,False,False,False,0
1,-0.10597,1.106177,0.460238,0.862463,-0.038749,-0.379466,0.744365,False,False,False,...,True,False,False,True,False,False,False,False,False,0
2,-0.10597,-0.964200,-0.482288,1.433735,-0.038749,-0.381587,-1.366788,False,False,False,...,False,False,False,True,False,False,False,False,False,0
3,-0.10597,-0.964200,-0.039499,-1.284573,-0.038749,-0.381587,-1.366788,False,True,False,...,True,False,False,True,False,False,False,False,False,2
4,-0.10597,-0.964200,-0.366471,1.315739,-0.038749,-0.381587,-1.366788,False,False,False,...,True,False,False,True,False,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44545,-0.10597,0.852249,0.038591,1.268555,-0.038749,0.891006,0.745416,False,False,False,...,False,False,False,True,False,False,False,False,False,0
44546,-0.10597,-0.964200,-0.030997,0.481468,-0.038749,-0.381587,-1.366788,True,False,False,...,False,False,False,False,True,False,False,False,False,2
44547,-0.10597,-0.555896,0.400720,-0.808228,-0.038749,-0.233118,0.738060,False,False,False,...,False,False,False,False,False,False,True,False,False,2
44548,-0.10597,-0.964200,-0.172606,0.524636,-0.038749,-0.381587,-1.366788,True,False,False,...,False,False,False,False,False,False,True,False,False,0


In [357]:
df.status_group.unique()

array([0, 2, 1])

In [365]:
arrs = [df_train[df_train.status_group == i].reset_index(drop=True) for i in df.status_group.unique()]

In [374]:
idx = list(arrs[2].index)
arrs_fnr = np.array_split(idx, 6)

In [379]:
arrs[1]

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,construction_year,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,...,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,status_group
0,-0.105970,-0.964200,-0.039499,-1.284573,-0.038749,-0.381587,-1.366788,False,True,False,...,True,False,False,True,False,False,False,False,False,2
1,-0.105970,-0.964200,-0.366471,1.315739,-0.038749,-0.381587,-1.366788,False,False,False,...,True,False,False,True,False,False,False,False,False,2
2,-0.105970,-0.964200,-0.132192,0.627503,-0.038749,-0.381587,-1.366788,True,False,False,...,False,False,False,False,True,False,False,False,False,2
3,-0.105970,0.136634,-0.483281,-0.787769,-0.038749,-0.379466,0.718094,False,False,False,...,False,False,False,False,False,False,True,False,False,2
4,-0.105970,0.821951,-0.161925,1.232214,-0.038749,0.678907,0.746467,False,False,False,...,False,False,False,False,False,False,False,False,True,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17066,-0.105970,0.720957,-0.029773,1.463340,-0.038749,0.678907,0.721247,False,False,False,...,False,False,False,False,True,False,False,False,False,2
17067,-0.055929,0.267927,-0.655417,0.253862,-0.038749,0.445599,0.718094,False,False,False,...,False,False,False,False,True,False,False,False,False,2
17068,1.228453,-0.661218,0.722973,0.180173,-0.038749,-0.127068,0.704433,False,False,False,...,False,False,False,True,False,False,False,False,False,2
17069,-0.105970,-0.964200,-0.030997,0.481468,-0.038749,-0.381587,-1.366788,True,False,False,...,False,False,False,False,True,False,False,False,False,2


In [381]:
idx = list(arrs[0].index)
arrs_f = np.array_split(idx, 45)

In [382]:
idx = list(arrs[1].index)
arrs_nf = np.array_split(idx, 32)

In [415]:
models = []
preds = []
counters = []
modes = []

for i in range(len(arrs_fnr)):
    estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
    fnr = arrs[2].take(arrs_fnr[i])
    nf = arrs[1].take(arrs_nf[i])
    f = arrs[0].take(arrs_f[i])
    dataframe = pd.concat([fnr, nf, f])
    estimator.fit(dataframe.drop(['status_group'], axis=1), dataframe['status_group'])
    predictions = estimator.predict(df_test.drop(['status_group'], axis=1))
    models.append(estimator)
    preds.append(predictions)
    
for i in range(len(preds[0])):
    mode = md([pred[i] for pred in preds])
    counter = Counter([pred[i] for pred in preds])
    counters.append(counter)
    modes.append(mode)

In [416]:
pred_df = pd.DataFrame({f"model {n}": preds[n] for n in range(len(preds))})
pred_df['mode'] = modes
pred_df['original'] = df_test['status_group']

In [420]:
matrix_labels = ['functional', 'functional needs repair', 
                 'non functional']

matrix = pd.DataFrame(confusion_matrix(pred_df['mode'], df_test['status_group']), columns=matrix_labels, index=matrix_labels)
report = pd.DataFrame(classification_report(pred_df['mode'], df_test['status_group'], output_dict=True))

In [421]:
matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,5122,280,1215
functional needs repair,1886,677,903
non functional,997,135,3635


In [422]:
report

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.63985,0.619963,0.631844,0.635286,0.630553,0.632639
recall,0.774067,0.195326,0.762534,0.635286,0.577309,0.635286
f1-score,0.700588,0.29706,0.691065,0.635286,0.562904,0.603347
support,6617.0,3466.0,4767.0,0.635286,14850.0,14850.0
