In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output

In [2]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group']

# Finding columns between X_train and X_test that do not differ drastically

In [3]:
differences = []

columns = list(X.select_dtypes(exclude=['float64', 'int64']).columns)

for col in columns:
    difference = set(list(X[col])) ^ set(list(testing[col]))
    differences.append(len(difference))
    
pd.DataFrame({'column': list(columns), 'differences': differences}).sort_values(by=['differences'], ascending=False)

Unnamed: 0,column,differences
3,wpt_name,43128
5,subvillage,15120
2,installer,1584
1,funder,1403
12,scheme_name,1251
8,ward,145
0,date_recorded,51
14,extraction_type,1
23,quantity,0
21,water_quality,0


In [4]:
X = X.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)
testing = testing.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)

Eliminating recorded_by because it only has one value

Eliminating id because it's not important

Eliminating all the rest beacsue they are categorical variables that differ heavily between test and train sets

# Defining the train and test sets

In [6]:
# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
X.permit.fillna('Unknown', inplace=True)
X.public_meeting.fillna('Unknown', inplace=True)

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['region_code'] = X['region_code'].astype('str')
X['district_code'] = X['district_code'].astype('str')

X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

oe = OrdinalEncoder()
oe.fit(X_cat)
X_cat = pd.DataFrame(oe.transform(X_cat), index = X_cat.index, columns = X_cat.columns)

mms = MinMaxScaler()
mms.fit(X_numeric)
X_numeric = pd.DataFrame(mms.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

y = y.map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})

# Feature Selection with Chi2

In [11]:
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X_cat, y)
# X_train_fs = fs.transform(X)

ch2_scores = pd.DataFrame({'feature': fs.feature_names_in_, 'score': fs.scores_, 'pvalue': fs.pvalues_})
ch2_scores['significant'] = ch2_scores.pvalue.map(lambda x: 'Yes' if x < 0.05 else 'No')
ch2_scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score,pvalue,significant
0,4,lga,9184.155815,0.0,Yes
1,10,extraction_type_class,4962.445269,0.0,Yes
2,9,extraction_type_group,3427.761791,0.0,Yes
3,22,waterpoint_type,3348.517448,0.0,Yes
4,8,extraction_type,2638.196579,0.0,Yes
5,23,waterpoint_type_group,2540.881101,0.0,Yes
6,1,region,1805.634614,0.0,Yes
7,2,region_code,1788.823521,0.0,Yes
8,13,payment,866.203572,8.059059e-189,Yes
9,3,district_code,705.287786,7.05835e-154,Yes


In [12]:
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X_numeric, y)
# X_train_fs = fs.transform(X)

scores = pd.DataFrame({'feature': fs.feature_names_in_, 'score': fs.scores_, 'pvalue': fs.pvalues_})
scores['significant'] = scores.pvalue.map(lambda x: 'Yes' if x < 0.05 else 'No')
scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score,pvalue,significant
0,1,gps_height,172.710476,3.1361489999999997e-38,Yes
1,6,construction_year,66.593012,3.463468e-15,Yes
2,2,longitude,27.408583,1.11764e-06,Yes
3,3,latitude,22.005588,1.665511e-05,Yes
4,0,amount_tsh,13.868288,0.0009739564,Yes
5,5,population,0.76877,0.6808692,No
6,4,num_private,0.401568,0.818089,No


# Feature Selection with Mutual Information

In [13]:
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_cat, y)

mi_scores = pd.DataFrame({'feature': fs.feature_names_in_, 'score': fs.scores_})
mi_scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score
0,18,quantity_group,0.108644
1,17,quantity,0.106354
2,4,lga,0.090986
3,22,waterpoint_type,0.064586
4,8,extraction_type,0.059604
5,9,extraction_type_group,0.058928
6,10,extraction_type_class,0.055349
7,23,waterpoint_type_group,0.053503
8,2,region_code,0.044873
9,1,region,0.039244


In [14]:
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_numeric, y)

scores = pd.DataFrame({'feature': fs.feature_names_in_, 'score': fs.scores_})
scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score
0,2,longitude,0.066897
1,3,latitude,0.06056
2,0,amount_tsh,0.036136
3,6,construction_year,0.035287
4,1,gps_height,0.019145
5,5,population,0.019062
6,4,num_private,0.007861


# Feature Selection with Decision Trees

In [15]:
forest = RandomForestClassifier(random_state=42, n_jobs=6, class_weight='balanced')
forest.fit(X_cat, y)

forest_scores = pd.DataFrame({'feature': forest.feature_names_in_, 'score': forest.feature_importances_})
forest_scores['cumsum'] = forest_scores['score'].cumsum()
forest_scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score,cumsum
0,17,quantity,0.092339,0.771569
1,18,quantity_group,0.088413,0.859982
2,4,lga,0.083201,0.30499
3,3,district_code,0.061294,0.221789
4,2,region_code,0.057613,0.160495
5,1,region,0.052727,0.102881
6,0,basin,0.050154,0.050154
7,14,payment_type,0.047735,0.641355
8,13,payment,0.045646,0.593621
9,22,waterpoint_type,0.045237,0.97321


In [16]:
forest = RandomForestClassifier(random_state=42, n_jobs=6, class_weight='balanced')
forest.fit(X_numeric, y)

scores = pd.DataFrame({'feature': forest.feature_names_in_, 'score': forest.feature_importances_})
scores['cumsum'] = scores['score'].cumsum()
scores.sort_values(by=['score'], ascending=False).reset_index()

Unnamed: 0,index,feature,score,cumsum
0,2,longitude,0.331271,0.50878
1,3,latitude,0.319055,0.827835
2,1,gps_height,0.132658,0.177509
3,5,population,0.085367,0.916226
4,6,construction_year,0.083774,1.0
5,0,amount_tsh,0.044851,0.044851
6,4,num_private,0.003024,0.830859


# Ranking variables based on importance through multiple feature selection methods

In [20]:
cols = list(X_cat.columns)

ch2_rankings = []
mi_rankings = []
forest_rankings = []

df = ch2_scores.sort_values(by=['score'], ascending=False).reset_index()
for col in cols:
    ch2_rankings.append(df[df.feature == col].index[0])
    
df = mi_scores.sort_values(by=['score'], ascending=False).reset_index()
for col in cols:
    mi_rankings.append(df[df.feature == col].index[0])
    
df = forest_scores.sort_values(by=['score'], ascending=False).reset_index()
for col in cols:
    forest_rankings.append(df[df.feature == col].index[0])

rankings = pd.DataFrame({'feature': cols, 'chi_squared': ch2_rankings, 'mutual_information': mi_rankings, 'random_forest': forest_rankings})
rankings['average'] = rankings.apply(lambda row: (row.chi_squared + row.mutual_information + row.random_forest)/3, axis=1)

rankings.sort_values(by=['average'])

Unnamed: 0,feature,chi_squared,mutual_information,random_forest,average
4,lga,0,2,2,1.333333
18,quantity_group,10,0,1,3.666667
17,quantity,11,1,0,4.0
22,waterpoint_type,3,3,9,5.0
2,region_code,7,8,4,6.333333
10,extraction_type_class,1,6,12,6.333333
1,region,6,9,5,6.666667
9,extraction_type_group,2,5,13,6.666667
8,extraction_type,4,4,14,7.333333
13,payment,8,10,8,8.666667


# Correlation

In [21]:
X_numeric.corr()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,construction_year
amount_tsh,1.0,0.07665,0.022134,-0.05267,0.002944,0.016288,0.067915
gps_height,0.07665,1.0,0.149155,-0.035751,0.007237,0.135003,0.658727
longitude,0.022134,0.149155,1.0,-0.425802,0.023873,0.08659,0.396732
latitude,-0.05267,-0.035751,-0.425802,1.0,0.006837,-0.022152,-0.245278
num_private,0.002944,0.007237,0.023873,0.006837,1.0,0.003818,0.026056
population,0.016288,0.135003,0.08659,-0.022152,0.003818,1.0,0.26091
construction_year,0.067915,0.658727,0.396732,-0.245278,0.026056,0.26091,1.0


In [22]:
X_cat = X_cat.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

testing = testing.drop(['quantity', 'waterpoint_type_group', 'extraction_type_group', 
                    'region', 'extraction_type', 'payment_type', 'source_type', 
                    'management_group', 'water_quality', 'source_class', 
                    'region_code', 'district_code'], axis=1)

In [23]:
X_cat.columns

Index(['basin', 'lga', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type_class', 'management', 'payment', 'quality_group',
       'quantity_group', 'source', 'waterpoint_type'],
      dtype='object')

# Re-encoding datasets

In [24]:
X_cat = X[list(X_cat.columns)]

In [25]:
X_cat = pd.get_dummies(X_cat, dtype='int64')

In [26]:
X = pd.concat([X_numeric, X_cat], axis=1)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

# Test

In [41]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group']

In [42]:
df = pd.concat([X, y], axis=1)
df = df[df.status_group == 'functional needs repair']

In [43]:
X = df.drop(['status_group'], axis=1)
y = df['status_group']

In [44]:
# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
X.permit.fillna('Unknown', inplace=True)
X.public_meeting.fillna('Unknown', inplace=True)

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['region_code'] = X['region_code'].astype('str')
X['district_code'] = X['district_code'].astype('str')

X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_numeric = X.select_dtypes(['float64', 'int64'])

oe = OrdinalEncoder()
oe.fit(X_cat)
X_cat = pd.DataFrame(oe.transform(X_cat), index = X_cat.index, columns = X_cat.columns)

mms = MinMaxScaler()
mms.fit(X_numeric)
X_numeric = pd.DataFrame(mms.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

y = y.map({'functional': 0, 'functional needs repair': 1, 'non functional': 2})