In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE
from category_encoders import OrdinalEncoder
from myfunctions import set_importer
pd.set_option('display.max_columns', None)

sns.set_style("whitegrid")
%matplotlib inline

# setting font size for all plots
font = {'size'   : 16}

plt.rc('font', **font)

In [2]:
# importing the cleaned X_train dataset and y_train
X_train = set_importer('../analysis-dfs/X_train_cleaned.csv')
y_train = set_importer('../analysis-dfs/y_train_cleaned.csv', y=True)


# importing the cleaned X_test dataset and y_test
X_test = set_importer('../analysis-dfs/X_test_cleaned.csv')
y_test = set_importer('../analysis-dfs/y_test_cleaned.csv', y=True)

In [3]:
display(X_train.shape)
X_train.head(10)

(17807, 16)

Unnamed: 0,gps_height,longitude,latitude,basin,region_code,district_code,population,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,340,39.483463,-10.60527,ruvuma / southern coast,9,4,300.0,True,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe
1,0,34.081729,-8.789536,rufiji,12,7,1.0,True,0,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe
2,0,34.294701,-8.701257,rufiji,12,7,1.0,True,0,gravity,user-group,never pay,good,seasonal,surface,communal standpipe
3,123,37.85137,-7.197111,wami / ruvu,5,2,250.0,True,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other
4,1209,33.591998,-2.129478,lake victoria,20,4,300.0,False,2008,other,user-group,unknown,unknown,unknown,groundwater,other
5,1170,32.908859,-1.936028,lake victoria,19,1,500.0,True,1996,other,user-group,never pay,good,insufficient,groundwater,other
6,0,31.47467,-1.471748,lake victoria,18,2,1.0,True,0,handpump,user-group,never pay,good,insufficient,groundwater,hand pump
7,0,34.803164,-2e-08,lake victoria,17,1,1.0,False,0,handpump,parastatal,never pay,salty,enough,groundwater,hand pump
8,1043,35.078078,-10.79965,ruvuma / southern coast,10,3,50.0,True,1990,gravity,user-group,unknown,good,dry,groundwater,communal standpipe
9,2117,33.933526,-8.957378,rufiji,11,3,1.0,False,1974,gravity,user-group,pay when scheme fails,good,enough,groundwater,communal standpipe


# Data Conversion

## Label Encoding

#### The permit column 
The permit column contains boolean information which can also be interpreted in a binary format.

In [4]:
# Using label encoder to transform the permit column
le = LabelEncoder()
le.fit(X_train.permit)
X_train.permit = le.transform(X_train.permit)
X_train.permit.value_counts()

1    12374
0     5433
Name: permit, dtype: int64

In [5]:
#  reviweing the new dataset
X_train.head(10)

Unnamed: 0,gps_height,longitude,latitude,basin,region_code,district_code,population,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,340,39.483463,-10.60527,ruvuma / southern coast,9,4,300.0,1,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe
1,0,34.081729,-8.789536,rufiji,12,7,1.0,1,0,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe
2,0,34.294701,-8.701257,rufiji,12,7,1.0,1,0,gravity,user-group,never pay,good,seasonal,surface,communal standpipe
3,123,37.85137,-7.197111,wami / ruvu,5,2,250.0,1,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other
4,1209,33.591998,-2.129478,lake victoria,20,4,300.0,0,2008,other,user-group,unknown,unknown,unknown,groundwater,other
5,1170,32.908859,-1.936028,lake victoria,19,1,500.0,1,1996,other,user-group,never pay,good,insufficient,groundwater,other
6,0,31.47467,-1.471748,lake victoria,18,2,1.0,1,0,handpump,user-group,never pay,good,insufficient,groundwater,hand pump
7,0,34.803164,-2e-08,lake victoria,17,1,1.0,0,0,handpump,parastatal,never pay,salty,enough,groundwater,hand pump
8,1043,35.078078,-10.79965,ruvuma / southern coast,10,3,50.0,1,1990,gravity,user-group,unknown,good,dry,groundwater,communal standpipe
9,2117,33.933526,-8.957378,rufiji,11,3,1.0,0,1974,gravity,user-group,pay when scheme fails,good,enough,groundwater,communal standpipe


#### The target column
The target column can either be a 1 or 0

In [6]:
y_train.value_counts()

non functional             14926
functional needs repair     2881
Name: status_group, dtype: int64

In [7]:
# le = LabelEncoder()
y_train_transformed = y_train.to_frame().status_group.apply(lambda x: 1 if x == 'non functional' else 0)
y_train_transformed.value_counts()

1    14926
0     2881
Name: status_group, dtype: int64

1 represents non functional water pumps and 0 represents functional but need repair water pumps

# Data Scaling

All of the features of numerical data type are in a different scale this will hinder the training process of a model.

In [8]:
X_train

Unnamed: 0,gps_height,longitude,latitude,basin,region_code,district_code,population,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,340,39.483463,-10.605272,ruvuma / southern coast,9,4,300.0,1,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe
1,0,34.081729,-8.789536,rufiji,12,7,1.0,1,0,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe
2,0,34.294701,-8.701257,rufiji,12,7,1.0,1,0,gravity,user-group,never pay,good,seasonal,surface,communal standpipe
3,123,37.851370,-7.197111,wami / ruvu,5,2,250.0,1,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other
4,1209,33.591998,-2.129478,lake victoria,20,4,300.0,0,2008,other,user-group,unknown,unknown,unknown,groundwater,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,0,32.817031,-2.526898,lake victoria,19,5,1.0,1,0,handpump,user-group,never pay,good,dry,groundwater,hand pump
17803,0,32.791439,-5.790918,lake tanganyika,14,5,1.0,1,0,other,user-group,never pay,good,dry,groundwater,other
17804,0,33.184941,-2.944274,lake victoria,19,7,1.0,1,0,other,user-group,other,milky,insufficient,groundwater,other
17805,0,34.117387,-3.820336,internal,17,8,1.0,1,0,gravity,parastatal,unknown,good,insufficient,surface,communal standpipe


In [9]:
numerics = ['gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'permit', 'construction_year'] # selecting columns to scale
numericals = X_train[numerics]

# selecting non-numerical dtypes.
not_numericals = X_train.drop(numerics, axis=1)

In [10]:
# Using standardscaler I will set all numerical values to be on the same scale.
sc = StandardScaler()
numericals_scaled = sc.fit_transform(numericals)

numericals_scaled_df = pd.DataFrame(numericals_scaled, columns=numericals.columns, index=numericals.index)

# dropping the numerical columns and then adding the new scaled columns
X_train_scaled = pd.concat([numericals_scaled_df, not_numericals], axis = 1)
X_train_scaled

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671,0.66262,0.801349,ruvuma / southern coast,submersible,user-group,never pay,good,dry,groundwater,communal standpipe
1,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954,0.66262,-1.231119,rufiji,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe
2,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954,0.66262,-1.231119,rufiji,gravity,user-group,never pay,good,seasonal,surface,communal standpipe
3,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095,0.66262,0.816731,wami / ruvu,other,user-group,pay monthly,salty,insufficient,groundwater,other
4,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671,-1.50916,0.828011,lake victoria,other,user-group,unknown,unknown,unknown,groundwater,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954,0.66262,-1.231119,lake victoria,handpump,user-group,never pay,good,dry,groundwater,hand pump
17803,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954,0.66262,-1.231119,lake tanganyika,other,user-group,never pay,good,dry,groundwater,other
17804,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954,0.66262,-1.231119,lake victoria,other,user-group,other,milky,insufficient,groundwater,other
17805,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954,0.66262,-1.231119,internal,gravity,parastatal,unknown,good,insufficient,surface,communal standpipe


## Ordinal encoding

In [11]:
# function for encoding remaining categorical columns
def myScaler(set):
    numerics = ['gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'permit', 'construction_year'] # selecting columns to scale
    columns_to_scale = set.drop(numerics, axis=1)
    
    # Using standardscaler I will set all numerical values to be on the same scale.
    sc = StandardScaler()
    numericals_scaled = sc.fit_transform(columns_to_scale)

    numericals_scaled_df = pd.DataFrame(numericals_scaled, columns=columns_to_scale.columns, index=columns_to_scale.index)

    # dropping the numerical columns and then adding the new scaled columns
    result = pd.concat([set[numerics], numericals_scaled_df], axis = 1)

    return result

In [12]:
# Using ordinalencoder encoder I will convert the categorical values into categorical columns into numbers 
columns = ['basin', 'extraction_type_class', 'management_group', 'payment', 'quality_group', 'quantity_group', 'source_class', 'waterpoint_type_group']
oe = OrdinalEncoder(cols=columns)
X_train_ordinal_encoded = oe.fit_transform(X_train_scaled)
X_train_ordinal_encoded

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671,0.66262,0.801349,1,1,1,1,1,1,1,1
1,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954,0.66262,-1.231119,2,2,1,2,1,2,2,1
2,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954,0.66262,-1.231119,2,2,1,1,1,3,2,1
3,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095,0.66262,0.816731,3,3,1,2,2,2,1,2
4,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671,-1.50916,0.828011,4,3,1,3,3,4,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954,0.66262,-1.231119,4,4,1,1,1,1,1,3
17803,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954,0.66262,-1.231119,8,3,1,1,1,1,1,2
17804,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954,0.66262,-1.231119,4,3,1,6,4,2,1,2
17805,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954,0.66262,-1.231119,9,2,2,3,1,2,2,1


In [13]:
X_train_ordinal_encoded = myScaler(X_train_ordinal_encoded)
X_train_ordinal_encoded

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671,0.66262,0.801349,-1.550911,-1.417821,-0.314342,-0.783634,-0.399189,-1.266481,-0.529080,-0.832690
1,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954,0.66262,-1.231119,-1.159302,-0.652663,-0.314342,-0.190823,-0.399189,-0.677096,1.788588,-0.832690
2,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954,0.66262,-1.231119,-1.159302,-0.652663,-0.314342,-0.783634,-0.399189,-0.087711,1.788588,-0.832690
3,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095,0.66262,0.816731,-0.767692,0.112494,-0.314342,-0.190823,0.853827,-0.677096,-0.529080,0.319603
4,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671,-1.50916,0.828011,-0.376083,0.112494,-0.314342,0.401987,2.106843,0.501674,-0.529080,0.319603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954,0.66262,-1.231119,-0.376083,0.877651,-0.314342,-0.783634,-0.399189,-1.266481,-0.529080,1.471897
17803,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954,0.66262,-1.231119,1.190354,0.112494,-0.314342,-0.783634,-0.399189,-1.266481,-0.529080,0.319603
17804,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954,0.66262,-1.231119,-0.376083,0.112494,-0.314342,2.180420,3.359859,-0.677096,-0.529080,0.319603
17805,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954,0.66262,-1.231119,1.581964,-0.652663,1.079108,0.401987,-0.399189,-0.677096,1.788588,-0.832690


## Onehot encoding

Onehot encoding is necessary for logistic regression and KNN models

In [14]:
X_train_encoded = pd.get_dummies(X_train_scaled, drop_first=True)
X_train_encoded

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin_lake nyasa,basin_lake rukwa,basin_lake tanganyika,basin_lake victoria,basin_pangani,basin_rufiji,basin_ruvuma / southern coast,basin_wami / ruvu,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,quantity_group_enough,quantity_group_insufficient,quantity_group_seasonal,quantity_group_unknown,source_class_surface,source_class_unknown,waterpoint_type_group_communal standpipe,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671,0.66262,0.801349,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954,0.66262,-1.231119,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0
2,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954,0.66262,-1.231119,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
3,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095,0.66262,0.816731,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
4,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671,-1.50916,0.828011,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954,0.66262,-1.231119,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
17803,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954,0.66262,-1.231119,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
17804,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954,0.66262,-1.231119,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
17805,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954,0.66262,-1.231119,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0


In [15]:
X_train_encoded = myScaler(X_train_encoded)
X_train_encoded

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin_lake nyasa,basin_lake rukwa,basin_lake tanganyika,basin_lake victoria,basin_pangani,basin_rufiji,basin_ruvuma / southern coast,basin_wami / ruvu,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,quantity_group_enough,quantity_group_insufficient,quantity_group_seasonal,quantity_group_unknown,source_class_surface,source_class_unknown,waterpoint_type_group_communal standpipe,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671,0.66262,0.801349,-0.272276,-0.24457,-0.365450,-0.484293,-0.398664,-0.340700,3.031980,-0.334300,-0.535902,-0.277124,-0.493551,-0.072457,2.973812,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,-0.463142,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,-0.597177,-0.258207,-0.143441,-0.528430,-0.070874,0.918978,-0.569677,-0.090922,-0.505487
1,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,-0.365450,-0.484293,-0.398664,2.935134,-0.329817,-0.334300,-0.535902,-0.277124,-0.493551,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,2.979315,-0.342545,-0.24172,-0.463142,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,1.674545,-0.258207,-0.143441,1.892397,-0.070874,0.918978,-0.569677,-0.090922,-0.505487
2,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,-0.365450,-0.484293,-0.398664,2.935134,-0.329817,-0.334300,-0.535902,-0.277124,-0.493551,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,-0.463142,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,-0.597177,3.872867,-0.143441,1.892397,-0.070874,0.918978,-0.569677,-0.090922,-0.505487
3,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095,0.66262,0.816731,-0.272276,-0.24457,-0.365450,-0.484293,-0.398664,-0.340700,-0.329817,2.991328,-0.535902,-0.277124,2.026134,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,2.979315,-0.342545,-0.24172,-0.463142,-0.048039,-2.096678,-0.116884,3.029097,-0.256843,-0.859927,1.674545,-0.258207,-0.143441,-0.528430,-0.070874,-1.088165,-0.569677,-0.090922,1.978290
4,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671,-1.50916,0.828011,-0.272276,-0.24457,-0.365450,2.064866,-0.398664,-0.340700,-0.329817,-0.334300,-0.535902,-0.277124,2.026134,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,2.159167,-0.048039,-2.096678,-0.116884,-0.330131,3.893431,-0.859927,-0.597177,-0.258207,6.971490,-0.528430,-0.070874,-1.088165,-0.569677,-0.090922,1.978290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,-0.365450,2.064866,-0.398664,-0.340700,-0.329817,-0.334300,1.866013,-0.277124,-0.493551,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,-0.463142,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,-0.597177,-0.258207,-0.143441,-0.528430,-0.070874,-1.088165,1.755382,-0.090922,-0.505487
17803,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,2.736349,-0.484293,-0.398664,-0.340700,-0.329817,-0.334300,-0.535902,-0.277124,2.026134,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,-0.463142,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,-0.597177,-0.258207,-0.143441,-0.528430,-0.070874,-1.088165,-0.569677,-0.090922,1.978290
17804,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,-0.365450,2.064866,-0.398664,-0.340700,-0.329817,-0.334300,-0.535902,-0.277124,2.026134,-0.072457,-0.336269,-0.048039,-0.115144,-0.162114,-0.113125,0.345098,7.731413,-0.185442,-0.335648,-0.342545,-0.24172,-0.463142,-0.048039,-2.096678,8.555456,-0.330131,-0.256843,-0.859927,1.674545,-0.258207,-0.143441,-0.528430,-0.070874,-1.088165,-0.569677,-0.090922,1.978290
17805,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954,0.66262,-1.231119,-0.272276,-0.24457,-0.365450,-0.484293,-0.398664,-0.340700,-0.329817,-0.334300,-0.535902,-0.277124,-0.493551,-0.072457,-0.336269,-0.048039,-0.115144,6.168504,-0.113125,-2.897729,-0.129342,-0.185442,-0.335648,-0.342545,-0.24172,2.159167,-0.048039,0.476945,-0.116884,-0.330131,-0.256843,-0.859927,1.674545,-0.258207,-0.143441,1.892397,-0.070874,0.918978,-0.569677,-0.090922,-0.505487


## Dealing with the class imbalance

In [16]:
y_train_transformed.value_counts(normalize=True)

1    0.83821
0    0.16179
Name: status_group, dtype: float64

It a model which predicts 1 all through, then there is a 84% chance it is always correct. To deal with this class imbalance problem I will use SMOTE function. SMOTE uses a statistical technique which increases the number of cases in the set dataset in a balanced way.

In [17]:
sm = SMOTE(random_state=21)
X_train_re, y_train_re = sm.fit_resample(X_train_encoded, y_train_transformed)
X_train_re2, y_train_re2 = sm.fit_resample(X_train_ordinal_encoded, y_train_transformed)

In [18]:
y_train_re.value_counts()

1    14926
0    14926
Name: status_group, dtype: int64

In [19]:
# exporting the encoded X_train_encoded and y_train_transformed
X_train_re.to_csv('../analysis-dfs/X_train_prepared.csv')
y_train_re.to_csv('../analysis-dfs/y_train_prepared.csv')

In [20]:
# exporting the X_train_ordinal encoded to be used in decision tree modelling
X_train_re2.to_csv('../analysis-dfs/X_train_ordinal_encoded.csv')
y_train_re2.to_csv('../analysis-dfs/y_train_prepared_2.csv')

In [21]:
def data_preparation(set, target):
    # Data conversion 
    # 1. Label encoding permit column
    le = LabelEncoder()
    le.fit(set.permit)
    set.permit = le.transform(set.permit)
  

    # 2. Label encoding the target column
    # Conveting target variable to a binary foramt
    target_transformed = target.to_frame().status_group.apply(lambda x: 1 if x == 'non functional' else 0)


    # Data Scaling
    numerics = ['gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'permit', 'construction_year'] # selecting columns to scale
    numericals = set[numerics]
    # selecting non-numerical dtypes.
    not_numericals = set.drop(numericals, axis=1)
    sc = StandardScaler()
    numericals_scaled = sc.fit_transform(numericals)
    numericals_scaled_df = pd.DataFrame(numericals_scaled, columns=numericals.columns, index=numericals.index)
    # dropping the numerical columns and then adding the new scaled columns
    set_scaled = pd.concat([numericals_scaled_df, not_numericals], axis = 1)

    # Ordinal encoding
    columns = ['basin', 'extraction_type_class', 'management_group', 'payment', 'quality_group', 'quantity_group', 'source_class', 'waterpoint_type_group']
    oe = OrdinalEncoder(cols=columns)
    set_ordinal_encoded = oe.fit_transform(set_scaled)
    set_ordinal_encoded = myScaler(set_ordinal_encoded)

    # 3. Onehot encoding the set
    set_onehotencoded = pd.get_dummies(set, drop_first=True)
    set_onehotencoded = myScaler(set_onehotencoded)


   

   

    return set_ordinal_encoded, set_onehotencoded, target_transformed


In [22]:
X_test_ordinal_encoded, X_test_onehotencoded, y_test_prepared = data_preparation(X_test, y_test)

In [23]:
X_test_ordinal_encoded.head(10)

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,-0.868886,-1.376148,1.495194,0.03924,0.064196,-0.688817,0.680148,-1.235246,-1.321296,-1.580502,-0.30478,-0.761908,-0.429759,-1.143011,-0.536674,-1.420138
1,0.523195,0.036112,-0.207269,-0.206484,-0.306611,1.341607,-1.470269,0.788112,-0.97005,-0.839423,-0.30478,-0.761908,-0.429759,-0.204935,-0.536674,0.018679
2,-0.443312,1.315915,-1.455503,-0.452208,-0.306611,0.764396,-1.470269,0.799398,-0.618803,-0.098344,-0.30478,-0.761908,0.560075,0.733141,-0.536674,1.457495
3,-0.868886,-0.451021,-1.297104,-0.255629,-0.306611,-0.688817,0.680148,-1.235246,-0.267557,0.642735,-0.30478,-0.761908,-0.429759,0.733141,-0.536674,0.018679
4,-0.868886,-0.823917,1.024272,0.088385,-0.306611,-0.688817,0.680148,-1.235246,-1.321296,-1.580502,-0.30478,-0.761908,1.549909,-1.143011,-0.536674,-1.420138
5,-0.868886,-0.831868,-1.063994,-0.255629,-0.028506,-0.688817,-1.470269,-1.235246,0.083689,-0.098344,-0.30478,-0.26547,-0.429759,-1.143011,-0.536674,1.457495
6,-0.868886,-0.949841,1.001568,0.088385,-0.121208,-0.688817,0.680148,-1.235246,-1.321296,-1.580502,-0.30478,-0.761908,2.539742,-0.204935,-0.536674,-1.420138
7,1.441376,-0.216667,0.32694,-0.206484,-0.399313,-0.688817,0.680148,0.80658,-0.97005,-0.098344,-0.30478,0.230968,0.560075,0.733141,-0.536674,1.457495
8,-0.868886,-0.653582,0.698146,-0.009905,0.064196,-0.688817,-1.470269,-1.235246,-0.97005,0.642735,1.080354,0.230968,-0.429759,-1.143011,1.799328,0.018679
9,-0.081651,0.914239,-0.390329,-0.599642,-0.028506,-0.688817,0.680148,0.797346,0.434936,0.642735,-0.30478,-0.761908,-0.429759,-1.143011,1.799328,0.018679


In [24]:
X_test_onehotencoded.head(10)

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,permit,construction_year,basin_lake nyasa,basin_lake rukwa,basin_lake tanganyika,basin_lake victoria,basin_pangani,basin_rufiji,basin_ruvuma / southern coast,basin_wami / ruvu,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,quantity_group_enough,quantity_group_insufficient,quantity_group_seasonal,quantity_group_unknown,source_class_surface,source_class_unknown,waterpoint_type_group_communal standpipe,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,0.0,31.4282,-1.205196,18.0,7.0,1.0,1,0.0,-0.266585,-0.229637,-0.36326,2.051589,-0.385798,-0.357687,-0.342821,-0.34306,1.895778,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,-0.85121,1.690373,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,1.792557,-0.091997,-0.508142
1,893.0,35.214275,-6.352664,13.0,3.0,300.0,0,1972.0,-0.266585,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,-0.34306,-0.527488,3.689064,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,1.174798,-0.591586,-0.266867,-0.15724,-0.536415,-0.056188,0.907725,-0.557862,-0.091997,-0.508142
2,273.0,38.645251,-10.126753,8.0,3.0,215.0,0,1983.0,-0.266585,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,2.916972,-0.34306,-0.527488,-0.271071,2.015873,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,-2.09782,-0.116433,-0.335369,3.951503,-0.85121,-0.591586,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,-0.557862,-0.091997,1.967954
3,0.0,33.90834,-9.647828,12.0,3.0,1.0,1,0.0,3.751153,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,-0.34306,-0.527488,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,-0.85121,-0.591586,-0.266867,-0.15724,-0.536415,-0.056188,0.907725,-0.557862,-0.091997,-0.508142
4,0.0,32.908657,-2.629051,19.0,3.0,1.0,1,0.0,-0.266585,-0.229637,-0.36326,2.051589,-0.385798,-0.357687,-0.342821,-0.34306,1.895778,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,-2.09782,8.588638,-0.335369,-0.253068,-0.85121,1.690373,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,1.792557,-0.091997,-0.508142
5,0.0,32.887342,-8.943008,12.0,6.0,1.0,0,0.0,-0.266585,4.354693,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,-0.34306,-0.527488,-0.271071,2.015873,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,4.195235,-0.444889,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,-0.85121,1.690373,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,-0.557862,-0.091997,1.967954
6,0.0,32.571072,-2.697696,19.0,5.0,1.0,1,0.0,-0.266585,-0.229637,-0.36326,2.051589,-0.385798,-0.357687,-0.342821,-0.34306,1.895778,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,-2.09782,-0.116433,2.981786,-0.253068,1.174798,-0.591586,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,1.792557,-0.091997,-0.508142
7,1482.0,34.53661,-4.737461,13.0,2.0,1.0,1,1990.0,-0.266585,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,-0.34306,-0.527488,-0.271071,2.015873,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,2.24775,-0.044394,-2.09782,-0.116433,-0.335369,3.951503,-0.85121,-0.591586,-0.266867,-0.15724,-0.536415,-0.056188,-1.101655,-0.557862,-0.091997,1.967954
8,0.0,33.365301,-3.615105,17.0,7.0,1.0,0,0.0,-0.266585,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,-0.34306,-0.527488,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,6.222439,-0.110505,-2.971081,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,2.24775,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,-0.85121,1.690373,-0.266867,-0.15724,1.86423,-0.056188,0.907725,-0.557862,-0.091997,-0.508142
9,505.0,37.568415,-6.906156,5.0,6.0,1.0,1,1981.0,-0.266585,-0.229637,-0.36326,-0.487427,-0.385798,-0.357687,-0.342821,2.914942,-0.527488,-0.271071,-0.496063,-0.073521,-0.339708,-0.044394,-0.12264,-0.160709,-0.110505,0.336578,-0.128037,-0.181806,-0.339708,-0.336578,-0.238366,-0.444889,-0.044394,0.476685,-0.116433,-0.335369,-0.253068,-0.85121,1.690373,-0.266867,-0.15724,1.86423,-0.056188,0.907725,-0.557862,-0.091997,-0.508142


In [25]:
X_test_ordinal_encoded.shape

(7626, 16)

In [26]:
X_test_onehotencoded.shape

(7626, 47)

In [27]:
y_test_prepared.shape

(7626,)

In [28]:
y_test_prepared.value_counts()

1    6466
0    1160
Name: status_group, dtype: int64