In [34]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Use seaborn's "whitegrid" style for all plots.
sns.set_style("whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# setting font size for all plots
font = {'size'   : 16}

# Apply the font settings above to matplotlib's rc configuration.
plt.rc('font', **font)

In [35]:
# Load the cleaned train/test feature and target CSVs.
#
# The train files carry the saved index as an 'Unnamed: 0' column. It is dropped
# from every frame (train AND test) so all splits share the same columns;
# previously only the train frames were cleaned. errors='ignore' keeps the drop
# safe for any file that was written without the index column.
X_train = pd.read_csv('../analysis-dfs/X_train_cleaned.csv').drop(columns='Unnamed: 0', errors='ignore')
y_train = pd.read_csv('../analysis-dfs/y_train.csv').drop(columns='Unnamed: 0', errors='ignore')

X_test = pd.read_csv('../analysis-dfs/X_test_cleaned.csv').drop(columns='Unnamed: 0', errors='ignore')
y_test = pd.read_csv('../analysis-dfs/y_test.csv').drop(columns='Unnamed: 0', errors='ignore')

In [36]:
# Report the training set's (rows, columns) and preview its first ten rows.
display((len(X_train.index), len(X_train.columns)))
X_train.head(n=10)

(17807, 17)

Unnamed: 0,gps_height,installer,longitude,latitude,basin,region_code,district_code,population,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group
0,340,finw,39.483463,-10.60527,ruvuma / southern coast,9,4,300.0,True,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe
1,0,unknown,34.081729,-8.789536,rufiji,12,7,1.0,True,unknown,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe
2,0,unknown,34.294701,-8.701257,rufiji,12,7,1.0,True,unknown,gravity,user-group,never pay,good,seasonal,surface,communal standpipe
3,123,dwe,37.85137,-7.197111,wami / ruvu,5,2,250.0,True,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other
4,1209,dwe,33.591998,-2.129478,lake victoria,20,4,300.0,False,2008,other,user-group,unknown,unknown,unknown,groundwater,other
5,1170,dwe,32.908859,-1.936028,lake victoria,19,1,500.0,True,1996,other,user-group,never pay,good,insufficient,groundwater,other
6,0,dwe,31.47467,-1.471748,lake victoria,18,2,1.0,True,unknown,handpump,user-group,never pay,good,insufficient,groundwater,hand pump
7,0,dwe,34.803164,-2e-08,lake victoria,17,1,1.0,False,unknown,handpump,parastatal,never pay,salty,enough,groundwater,hand pump
8,1043,government,35.078078,-10.79965,ruvuma / southern coast,10,3,50.0,True,1990,gravity,user-group,unknown,good,dry,groundwater,communal standpipe
9,2117,commu,33.933526,-8.957378,rufiji,11,3,1.0,False,1974,gravity,user-group,pay when scheme fails,good,enough,groundwater,communal standpipe


# Data Scaling

All of the numerical features are on different scales, which will hinder the training process of a model.

In [37]:
# Inspect each column's dtype and non-null count to see which features are numerical.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17807 entries, 0 to 17806
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gps_height             17807 non-null  int64  
 1   installer              17807 non-null  object 
 2   longitude              17807 non-null  float64
 3   latitude               17807 non-null  float64
 4   basin                  17807 non-null  object 
 5   region_code            17807 non-null  int64  
 6   district_code          17807 non-null  int64  
 7   population             17807 non-null  float64
 8   permit                 17807 non-null  bool   
 9   construction_year      17807 non-null  object 
 10  extraction_type_class  17807 non-null  object 
 11  management_group       17807 non-null  object 
 12  payment                17807 non-null  object 
 13  quality_group          17807 non-null  object 
 14  quantity_group         17807 non-null  object 
 15  so

In [38]:
# Dtypes treated as numerical; columns of any other dtype count as non-numerical.
numerics = ['int64', 'float64']

# Partition the training features by dtype: columns whose dtype is listed in
# `numerics`, and all remaining columns.
numericals, not_numericals = (
    X_train.select_dtypes(include=numerics),
    X_train.select_dtypes(exclude=numerics),
)

In [42]:
# Standardize the numerical features so they all share a common scale.
# The fitted scaler is kept in `sc`.
sc = StandardScaler()
numericals_scaled = sc.fit(numericals).transform(numericals)

# Wrap the scaled array back into a DataFrame carrying the original labels.
numericals_scaled_df = pd.DataFrame(
    data=numericals_scaled,
    index=numericals.index,
    columns=numericals.columns,
)

# Recombine: non-numerical columns first, followed by the scaled numerical ones.
X_train_scaled = pd.concat(objs=[not_numericals, numericals_scaled_df], axis='columns')
X_train_scaled

Unnamed: 0,installer,basin,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group,gps_height,longitude,latitude,region_code,district_code,population
0,finw,ruvuma / southern coast,True,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671
1,unknown,rufiji,True,unknown,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954
2,unknown,rufiji,True,unknown,gravity,user-group,never pay,good,seasonal,surface,communal standpipe,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954
3,dwe,wami / ruvu,True,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other,-0.693453,1.097498,-0.507180,-0.597103,-0.385566,1.025095
4,dwe,lake victoria,False,2008,other,user-group,unknown,unknown,unknown,groundwater,other,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,sengerema water department,lake victoria,True,unknown,handpump,user-group,never pay,good,dry,groundwater,hand pump,-0.884573,-0.834161,1.045397,0.116345,-0.098467,-0.675954
17803,halmashauri ya wilaya sikonge,lake tanganyika,True,unknown,other,user-group,never pay,good,dry,groundwater,other,-0.884573,-0.843981,-0.039702,-0.138458,-0.098467,-0.675954
17804,villag,lake victoria,True,unknown,other,user-group,other,milky,insufficient,groundwater,other,-0.884573,-0.692996,0.906643,0.116345,0.092932,-0.675954
17805,rc church,internal,True,unknown,gravity,parastatal,unknown,good,insufficient,surface,communal standpipe,-0.884573,-0.335219,0.615404,0.014424,0.188631,-0.675954


# Data Conversion

#### The permit column 
The permit column contains boolean information which can also be interpreted in a binary format.

In [43]:
# Count the True/False values in `permit` before encoding, for comparison afterwards.
X_train_scaled.permit.value_counts()

True     12374
False     5433
Name: permit, dtype: int64

In [44]:
# Encode the boolean permit flag as integers (False -> 0, True -> 1).
#
# The encoder is fitted on, and the values are read from, `not_numericals.permit`
# (the untouched boolean column) rather than the column being overwritten.
# This makes the cell idempotent: re-running it no longer refits `le` on the
# already-encoded 0/1 integers, so `le.classes_` stays [False, True].
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values; it is kept here in case later cells reuse `le`.
le = LabelEncoder()
le.fit(not_numericals.permit)
# Bracket assignment makes it explicit that an existing column is replaced.
X_train_scaled['permit'] = le.transform(not_numericals.permit)
X_train_scaled.permit.value_counts()

1    12374
0     5433
Name: permit, dtype: int64

In [46]:
# Reviewing the new dataset: first ten rows after scaling and permit encoding
X_train_scaled.head(10)

Unnamed: 0,installer,basin,permit,construction_year,extraction_type_class,management_group,payment,quality_group,quantity_group,source_class,waterpoint_type_group,gps_height,longitude,latitude,region_code,district_code,population
0,finw,ruvuma / southern coast,1,1982,submersible,user-group,never pay,good,dry,groundwater,communal standpipe,-0.356274,1.723728,-1.640197,-0.393261,-0.194167,1.366671
1,unknown,rufiji,1,unknown,gravity,user-group,pay monthly,good,insufficient,surface,communal standpipe,-0.884573,-0.348901,-1.036569,-0.240379,0.092932,-0.675954
2,unknown,rufiji,1,unknown,gravity,user-group,never pay,good,seasonal,surface,communal standpipe,-0.884573,-0.267184,-1.007222,-0.240379,0.092932,-0.675954
3,dwe,wami / ruvu,1,1997,other,user-group,pay monthly,salty,insufficient,groundwater,other,-0.693453,1.097498,-0.50718,-0.597103,-0.385566,1.025095
4,dwe,lake victoria,0,2008,other,user-group,unknown,unknown,unknown,groundwater,other,0.993994,-0.536809,1.177516,0.167305,-0.194167,1.366671
5,dwe,lake victoria,1,1996,other,user-group,never pay,good,insufficient,groundwater,other,0.933395,-0.798928,1.241827,0.116345,-0.481266,2.732976
6,dwe,lake victoria,1,unknown,handpump,user-group,never pay,good,insufficient,groundwater,hand pump,-0.884573,-1.349221,1.396173,0.065384,-0.385566,-0.675954
7,dwe,lake victoria,0,unknown,handpump,parastatal,never pay,salty,enough,groundwater,hand pump,-0.884573,-0.072088,1.885445,0.014424,-0.481266,-0.675954
8,government,ruvuma / southern coast,1,1990,gravity,user-group,unknown,good,dry,groundwater,communal standpipe,0.73606,0.033395,-1.704817,-0.3423,-0.289867,-0.341209
9,commu,rufiji,0,1974,gravity,user-group,pay when scheme fails,good,enough,groundwater,communal standpipe,2.404861,-0.405766,-1.092367,-0.29134,-0.289867,-0.675954
