In [1]:
# Imports

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Obtain csv files and load them into the notebook

In [4]:
df = pd.read_csv("data/test_set_values.csv")
df

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14845,39307,0.0,2011-02-24,Danida,34,Da,38.852669,-6.582841,Kwambwezi,0,...,never pay,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe
14846,18990,1000.0,2011-03-21,Hiap,0,HIAP,37.451633,-5.350428,Bonde La Mkondoa,0,...,annually,salty,salty,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump
14847,28749,0.0,2013-03-04,,1476,,34.739804,-4.585587,Bwawani,0,...,never pay,soft,good,insufficient,insufficient,dam,dam,surface,communal standpipe,communal standpipe
14848,33492,0.0,2013-02-18,Germany,998,DWE,35.432732,-10.584159,Kwa John,0,...,never pay,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     14850 non-null  int64  
 1   amount_tsh             14850 non-null  float64
 2   date_recorded          14850 non-null  object 
 3   funder                 13981 non-null  object 
 4   gps_height             14850 non-null  int64  
 5   installer              13973 non-null  object 
 6   longitude              14850 non-null  float64
 7   latitude               14850 non-null  float64
 8   wpt_name               14850 non-null  object 
 9   num_private            14850 non-null  int64  
 10  basin                  14850 non-null  object 
 11  subvillage             14751 non-null  object 
 12  region                 14850 non-null  object 
 13  region_code            14850 non-null  int64  
 14  district_code          14850 non-null  int64  
 15  lg

In [6]:
# Create a dataframe with just numerical columns
# Numerical columns

numerical_columns = df.select_dtypes("number")
numerical_columns

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000
...,...,...,...,...,...,...,...,...,...,...
14845,39307,0.0,34,38.852669,-6.582841,0,6,1,20,1988
14846,18990,1000.0,0,37.451633,-5.350428,0,4,7,2960,1994
14847,28749,0.0,1476,34.739804,-4.585587,0,13,2,200,2010
14848,33492,0.0,998,35.432732,-10.584159,0,10,2,150,2009


In [7]:
# Replace the zero values with the column median

gps_height_median = numerical_columns['gps_height']
gps_height_median.replace(to_replace = 0, value = gps_height_median.median(), inplace=True)

In [8]:
# Replace the zero values with the column median

construction_year_median = numerical_columns['construction_year']
construction_year_median.replace(to_replace = 0, value = construction_year_median.median(), inplace=True)

In [9]:
# Add basin column
df_basin = pd.concat([numerical_columns, df['basin']], axis=1)
df_basin.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,basin
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,Internal
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,Pangani
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,Internal
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,Ruvuma / Southern Coast
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,Ruvuma / Southern Coast


In [10]:
# Dummy code "basin"

df_dummy_basin = pd.get_dummies(df_basin, drop_first=True)
print('Shape before dummy coding: ', df_basin.shape)
print('Shape after dummy coding: ', df_dummy_basin.shape)

Shape before dummy coding:  (14850, 11)
Shape after dummy coding:  (14850, 18)


In [11]:
# Add "region" column

df_region = pd.concat([df_dummy_basin, df['region']], axis=1)
df_region.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,0,0,0,0,0,0,0,0,Manyara
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,0,0,0,0,1,0,0,0,Arusha
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,0,0,0,0,0,0,0,0,Singida
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,0,0,0,0,0,0,1,0,Lindi
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,0,0,0,0,0,0,1,0,Ruvuma


In [12]:
# Dummy code "region"

df_dummy_region = pd.get_dummies(df_region, drop_first=True)

print('Shape before dummy coding: ', df_dummy_basin.shape)

print('Shape after dummy coding: ', df_dummy_region.shape)

Shape before dummy coding:  (14850, 18)
Shape after dummy coding:  (14850, 38)


In [13]:
# Add "quality_group" column

df_quality_group = pd.concat([df_dummy_region, df["quality_group"]], axis=1)
df_quality_group.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,region_Mtwara,region_Mwanza,region_Pwani,region_Rukwa,region_Ruvuma,region_Shinyanga,region_Singida,region_Tabora,region_Tanga,quality_group
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,0,0,0,0,0,0,0,0,0,good
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,0,0,0,0,0,0,0,0,0,good


In [14]:
# Dummy code "quality_group"

df_dummy_quality_group = pd.get_dummies(df_quality_group, drop_first=True)

print('Shape before dummy coding: ', df_dummy_region.shape)
print('Shape after dummy coding: ', df_dummy_quality_group.shape)

Shape before dummy coding:  (14850, 38)
Shape after dummy coding:  (14850, 43)


In [15]:
df_dummy_quality_group.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,region_Ruvuma,region_Shinyanga,region_Singida,region_Tabora,region_Tanga,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,0,0,0,0,0,0,1,0,0,0
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,0,0,0,0,0,0,1,0,0,0
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,...,0,0,1,0,0,0,1,0,0,0
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,...,0,0,0,0,0,0,1,0,0,0
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,...,1,0,0,0,0,0,1,0,0,0


In [16]:
# Add "extraction_type_class" column

df_extraction_type_class = pd.concat([df_dummy_quality_group, df["extraction_type_class"]], axis=1)
df_extraction_type_class

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,region_Shinyanga,region_Singida,region_Tabora,region_Tanga,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,extraction_type_class
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,0,0,0,0,0,1,0,0,0,other
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,0,0,0,0,0,1,0,0,0,gravity
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,...,0,1,0,0,0,1,0,0,0,other
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,...,0,0,0,0,0,1,0,0,0,other
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,...,0,0,0,0,0,1,0,0,0,gravity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14845,39307,0.0,34,38.852669,-6.582841,0,6,1,20,1988,...,0,0,0,0,0,1,0,0,0,motorpump
14846,18990,1000.0,344,37.451633,-5.350428,0,4,7,2960,1994,...,0,0,0,1,0,0,0,1,0,handpump
14847,28749,0.0,1476,34.739804,-4.585587,0,13,2,200,2010,...,0,1,0,0,0,1,0,0,0,gravity
14848,33492,0.0,998,35.432732,-10.584159,0,10,2,150,2009,...,0,0,0,0,0,1,0,0,0,gravity


In [17]:
# Dummy code "extraction_type_class"

df_dummy_extraction_type_class = pd.get_dummies(df_extraction_type_class, drop_first=True)

print('Shape before dummy coding: ', df_dummy_quality_group.shape)
print('Shape after dummy coding: ', df_dummy_extraction_type_class.shape)

Shape before dummy coding:  (14850, 43)
Shape after dummy coding:  (14850, 49)


In [18]:
# Check new dataframe looks as is expected

df_dummy_extraction_type_class.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,1,0,0,0,0,0,1,0,0,0
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,1,0,0,0,0,0,0,0,0,0


In [19]:
df_dummy_extraction_type_class.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,1,0,0,0,0,0,1,0,0,0
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,1,0,0,0,0,0,0,0,0,0


In [21]:
# save to csv file 
df_dummy_extraction_type_class.to_csv("data/test_df.csv", index=False)