In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

# plot_confusion_matrix / plot_roc_curve were added in scikit-learn 0.22 and removed in 1.2
# (newer releases provide ConfusionMatrixDisplay / RocCurveDisplay instead).
# If your version does not have them, comment out these imports and just use confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve

In [8]:
# Read the training features, the test features, and the training labels
# from the project's data directory (paths are relative to this notebook).
X_train, X_test, y_train = (
    pd.read_csv('../data/X_train.csv'),
    pd.read_csv('../data/X_test.csv'),
    pd.read_csv('../data/y_train.csv'),
)

In [18]:
# Dimensions of the training features as (rows, columns).
X_train.shape

(59400, 40)

In [20]:
# Dimensions of the test features as (rows, columns).
X_test.shape

(14850, 40)

In [21]:
# Dimensions of the training labels as (rows, columns).
y_train.shape

(59400, 2)

In [22]:
# Fraction of all rows that ended up in the test set. The counts are taken
# from the loaded frames rather than typed in by hand, so the figure stays
# correct if the data files change. A value of 0.2 means an 80/20 split.
len(X_test) / (len(X_train) + len(X_test))

0.2

In [9]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [10]:
# Per-column dtype and non-null count for the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [11]:
# Number of missing values in each column of the training features.
X_train.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
X_train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [17]:
# Row count per target class. The classes are imbalanced, so a resampling
# technique such as SMOTE may be needed before modelling.
y_train['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

# Handle NAs

#### Funder

In [39]:
# Show the rows of the training features where `funder` is missing.
X_train.loc[X_train['funder'].isna()]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group


In [27]:
# Replace missing funders with an explicit 'Unknown' category.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which warns on recent pandas and leaves X_train unchanged
# when Copy-on-Write is enabled.
X_train['funder'] = X_train['funder'].fillna('Unknown')

#### Installer

In [29]:
# Replace missing installers with an explicit 'Unknown' category.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which warns on recent pandas and leaves X_train unchanged
# when Copy-on-Write is enabled.
X_train['installer'] = X_train['installer'].fillna('Unknown')

#### Subvillage

In [43]:
# Show the rows where `subvillage` is missing, limited to the first 20 columns.
X_train.loc[X_train['subvillage'].isna()].iloc[:, :20]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by
342,21127,0.0,2011-03-16,Government Of Tanzania,0,North,36.557631,-6.233394,Kwa Mihinzo,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Sagara,0,True,GeoData Consultants Ltd
360,51558,0.0,2011-03-25,Commu,0,Commu,36.416701,-6.220157,Kwa Emanuel,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Kongwa Urban,0,True,GeoData Consultants Ltd
379,53847,0.0,2011-03-20,World Bank,0,Rhobi,36.729383,-6.084255,Kwa Dimanyi,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd
565,27334,0.0,2011-03-18,World Bank,0,Rhoda,36.696881,-5.993192,Mpande,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Njoge,0,True,GeoData Consultants Ltd
966,17088,0.0,2011-03-11,Water,0,Commu,36.322623,-6.030500,Kwa Charles,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Sejeli,0,True,GeoData Consultants Ltd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59008,16353,0.0,2011-03-23,Commu,0,Commu,36.770490,-6.288555,Kwa Mlima,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Mlali,0,True,GeoData Consultants Ltd
59091,45206,0.0,2011-03-14,Lvia,0,Commu,36.407974,-5.715084,Kwa Emson,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Zoissa,0,True,GeoData Consultants Ltd
59105,12248,0.0,2011-03-19,World Bank,0,Rhobi,36.889359,-5.959966,Kwa Mahimbo,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd
59215,46441,0.0,2011-03-19,World Bank,0,Rhobi,36.854216,-6.010508,Kwa Sila,0,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd


In [46]:
# Keep only the waterpoints located in the 'Wami / Ruvu' basin.
subvillage = X_train.loc[X_train['basin'].eq('Wami / Ruvu')]

# Disabled alternative grouping, kept for reference:
#subvillage.groupby(['basin', 'subvillage']).sum()

In [49]:
# Sum the numeric columns for every (region, subvillage) pair in the basin subset.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns by default, so without it the text/date columns would be
# aggregated too (or raise) instead of producing this numeric-only table.
# NOTE(review): the sums of id, longitude/latitude, the code columns and
# construction_year are not meaningful quantities; this table is mainly useful
# for listing which subvillages occur in each region.
subvillage.groupby(['region', 'subvillage']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
region,subvillage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Dar es Salaam,Amani Gomvu,47940,0.0,-2,39.481425,-6.935378,0,7,3,203,1970
Dar es Salaam,Azimio,71507,100.0,107,39.135178,-6.709691,0,7,1,2569,2010
Dar es Salaam,Basihaya,735,50.0,-19,39.162534,-6.638992,0,7,1,4520,1990
Dar es Salaam,Bondeni,55322,50.0,-19,39.534599,-7.088183,0,7,3,50,2000
Dar es Salaam,Buyuni,49097,50.0,-52,39.521571,-7.128883,0,7,3,50,2000
...,...,...,...,...,...,...,...,...,...,...,...
Tanga,Tingeni,43504,0.0,0,37.294360,-5.698781,0,4,7,1534,1989
Tanga,Tuliani,39780,2400.0,0,74.947131,-10.744890,0,8,14,381,3984
Tanga,Ungulodi,53474,0.0,0,37.151898,-6.019752,0,5,1,800,2010
Tanga,Vulala,71047,0.0,0,37.639017,-5.785670,0,4,7,1009,1992
