In [1]:
%load_ext autoreload
%autoreload 2

In [34]:
#import
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report, recall_score

In [35]:
# Put the directory two levels above the current working directory on the
# import path so that the project's `src` package can be imported.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [36]:
#import customized functions
from src.data_cleaning import cleaning_functions as cfs
from src.data_cleaning import exploration_functions as efs
from src.data_cleaning import processing_functions as pfs

In [37]:
# Load the train/test split and the classes dictionary from the project's
# processing helpers (what each object contains is defined in src.data_cleaning).
X_train, X_test, y_train, y_test, classes_dict = pfs.processed_dataset()

In [38]:
# One-hot encode the categorical features of the training set. The helper also
# returns a dict of per-feature encoders so the same mapping can be applied to
# the test set.
# NOTE(review): X_train is overwritten here, so this cell is presumably not
# safe to run twice without re-running the data-loading cell first - confirm.
X_train, encoders = pfs.ohe_all_categorical_features(X_train)

In [39]:
# Inspect the encoders dict: one encoder object per encoded feature name.
encoders

{'basin': OneHotEncoder(handle_unknown='ignore'),
 'region': OneHotEncoder(handle_unknown='ignore'),
 'lga': OneHotEncoder(handle_unknown='ignore'),
 'public_meeting': OneHotEncoder(handle_unknown='ignore'),
 'scheme_management': OneHotEncoder(handle_unknown='ignore'),
 'permit': OneHotEncoder(handle_unknown='ignore'),
 'extraction_type': OneHotEncoder(handle_unknown='ignore'),
 'management': OneHotEncoder(handle_unknown='ignore'),
 'payment': OneHotEncoder(handle_unknown='ignore'),
 'water_quality': OneHotEncoder(handle_unknown='ignore'),
 'quantity': OneHotEncoder(handle_unknown='ignore'),
 'source': OneHotEncoder(handle_unknown='ignore'),
 'waterpoint_type': OneHotEncoder(handle_unknown='ignore'),
 'district_code': OneHotEncoder(handle_unknown='ignore')}

In [40]:
# Preview the test features before any one-hot encoding is applied.
X_test

Unnamed: 0,gps_height,longitude,latitude,basin,region,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type
9305,1368.000000,37.671011,-3.704747,Pangani,Kilimanjaro,2,Mwanga,80.000000,True,WUA,False,1980,gravity,wua,never pay,unknown,insufficient,spring,communal standpipe
10929,264.000000,39.041461,-10.909507,Ruvuma / Southern Coast,Mtwara,33,Masasi,80.000000,True,VWC,True,2005,gravity,vwc,pay per bucket,soft,enough,spring,communal standpipe
9302,1748.000000,34.727257,-9.057131,Rufiji,Iringa,4,Njombe,120.000000,True,VWC,True,2009,gravity,vwc,pay when scheme fails,soft,enough,spring,communal standpipe
30964,1030.647677,32.447189,-2.420560,Lake Victoria,Mwanza,5,Sengerema,281.600417,True,VWC,False,2000,swn 80,vwc,unknown,soft,dry,shallow well,hand pump
6293,563.000000,36.932211,-7.201306,Wami / Ruvu,Morogoro,1,Kilosa,101.000000,True,VWC,True,2008,swn 80,vwc,pay when scheme fails,salty,insufficient,shallow well,hand pump
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11416,994.585006,33.218035,-4.024680,Internal,Tabora,1,Nzega,360.858364,True,VWC,True,2000,submersible,vwc,pay per bucket,salty,enough,machine dbh,communal standpipe multiple
28124,-22.000000,39.257906,-8.498074,Ruvuma / Southern Coast,Lindi,13,Kilwa,303.000000,False,unknown,True,2002,submersible,private operator,pay per bucket,salty,enough,machine dbh,communal standpipe
49740,1151.643327,33.563301,-2.919703,Lake Victoria,Shinyanga,2,Maswa,405.484383,True,WUG,False,2000,swn 80,wug,never pay,soft,enough,shallow well,hand pump
21872,1238.413084,33.828594,-9.175722,Lake Nyasa,Mbeya,4,Rungwe,266.138968,True,VWC,unknown,2000,gravity,vwc,never pay,soft,enough,river,communal standpipe


In [45]:
def one_hot_encode_test_features(df, name, ohe):
    """One-hot encode a single feature of the test dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Test dataframe that contains the column ``name``.
    name : str
        Name of the column to encode.
    ohe : fitted encoder
        Object exposing ``transform`` and ``categories_`` (for example a
        fitted ``sklearn.preprocessing.OneHotEncoder``).

    Returns
    -------
    pandas.DataFrame
        A new dataframe in which ``name`` has been dropped and one indicator
        column per entry of ``ohe.categories_[0]`` has been appended. The
        input ``df`` is not modified.
    """
    encoded = ohe.transform(df[[name]])
    # A sparse encoder returns a matrix that must be densified; an encoder
    # configured for dense output already returns an ndarray, which has no
    # ``toarray`` method.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # NOTE(review): columns are labelled with the raw category values, so a
    # value shared by several features (e.g. 'unknown') yields duplicate
    # column labels once more than one feature has been encoded. Kept as-is
    # because the labels presumably must match the training-side helper.
    ohe_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=df.index)

    return pd.concat([df.drop(name, axis=1), ohe_df], axis=1)

In [30]:
# Check the column dtypes of the test features (object columns are the
# candidates for one-hot encoding; district_code is stored as an integer).
X_test.dtypes

gps_height           float64
longitude            float64
latitude             float64
basin                 object
region                object
district_code          int64
lga                   object
population           float64
public_meeting        object
scheme_management     object
permit                object
construction_year      int64
extraction_type       object
management            object
payment               object
water_quality         object
quantity              object
source                object
waterpoint_type       object
dtype: object

In [34]:
#X_test = one_hot_encode_test_features(X_test, 'basin', encoders['basin'])

In [42]:
def ohe_all_X_test_features(X_test, encoders):
    """One-hot encode every feature of the test set that has an encoder.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features. Must contain a ``district_code`` column and one column
        for every key of ``encoders``.
    encoders : dict
        Maps a feature name to the encoder used for that feature.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with each encoded feature replaced by its indicator
        columns. The dataframe passed in is left unchanged.
    """
    # ``assign`` returns a copy, so the caller's dataframe is not mutated.
    # district_code is cast to str before encoding - presumably because the
    # encoder was fitted on string codes; verify against the training helper.
    encoded = X_test.assign(district_code=X_test['district_code'].astype(str))

    for name, ohe in encoders.items():
        encoded = one_hot_encode_test_features(encoded, name, ohe)
    return encoded

In [43]:
# Apply the per-feature encoders to the test set.
# NOTE(review): X_test is overwritten with its encoded form, so re-running this
# cell will presumably fail because the encoded columns have already been
# dropped - confirm, or store the result under a new name.
X_test = ohe_all_X_test_features(X_test, encoders)

In [44]:
# Preview the test features after one-hot encoding.
X_test

Unnamed: 0,gps_height,longitude,latitude,population,construction_year,Internal,Lake Nyasa,Lake Rukwa,Lake Tanganyika,Lake Victoria,...,5,53,6,60,62,63,67,7,8,80
9305,1368.000000,37.671011,-3.704747,80.000000,1980,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10929,264.000000,39.041461,-10.909507,80.000000,2005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9302,1748.000000,34.727257,-9.057131,120.000000,2009,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30964,1030.647677,32.447189,-2.420560,281.600417,2000,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6293,563.000000,36.932211,-7.201306,101.000000,2008,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11416,994.585006,33.218035,-4.024680,360.858364,2000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28124,-22.000000,39.257906,-8.498074,303.000000,2002,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49740,1151.643327,33.563301,-2.919703,405.484383,2000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21872,1238.413084,33.828594,-9.175722,266.138968,2000,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# NOTE(review): the saved output of this cell shows raw categorical columns
# (basin, region, lga, ...) next to encoded indicator columns, and the
# execution counts are out of order - presumably this output comes from a
# different kernel state; confirm with Restart & Run All.
X_test

Unnamed: 0,gps_height,longitude,latitude,basin,region,lga,population,public_meeting,scheme_management,permit,...,5,53,6,60,62,63,67,7,8,80
9305,1368.000000,37.671011,-3.704747,Pangani,Kilimanjaro,Mwanga,80.000000,True,WUA,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10929,264.000000,39.041461,-10.909507,Ruvuma / Southern Coast,Mtwara,Masasi,80.000000,True,VWC,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9302,1748.000000,34.727257,-9.057131,Rufiji,Iringa,Njombe,120.000000,True,VWC,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30964,1030.647677,32.447189,-2.420560,Lake Victoria,Mwanza,Sengerema,281.600417,True,VWC,False,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6293,563.000000,36.932211,-7.201306,Wami / Ruvu,Morogoro,Kilosa,101.000000,True,VWC,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11416,994.585006,33.218035,-4.024680,Internal,Tabora,Nzega,360.858364,True,VWC,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28124,-22.000000,39.257906,-8.498074,Ruvuma / Southern Coast,Lindi,Kilwa,303.000000,False,unknown,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49740,1151.643327,33.563301,-2.919703,Lake Victoria,Shinyanga,Maswa,405.484383,True,WUG,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21872,1238.413084,33.828594,-9.175722,Lake Nyasa,Mbeya,Rungwe,266.138968,True,VWC,unknown,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Show the 50 most frequent values of the installer column.
# NOTE(review): `df` is not defined in any cell visible here - presumably left
# over from an earlier kernel session; confirm where it should be loaded.
df['installer'].value_counts()[:50]

DWE                           13077
unknown                        2703
Government                     1370
RWE                             917
Commu                           801
DANIDA                          798
KKKT                            671
Hesawa                          658
0                               579
TCRS                            538
Central government              487
CES                             456
DANID                           433
HESAWA                          423
Community                       402
District Council                402
World vision                    318
LGA                             313
WEDECO                          304
TASAF                           297
District council                292
Gover                           291
AMREF                           260
TWESA                           239
WU                              235
Dmdd                            209
ACRA                            208
World Vision                

In [21]:
# Show the frequency of every installer value.
# NOTE(review): `df` is not defined in any cell visible here - presumably left
# over from an earlier kernel session; confirm where it should be loaded.
df.installer.value_counts()

DWE                 13077
unknown              2703
Government           1370
RWE                   917
Commu                 801
                    ...  
DANIDS                  1
KOYI                    1
Mdala Contractor        1
O                       1
Seleman Masoud          1
Name: installer, Length: 1867, dtype: int64

AttributeError: 'DataFrame' object has no attribute 'profile_report'