In this notebook I'm experimenting with automating the preparation of `X_test`: the test set is transformed by applying the `OneHotEncoder` objects that were fitted earlier on the training data, so that the encoders are only ever fit on the training set.

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
#import
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report, recall_score

In [3]:
# Make the directory two levels above the working directory importable;
# presumably this is the repository root that holds the `src` package.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
#import customized functions
from src.data_cleaning import cleaning_functions as cfs
from src.data_cleaning import exploration_functions as efs
from src.data_cleaning import processing_functions as pfs

In [5]:
# Load the train/test split from the project's processing module.
# `classes_dict` is presumably the target-label mapping -- TODO confirm in
# src/data_cleaning/processing_functions.
X_train, X_test, y_train, y_test, classes_dict = pfs.processed_dataset()

In [6]:
# One-hot encode the categorical features of the training set. `encoders`
# maps each encoded column name to its OneHotEncoder so the same encoders
# can be applied to X_test without re-fitting.
X_train, encoders = pfs.ohe_all_categorical_features(X_train)

In [7]:
encoders

{'basin': OneHotEncoder(handle_unknown='ignore'),
 'region': OneHotEncoder(handle_unknown='ignore'),
 'lga': OneHotEncoder(handle_unknown='ignore'),
 'public_meeting': OneHotEncoder(handle_unknown='ignore'),
 'scheme_management': OneHotEncoder(handle_unknown='ignore'),
 'permit': OneHotEncoder(handle_unknown='ignore'),
 'extraction_type': OneHotEncoder(handle_unknown='ignore'),
 'management': OneHotEncoder(handle_unknown='ignore'),
 'payment': OneHotEncoder(handle_unknown='ignore'),
 'water_quality': OneHotEncoder(handle_unknown='ignore'),
 'quantity': OneHotEncoder(handle_unknown='ignore'),
 'source': OneHotEncoder(handle_unknown='ignore'),
 'waterpoint_type': OneHotEncoder(handle_unknown='ignore'),
 'district_code': OneHotEncoder(handle_unknown='ignore')}

In [8]:
X_test.head(5)

Unnamed: 0,longitude,latitude,basin,region,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,gps_height_binned
9305,37.671011,-3.704747,Pangani,Kilimanjaro,2,Mwanga,80.0,True,WUA,False,1980,gravity,wua,never pay,unknown,insufficient,spring,communal standpipe,above surface upto 1750
10929,39.041461,-10.909507,Ruvuma / Southern Coast,Mtwara,33,Masasi,80.0,True,VWC,True,2005,gravity,vwc,pay per bucket,soft,enough,spring,communal standpipe,above surface upto 750
9302,34.727257,-9.057131,Rufiji,Iringa,4,Njombe,120.0,True,VWC,True,2009,gravity,vwc,pay when scheme fails,soft,enough,spring,communal standpipe,above surface upto 1750
30964,32.447189,-2.42056,Lake Victoria,Mwanza,5,Sengerema,281.600417,True,VWC,False,2000,swn 80,vwc,unknown,soft,dry,shallow well,hand pump,above surface up to 1250
6293,36.932211,-7.201306,Wami / Ruvu,Morogoro,1,Kilosa,101.0,True,VWC,True,2008,swn 80,vwc,pay when scheme fails,salty,insufficient,shallow well,hand pump,above surface upto 750


In [9]:
#test on one feature first; if it works correctly we can throw it in a for loop
#X_test = one_hot_encode_test_features(X_test, 'basin', encoders['basin'])

In [10]:
def one_hot_encode_test_features(df, name, ohe):
    """One-hot encode a single feature of the test set with a fitted encoder.

    The encoder is only used to ``transform``; it is never re-fit here.

    Parameters
    ----------
    df : pandas.DataFrame
        Test dataframe containing the column ``name``. It is not modified.
    name : str
        Name of the column to encode.
    ohe : fitted encoder
        Object exposing ``transform`` and ``categories_`` (e.g. a fitted
        ``sklearn.preprocessing.OneHotEncoder``). Its output may be a
        sparse matrix or a dense array.

    Returns
    -------
    pandas.DataFrame
        New dataframe: ``df`` without ``name``, followed by one indicator
        column per entry of ``ohe.categories_[0]``, sharing ``df``'s index.

    Notes
    -----
    Indicator columns are labelled with the raw category values, so two
    features that share a category value produce identically named columns.
    """
    encoded = ohe.transform(df[[name]])

    # Sparse results expose ``toarray``; dense results do not and would
    # previously raise AttributeError here.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # np.asarray strips any labels the encoder attached to its output so the
    # column names below always come from ``categories_``.
    ohe_df = pd.DataFrame(np.asarray(encoded),
                          columns=ohe.categories_[0],
                          index=df.index)

    return pd.concat([df.drop(name, axis=1), ohe_df], axis=1)

In [11]:
X_test.dtypes

longitude             float64
latitude              float64
basin                  object
region                 object
district_code           int64
lga                    object
population            float64
public_meeting         object
scheme_management      object
permit                 object
construction_year       int64
extraction_type        object
management             object
payment                object
water_quality          object
quantity               object
source                 object
waterpoint_type        object
gps_height_binned    category
dtype: object

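Note that `district_code` has dtype int64. It is a categorical code rather than a quantity, so it has to be cast to strings (object dtype) and one-hot encoded like the other categorical features.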

In [12]:
def ohe_all_X_test_features(X_test, encoders):
    """One-hot encode every feature of the test set that has an encoder.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test dataframe. It is not modified; the work is done on a copy.
    encoders : dict
        Maps a column name to the encoder to apply to that column. Each
        entry is passed to ``one_hot_encode_test_features``.

    Returns
    -------
    pandas.DataFrame
        New dataframe in which every column named in ``encoders`` has been
        replaced by its indicator columns.
    """
    # Work on a copy so the caller's frame is not left with only the
    # 'district_code' cast applied.
    encoded = X_test.copy()

    # 'district_code' is stored as an integer but treated as a category,
    # so it is cast to strings before encoding.
    if 'district_code' in encoded.columns:
        encoded['district_code'] = encoded['district_code'].astype(str)

    for key in encoders:
        encoded = one_hot_encode_test_features(encoded, key, encoders[key])
    return encoded

In [13]:
# Apply the encoders from the training set to the test set.
# NOTE: this cell overwrites X_test and cannot be re-run as is: the encoded
# columns are dropped, so a second run raises KeyError when selecting them.
X_test = ohe_all_X_test_features(X_test, encoders)

In [14]:
X_test

Unnamed: 0,longitude,latitude,population,construction_year,gps_height_binned,Internal,Lake Nyasa,Lake Rukwa,Lake Tanganyika,Lake Victoria,...,5,53,6,60,62,63,67,7,8,80
9305,37.671011,-3.704747,80.000000,1980,above surface upto 1750,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10929,39.041461,-10.909507,80.000000,2005,above surface upto 750,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9302,34.727257,-9.057131,120.000000,2009,above surface upto 1750,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30964,32.447189,-2.420560,281.600417,2000,above surface up to 1250,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6293,36.932211,-7.201306,101.000000,2008,above surface upto 750,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11416,33.218035,-4.024680,360.858364,2000,above surface up to 1250,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28124,39.257906,-8.498074,303.000000,2002,below surface level,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49740,33.563301,-2.919703,405.484383,2000,above surface up to 1250,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21872,33.828594,-9.175722,266.138968,2000,above surface up to 1250,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


That worked! Exporting these functions to a .py file.

One more edit for our dataset (hopefully the last one now, can't keep doing this forever!), decided to bin `gps_height`!

In [15]:
def bin_gps_height(df):
    """Replace the 'gps_height' column with a categorical
    'gps_height_binned' column.

    The dataframe is modified in place and is also returned. Intervals are
    right-closed (the ``pd.cut`` default), so heights outside (-150, 3000]
    end up as NaN.
    """
    height_edges = [-150, -1, 1, 750, 1250, 1750, 3000]
    height_labels = [
        'below surface level',
        'at surface level',
        'above surface upto 750',
        'above surface up to 1250',
        'above surface upto 1750',
        'above 1750',
    ]

    # add the binned column at the end of the frame
    df['gps_height_binned'] = pd.cut(df['gps_height'], bins=height_edges, labels=height_labels)

    # the raw heights are no longer needed
    df.drop(columns=["gps_height"], inplace=True)

    return df

In [16]:
def bin_gps_for_both_sets(df1, df2):
    """Bin the 'gps_height' column of the train and test sets.

    Both frames go through ``bin_gps_height`` and their
    'gps_height_binned' column is then cast to object dtype.

    Returns the two processed dataframes in the order they were given.
    """

    # bin both sets first, using the helper function
    binned_sets = [bin_gps_height(frame) for frame in (df1, df2)]

    # then cast the new column to object on each set
    for frame in binned_sets:
        frame['gps_height_binned'] = frame['gps_height_binned'].astype('object')

    first, second = binned_sets
    return first, second