In [1]:
import pandas as pd
import numpy as np

# from cleaning import bgm_encoder

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, f1_score, accuracy_score, roc_curve, roc_auc_score, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.2f}'.format

# %load_ext autoreload  # IPython's autoreload extension: edits to an imported module are picked up without re-importing it in Jupyter
# %autoreload 2

In [2]:
# Locations of the training features and labels.
# Despite the df_ prefix, these two names hold path strings, not DataFrames.
df_trainset_values, df_trainset_labels = 'data/trainset_values.csv', 'data/trainset_labels.csv'
# testset_values = 'data/testset_values.csv' --> ignored for now; it is only needed for the competition

# Read both CSVs: df_X holds the feature values, df_y the labels.
df_X, df_y = (pd.read_csv(csv_path) for csv_path in (df_trainset_values, df_trainset_labels))
# df_testset_values = pd.read_csv(testset_values)

In [3]:
# (rows, columns) of the features frame, then of the labels frame, one per line.
print(df_X.shape, df_y.shape, sep='\n')

(59400, 40)
(59400, 2)


### Data Cleaning

In [4]:
# Count missing values per column of the raw features.
df_X.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [5]:
# Snapshot of the non-object (numeric) columns of df_X as it stands at this point in the notebook.
df_X_numeric = df_X.select_dtypes(exclude=['object']) # .columns.sort_values().tolist()

In [6]:
#df_X_nonnumeric = df_X.select_dtypes('object') # .columns.sort_values().tolist()

#df_X['management'].value_counts().sort_index()

In [7]:
# Columns removed from the feature frame before modelling.
drop_columns = ['payment', 'payment_type','quality_group','quantity_group','waterpoint_type_group','source_type','source_class','public_meeting', 'recorded_by','num_private','permit','scheme_management','scheme_name','subvillage']
print(len(drop_columns))
print(drop_columns)

# One drop call that returns a new frame instead of a per-column in-place loop.
# errors='ignore' keeps the cell re-runnable: columns that are already gone are skipped
# rather than raising KeyError on a second execution.
df_X = df_X.drop(columns=drop_columns, errors='ignore')

14
['payment', 'payment_type', 'quality_group', 'quantity_group', 'waterpoint_type_group', 'source_type', 'source_class', 'public_meeting', 'recorded_by', 'num_private', 'permit', 'scheme_management', 'scheme_name', 'subvillage']


In [20]:
# Object-dtype columns still present in df_X: how many there are, then their names.
drop_candidates = df_X.select_dtypes(include='object').columns
print(len(drop_candidates))
print(drop_candidates)

# Categorical columns selected for encoding.
encode_columns = [
    'funder',
    'installer',
    'management',
    'management_group',
    'extraction_type_group',
    'extraction_type_class',
]

16
Index(['funder', 'installer', 'wpt_name', 'basin', 'region', 'lga', 'ward',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'water_quality', 'quantity', 'source',
       'waterpoint_type'],
      dtype='object')


In [9]:
#df_X['extraction_type_class'].value_counts().sort_index()

## <mark style='background-color: blue'>Imputing and Encoding Categorical Features</mark>

In [10]:
def impute_cat_unknown(df, col):
    '''
    Replace missing values in the given columns with the string 'Unknown'.

    The frame is modified in place and is also returned.
    Input: df = dataframe, col = list of column names (a single column name is also accepted)
    Output: the same df with the listed columns imputed
    '''
    # A bare string would otherwise be iterated character by character.
    columns = [col] if isinstance(col, str) else col
    for name in columns:
        # Assign the filled column back instead of calling fillna(inplace=True) on df[name]:
        # the chained form works on a temporary Series and leaves df untouched when
        # pandas Copy-on-Write is active.
        df[name] = df[name].fillna('Unknown')
    return df
def impute_cat_common(df, col):
    '''
    Replace missing values in the given columns with each column's most frequent value.

    The frame is modified in place and is also returned. A column with no
    non-missing values has no "most common" value and is left unchanged.
    Input: df = dataframe, col = list of column names (a single column name is also accepted)
    Output: the same df with the listed columns imputed
    '''
    # A bare string would otherwise be iterated character by character.
    columns = [col] if isinstance(col, str) else col
    for name in columns:
        # value_counts() ignores NaN and sorts by descending frequency,
        # so the first index entry is the most common observed value.
        counts = df[name].value_counts()
        if counts.empty:
            continue
        # Fill the whole column and assign it back; the previous element-wise apply
        # called .fillna on scalars and discarded the result.
        df[name] = df[name].fillna(counts.index[0])
    return df

In [11]:
# Fill the gaps in the installer and funder columns with the 'Unknown' placeholder.
df_X = impute_cat_unknown(df=df_X, col=['installer', 'funder'])

In [12]:
# Re-check missing values per column after dropping and imputing.
df_X.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
water_quality            0
quantity                 0
source                   0
waterpoint_type          0
dtype: int64

In [13]:
# encoding y before TTS
from sklearn.preprocessing import LabelEncoder

In [14]:
# Normalise the label text by turning spaces into underscores.
# The vectorised .str accessor replaces the per-element lambda and leaves
# missing values as missing instead of raising on them.
df_y['status_group'] = df_y['status_group'].str.replace(' ', '_', regex=False)
df_y['status_group'].value_counts()

functional                 32259
non_functional             22824
functional_needs_repair     4317
Name: status_group, dtype: int64

In [15]:
%%time

# BGM Encoder
def bgm_encoder(element):
    '''
    Encode a status_group label as an integer class code.

    Mapping: 'functional' -> 0, 'functional_needs_repair' -> 1, 'non_functional' -> 2.
    Labels written with spaces instead of underscores are accepted as well.
    Values that are already 0, 1 or 2 are returned unchanged, so encoding an
    already-encoded column is harmless.

    Input: element = label string (or an already-encoded class code)
    Output: int class code
    Raises: ValueError for any other value, instead of silently labelling it as class 2.
    '''
    codes = {
        'functional': 0,
        'functional_needs_repair': 1,
        'non_functional': 2,
    }
    if isinstance(element, str):
        key = element.strip().replace(' ', '_')
        if key in codes:
            return codes[key]
    elif not isinstance(element, bool) and element in (0, 1, 2):
        # Already encoded (e.g. the cell was executed twice): pass the code through.
        return int(element)
    raise ValueError('Unrecognised status_group value: {!r}'.format(element))

# Swap each status label for its integer class code, then show the class balance.
df_y['status_group'] = df_y['status_group'].map(bgm_encoder)
df_y['status_group'].value_counts()
# Class codes:
#   0 = functional
#   1 = functional_needs_repair
#   2 = non_functional

CPU times: user 26.8 ms, sys: 1.9 ms, total: 28.7 ms
Wall time: 27.7 ms


0    32259
2    22824
1     4317
Name: status_group, dtype: int64

In [16]:
# Convert date_recorded to integer nanoseconds since the Unix epoch (the same number Timestamp.value gives).
# The whole column is parsed in one call instead of one pd.to_datetime call per row;
# the explicit datetime64[ns] cast pins the unit so the integers are always nanoseconds.
# NOTE(review): assumes the column has no missing dates (the null count above shows 0);
# NaT handling may differ from the per-row version — confirm if that ever changes.
df_X['date_recorded'] = (
    pd.to_datetime(df_X['date_recorded'])
    .astype('datetime64[ns]')
    .astype('int64')
)

In [17]:
# Object-dtype (non-numeric) columns of the cleaned feature frame.
df_X_nonnumeric = df_X.select_dtypes(include=['object'])

In [18]:
# Show the names of the remaining object-dtype (non-numeric) columns.
df_X_nonnumeric.columns

Index(['funder', 'installer', 'wpt_name', 'basin', 'region', 'lga', 'ward',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'water_quality', 'quantity', 'source',
       'waterpoint_type'],
      dtype='object')

In [19]:
# Display the list of columns chosen for encoding.
# NOTE(review): the saved output lists 'date_recorded', which the visible definition of
# encode_columns does not contain — presumably stale kernel state; confirm with Restart & Run All.
print(encode_columns)

['date_recorded', 'funder', 'installer', 'management', 'management_group', 'extraction_type_group', 'extraction_type_class']


In [None]:
X = df_X_numeric
y = df_y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_X_numeric, y, test_size=.2, random_state=42)
print('X_train: {}'.format(len(X_train)))
print('y_train: {}'.format(len(y_train)))
print('X_test: {}'.format(len(X_test)))
print('y_test: {}'.format(len(y_test)))

In [None]:
dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
dtc.fit(X_train, y_train)

In [None]:
preds_dtc_train = dtc.predict(X_train)

In [None]:
dtc_train_class_report = classification_report(y_train['status_group'], preds_dtc_train[:,1])
print(dtc_train_class_report)

In [None]:
preds_dtc_test = dtc.predict(X_test)

In [None]:
dtc_test_class_report = classification_report(y_test['status_group'], preds_dtc_test[:,1])
print(dtc_test_class_report)

In [None]:
# Fit a baseline Random Forest model.
# The hyperparameters live in a dict so they can be inspected or reused; each key appears once.
rf_params = {
    'max_depth': 10,
    'criterion': 'gini',
    'n_estimators': 200,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    # 'sqrt' is what the former 'auto' setting meant for classifiers;
    # 'auto' is no longer accepted by recent scikit-learn releases.
    'max_features': 'sqrt',
    # Same seed as the other models in this notebook, for reproducible fits.
    'random_state': 42,
}
# Unpack the dict into keyword arguments; passing it positionally would bind it to n_estimators.
rfc_baseline = RandomForestClassifier(**rf_params)
rfc_baseline.fit(X_train, y_train)

In [None]:
# MVP: unconstrained decision tree, evaluated on the held-out split.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
# Store the test-set predictions under the name the evaluation cell reads;
# previously they overwrote preds_dtc_train and the evaluation used stale predictions.
preds_dtc_test = dtc.predict(X_test)

In [None]:
# Model evaluation for the decision tree on the held-out split.
# NOTE(review): these are regression metrics computed on integer class codes;
# consider classification metrics (accuracy_score, f1_score) for this target.
print('Mean Squared Error:', mean_squared_error(y_true=y_test, y_pred=preds_dtc_test))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_test, y_pred=preds_dtc_test))
print('R-squared:', r2_score(y_true=y_test, y_pred=preds_dtc_test))

#### Evaluate metrics in this cell

In [None]:
# Possible scaling

In [None]:
# GridSearch for hyperparameter testing
rfc = RandomForestClassifier(random_state=42)
# min_samples_split must be an integer >= 2 (or a float fraction); a value of 1 is
# rejected by scikit-learn, so the grid starts at 2.
param_grid = {'max_depth': [7, 9, 20],
              'n_estimators': [50, 100, 150],
              'min_samples_split': [2, 5, 10]}
# 5-fold cross-validation over every combination in param_grid.
cv_rfc = GridSearchCV(rfc, param_grid, cv=5)

In [None]:
# Search on the training split only: fitting on the full X, y would let the model
# see the held-out rows that are used for evaluation below.
cv_rfc.fit(X_train, y_train)

In [None]:
# Predict on the held-out features. No scaled copy (X_test_sc) is ever created in this
# notebook, and the search was fitted on unscaled data, so use X_test directly.
preds_rfc_test = cv_rfc.predict(X_test) # predictions

In [None]:
# Model evaluation for the tuned random forest on the held-out split.
# NOTE(review): these are regression metrics computed on integer class codes;
# consider classification metrics (accuracy_score, f1_score) for this target.
print('Mean Squared Error:', mean_squared_error(y_true=y_test, y_pred=preds_rfc_test))
print('Mean Absolute Error:', mean_absolute_error(y_true=y_test, y_pred=preds_rfc_test))
print('R-squared:', r2_score(y_true=y_test, y_pred=preds_rfc_test))

In [None]:
# Confusion matrix (rows = actual classes, columns = predicted classes)
cm_rfc = confusion_matrix(y_test, preds_rfc_test)
print(cm_rfc)

# One-vs-rest counts for class 1. With more than two classes the off-diagonal cells of
# the other classes must be included; with exactly two classes these expressions reduce
# to the single cells cm[0,0], cm[1,1], cm[0,1] and cm[1,0].
tp = cm_rfc[1, 1]
fp = cm_rfc[:, 1].sum() - tp   # predicted class 1, actually another class
fn = cm_rfc[1, :].sum() - tp   # actually class 1, predicted as another class
tn = cm_rfc.sum() - tp - fp - fn

# fmt='d' prints the raw integer counts instead of rounded scientific notation.
sns.heatmap(cm_rfc, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Precision: of everything predicted as class 1, the share that really is class 1.
precision = tp/(tp+fp)
print('Precision: {}'.format(precision))

# Recall: of everything that really is class 1, the share the model found.
recall = tp/(tp+fn)
print('Recall: {}'.format(recall))

# F-1 score: harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)
print('F-1 Score: {}'.format(f1))