In [None]:
import pandas as pd
import numpy as np

# from cleaning import bgm_encoder

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, f1_score, accuracy_score, roc_curve, roc_auc_score, mean_absolute_error, r2_score


import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.2f}'.format

# %load_ext autoreload  # python's autoreload function.  updating my module does not require reimporting in jupyter
# %autoreload 2

In [None]:
# NOTE: despite the df_ prefix, these two names hold CSV file paths (strings), not DataFrames.
df_trainset_values = 'data/trainset_values.csv'
df_trainset_labels = 'data/trainset_labels.csv'
#testset_values = 'data/testset_values.csv' --> ignoring this for now as this is for the competition

# Read the feature table (df_X) and the label table (df_y) from the paths above.
df_X = pd.read_csv(df_trainset_values)
df_y = pd.read_csv(df_trainset_labels)
#df_testset_values = pd.read_csv(testset_values)

In [None]:
# Show the (rows, columns) of the feature table and the label table, one per line.
print(df_X.shape, df_y.shape, sep='\n')

### Data Cleaning

In [None]:
# Preview the first rows of the feature table.
df_X.head()

In [None]:
# Number of missing values in each column of the feature table.
df_X.isnull().sum()

In [None]:
# Snapshot of the non-object-dtype columns of df_X as it stands at this point.
# This is a separate frame: columns removed from df_X in later cells are still present here.
df_X_numeric = df_X.select_dtypes(exclude='object') # .columns.sort_values().tolist()

In [None]:
# Snapshot of the object-dtype (text) columns of df_X as it stands at this point.
# Like df_X_numeric, it does not follow later column drops on df_X.
df_X_nonnumeric = df_X.select_dtypes('object') # .columns.sort_values().tolist()

In [None]:
# DataFrame.value_counts() counts distinct *row combinations* across all text columns
# (rows containing a missing value are excluded by default); it is not a per-column count.
# NOTE(review): if a per-column summary was intended, inspect one column at a time instead.
df_X_nonnumeric.value_counts()

In [None]:
# Every text (object-dtype) column currently in df_X is a candidate for removal.
drop_candidates = df_X.select_dtypes(include='object').columns
print(len(drop_candidates))
print(drop_candidates)

# Columns removed from the feature table in this step.
drop_columns = ['funder','extraction_type_group', 'extraction_type_class','payment', 'payment_type','quality_group','quantity_group','waterpoint_type_group','source_type','source_class','public_meeting', 'recorded_by','num_private','permit']
print(len(drop_columns))
print(drop_columns)

# Drop everything in one call and rebind df_X instead of mutating it in a loop.
# errors='ignore' keeps the cell re-runnable: the old per-column in-place drop
# raised KeyError on a second execution because the columns were already gone.
df_X = df_X.drop(columns=drop_columns, errors='ignore')

In [None]:
# List the text (object-dtype) columns that are in df_X when this cell runs.
drop_candidates = df_X.select_dtypes('object').columns
print(len(drop_candidates))
print(drop_candidates)

# Further columns that could be dropped if the model turns out to be too complicated.
# NOTE(review): 'extraction_type_group' and 'extraction_type_class' also appear in
# drop_columns, and this list is not referenced again in the visible part of the notebook.
drop_possibile = ['extraction_type_group', 'extraction_type_class','management_group']

In [None]:
# df_impute = df_X.select_dtypes('object').isnull().sum()

In [None]:
# Columns dropped rather than imputed.
drop_impute = ['installer','subvillage','scheme_management','scheme_name']

# Single drop that rebinds df_X; errors='ignore' makes the cell safe to re-run
# (the previous in-place loop raised KeyError once the columns had been removed).
df_X = df_X.drop(columns=drop_impute, errors='ignore')

In [None]:
# Frequency of each 'management' value, ordered alphabetically by value.
# Reads from the df_X_nonnumeric snapshot, not from the current df_X.
df_X_nonnumeric['management'].value_counts().sort_index()

In [None]:
# encoding y before TTS
from sklearn.preprocessing import LabelEncoder

In [None]:
# Turn the spaces inside each status label into underscores, then show the class balance.
df_y['status_group'] = df_y['status_group'].map(lambda label: label.replace(' ', '_'))
df_y['status_group'].value_counts()

In [None]:
%%time

# BGM Encoder
def bgm_encoder(element):
    """Map a pump status label to its integer class code.

    Encoding: 'functional' -> 0, 'functional_needs_repair' -> 1,
    'non_functional' -> 2.

    Parameters
    ----------
    element : str or int
        A status label (spaces or underscores between words are both
        accepted) or an already-encoded class code (0, 1 or 2).

    Returns
    -------
    int
        The class code for ``element``.

    Raises
    ------
    ValueError
        If ``element`` is neither a known label nor a valid class code.
    """
    codes = {'functional': 0, 'functional_needs_repair': 1, 'non_functional': 2}

    if isinstance(element, str):
        label = element.strip().replace(' ', '_')
        if label in codes:
            return codes[label]
    elif element in codes.values():
        # Already encoded: pass it through so that re-running the encoding cell
        # is harmless. The old catch-all `else` turned every 0 and 1 into 2.
        return int(element)

    # Fail loudly instead of silently filing unknown values under class 2.
    raise ValueError('Unknown status_group value: {!r}'.format(element))

# Swap every status label for its integer class code and show the class balance.
#   0 = functional
#   1 = functional_needs_repair
#   2 = non_functional
df_y['status_group'] = df_y['status_group'].map(bgm_encoder)
df_y['status_group'].value_counts()

In [None]:
X = df_X_numeric
y = df_y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_X_numeric, y, test_size=.2, random_state=42)
print('X_train: {}'.format(len(X_train)))
print('y_train: {}'.format(len(y_train)))
print('X_test: {}'.format(len(X_test)))
print('y_test: {}'.format(len(y_test)))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# print('X_train: {}'.format(len(X_train)))
# print('y_train: {}'.format(len(y_train)))
# print('X_test: {}'.format(len(X_test)))
# print('y_test: {}'.format(len(y_test)))

In [None]:
# fitting a numeric only model

dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
dtc.fit(X_train, y_train)

In [None]:
# Kernel restarting here
preds_dtc_train = dtc.predict(X_train)

In [None]:
preds_dtc_train[:,1]

In [None]:
#create a confusion matrix
dtc_train_confusion_matrix = confusion_matrix(preds_dtc_train[:,1], y_train['status_group'])


In [None]:
print(dtc_train_confusion_matrix)
tn = dtc_train_confusion_matrix[0,0]
tp = dtc_train_confusion_matrix[1,1]
fp = dtc_train_confusion_matrix[0,1]
fn = dtc_train_confusion_matrix[1,0]

sns.heatmap(dtc_train_confusion_matrix, cmap='coolwarm', annot=True)
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Precision
precision = tp/(tp+fp)
print('Precision: {}'.format(precision))

# How often the model's prediction of 'winner' was correct
recall = tp/ (fp+fn)
print('Recall: {}'.format(recall))

# F-1 score
f1 = (2 * precision * recall) / (precision + recall)
print('F-1 Score: {}'.format(f1))

In [None]:
# MVP
# Unconstrained decision tree, evaluated on the held-out split.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
# Test-set predictions get their own name: the evaluation cells read
# preds_dtc_test, and the training predictions are no longer overwritten.
preds_dtc_test = dtc.predict(X_test)

In [None]:
# Model Eval
# The target is a class code (0/1/2), so classification metrics are reported.
# Squared/absolute error and R-squared treat the codes as quantities and do not
# describe how often the predicted class is right.
print('Accuracy:', accuracy_score(y_test, preds_dtc_test))
# Macro averaging weights the three classes equally regardless of their size.
print('Macro F-1 Score:', f1_score(y_test, preds_dtc_test, average='macro'))

In [None]:
# Confusion Matrix
# Rows are the actual classes, columns the predicted classes.
cm_dtc = confusion_matrix(y_test, preds_dtc_test)
print(cm_dtc)

# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm_dtc, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Per-class precision, recall and F-1 for the three status classes
# (0 = functional, 1 = functional_needs_repair, 2 = non_functional).
# The previous hand calculation picked tn/tp/fp/fn from fixed cells, which only
# makes sense for a 2x2 matrix, and computed recall as tp / (fp + fn) instead
# of tp / (tp + fn).
print(classification_report(y_test, preds_dtc_test))

#### Evaluate metrics in this cell

In [None]:
# Possible scaling

In [None]:
# GridSearch for hyperparameter testing
rfc = RandomForestClassifier(random_state=42)
# min_samples_split must be an integer >= 2 (or a fraction in (0, 1]); the
# previous value 1 is rejected by scikit-learn, so every candidate using it failed.
param_grid = {'max_depth':[7,9,20],
             'n_estimators':[50,100,150],
             'min_samples_split':[2,5,10]}
# 27 candidates x 5 folds: n_jobs=-1 spreads the fits over all available cores.
cv_rfc = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)

In [None]:
# Tune on the training split only. Fitting on the full X / y would let the model
# see the held-out rows and make the test-set scores below optimistic.
cv_rfc.fit(X_train, y_train)

In [None]:
# No scaled copy of the test features is created anywhere in the notebook
# (X_test_sc was undefined), and the search was fitted on unscaled columns,
# so predict on X_test directly.
preds_rfc_test = cv_rfc.predict(X_test) # predictions

In [None]:
# Model Eval
# The target is a class code (0/1/2), so classification metrics are reported.
# Squared/absolute error and R-squared treat the codes as quantities and do not
# describe how often the predicted class is right.
print('Accuracy:', accuracy_score(y_test, preds_rfc_test))
# Macro averaging weights the three classes equally regardless of their size.
print('Macro F-1 Score:', f1_score(y_test, preds_rfc_test, average='macro'))

In [None]:
# Confusion Matrix
# Rows are the actual classes, columns the predicted classes.
cm_rfc = confusion_matrix(y_test, preds_rfc_test)
print(cm_rfc)

# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm_rfc, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Per-class precision, recall and F-1 for the three status classes
# (0 = functional, 1 = functional_needs_repair, 2 = non_functional).
# The previous hand calculation picked tn/tp/fp/fn from fixed cells, which only
# makes sense for a 2x2 matrix, and computed recall as tp / (fp + fn) instead
# of tp / (tp + fn).
print(classification_report(y_test, preds_rfc_test))