In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_squared_error, classification_report, f1_score, accuracy_score, roc_curve, roc_auc_score, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_features = 'data/df_X.csv'
df_independent = 'data/df_y.csv'

df_X = pd.read_csv(df_features)
df_y = pd.read_csv(df_independent)

In [3]:
print(df_X.shape)
print(df_y.shape)

(55083, 27)
(55083, 2)


## Label Encoder

Converting the independent variable to numeric for modeling

In [4]:
from sklearn.preprocessing import LabelEncoder

# BMG Encoder
def bgm_encoder(element):
    if element == 'functional':
        return 0
    elif element == 'non_functional':
        return 1
    else:
        return 2

df_y['status_group'] = df_y['status_group'].apply(bgm_encoder)
df_y['status_group'].value_counts()
# 0 = functional
# 1 = non_functional

0    32259
1    22824
Name: status_group, dtype: int64

## One-Hot Encoding

Now that there are no missing values, convert all of the categorical features into numbers.

In [5]:
encode_columns = ['funder', 'installer','management','management_group','extraction_type_group','extraction_type_class']
df_X_categorical = df_X[encode_columns]

#### The following code was borrowed from our Seattle class lecture.  I tried to rewrite it, but it'sfunctional and efficient.  I'm going to rewrite this and combine with my other encoding functions as a class.

In [6]:
def encode_and_concat_feature_train(df_X, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = df_X[[feature_name]]
    ohe.fit(single_feature_df)
    
    # call helper function that actually encodes the feature and concats it
    df_X = encode_and_concat_feature(df_X, feature_name, ohe)
    
    return ohe, df_X

In [7]:
def encode_and_concat_feature(df_X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = df_X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0], index=df_X.index)
    
    # drop the old feature from X and concat the new one-hot encoded df
    df_X = df_X.drop(feature_name, axis=1)
    df_X = pd.concat([df_X, ohe_df], axis=1)
    
    return df_X

In [8]:
encoders = {}

for categorical_feature in encode_columns:
    ohe, df_X = encode_and_concat_feature_train(df_X, categorical_feature)
    encoders[categorical_feature] = ohe

In [9]:
df_X_nonnumeric = df_X.select_dtypes('object')
print(df_X_nonnumeric.columns)
df_X_numeric = df_X.select_dtypes(exclude='object')

X = df_X_numeric
y = df_y

Index(['wpt_name', 'basin', 'region', 'lga', 'ward', 'extraction_type',
       'water_quality', 'quantity', 'source', 'waterpoint_type'],
      dtype='object')


## Train/Test Spkit

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_X_numeric, y, test_size=.2, random_state=42)
print('X_train: {}'.format(len(X_train)))
print('y_train: {}'.format(len(y_train)))
print('X_test: {}'.format(len(X_test)))
print('y_test: {}'.format(len(y_test)))

X_train: 44066
y_train: 44066
X_test: 11017
y_test: 11017


## kNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [13]:
preds_knn = knn.predict(X_train)

In [18]:
knn_train_class_report = classification_report(y_train['status_group'], preds_knn[:, 1])
print(knn_train_class_report)

              precision    recall  f1-score   support

           0       0.74      0.80      0.77     25700
           1       0.68      0.60      0.64     18366

    accuracy                           0.72     44066
   macro avg       0.71      0.70      0.70     44066
weighted avg       0.71      0.72      0.71     44066



In [19]:
preds_knn_test = knn.predict(X_test)

In [21]:
knn_test_class_report = classification_report(y_test['status_group'], preds_knn_test[:, 1])
print(knn_test_class_report)

              precision    recall  f1-score   support

           0       0.62      0.66      0.64      6559
           1       0.44      0.40      0.42      4458

    accuracy                           0.55     11017
   macro avg       0.53      0.53      0.53     11017
weighted avg       0.55      0.55      0.55     11017



# fit a baseline Decision Tree Model

In [22]:
%%time
dtc = DecisionTreeClassifier(max_depth=10)
dtc.fit(X_train, y_train)

CPU times: user 2min 27s, sys: 1.59 s, total: 2min 29s
Wall time: 2min 30s


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [23]:
%%time
preds_dtc_train = dtc.predict(X_train)

CPU times: user 34.3 s, sys: 44.4 s, total: 1min 18s
Wall time: 1min 30s


In [24]:
dtc_train_class_report = classification_report(y_train['status_group'], preds_dtc_train[:, 1])
print(dtc_train_class_report)

              precision    recall  f1-score   support

           0       0.76      0.89      0.82     25700
           1       0.80      0.62      0.70     18366

    accuracy                           0.78     44066
   macro avg       0.78      0.75      0.76     44066
weighted avg       0.78      0.78      0.77     44066



In [25]:
preds_dtc_test = dtc.predict(X_test)

In [26]:
dtc_test_class_report = classification_report(y_test['status_group'], preds_dtc_test[:, 1])
print(dtc_test_class_report)

              precision    recall  f1-score   support

           0       0.76      0.87      0.81      6559
           1       0.76      0.59      0.66      4458

    accuracy                           0.76     11017
   macro avg       0.76      0.73      0.74     11017
weighted avg       0.76      0.76      0.75     11017



# fit a baseline Random Forest Model

In [None]:
%%time
rfc_baseline = RandomForestClassifier(max_depth=30, n_estimators=100)
rfc_baseline.fit(X_train, y_train)

In [None]:
%%time
preds_rfc_baseline_train = rfc_baseline.predict(X_train)

In [None]:
rfc_baseline_train_class_report = classification_report(y_train, preds_rfc_baseline_train)
print(rfc_baseline_train_class_report)

In [None]:
%%time
preds_rfc_baseline_test = rfc_baseline.predict(X_test)

In [None]:
rfc_baseline_test_class_report = classification_report(y_test, preds_rfc_baseline_test)
print(rfc_baseline_test_class_report)

In [None]:
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
param_grid = {'max_depth':[50],
             'n_estimators':[50],
             }
cv_rfc = GridSearchCV(rfc, param_grid, cv=3) 

In [None]:
cv_rfc.fit(X_train, y_train)

In [None]:
preds_rfc_test = cv_rfc.predict(X_test) # predictions

In [None]:
# Model Eval
print('Mean Squared Error:', mean_squared_error(y_test, preds_rfc_test))
print('Mean Absolute Error:', mean_absolute_error(y_test, preds_rfc_test))


In [None]:
print('R-squared:', r2_score(y_test, preds_rfc_test))

In [None]:
cv_rfc.best_params_

In [None]:
m_best = cv_rfc.best_estimator_
m_best

In [None]:
## Compare accuracy and feature importance of your
## Gridsearch model and your baseline RandomForest model 
y_hat = m_best.predict(X_test)

In [None]:
rfc_grid_test_class_report = classification_report(y_test, y_hat)
print(rfc_grid_test_class_report)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_hat)

In [None]:
print(cm)
tn = cm[0,0]
tp = cm[1,1]
fp = cm[0,1]
fn = cm[1,0]
sns.heatmap(cm, cmap='coolwarm', annot=True)
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.savefig('img/confusion.jpg')
plt.show()


In [None]:
accuracy = ((tp+tn)/(tp+tn+fp+fn))*100
print('Accuracy : {}'.format(accuracy))
recall = (tp/(fp+tp))*100
print('Recall : {}'.format(recall))

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [None]:
xgb.fit(X_train, y_train)

In [None]:
print(xgb)