In [1]:
# Importing libaries that I will be using

from category_encoders import OneHotEncoder, OrdinalEncoder
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # k-fold CV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # Hyperparameter tuning
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Data wrangle fuction
def wrangle(fm_path, tv_path=None):
    if tv_path:
        df = pd.merge(pd.read_csv(fm_path, 
                              # Turn these values into NaN    
                              na_values=[0, -2.000000e-08],
                              # Turn this column into datetime
                              parse_dates=['date_recorded']),
                  # Turn the 'id' column into the dataframe index
                  pd.read_csv(tv_path)).set_index('id')
    else:
        df = pd.read_csv(fm_path, 
                     na_values=[0, -2.000000e-08],
                     parse_dates=['date_recorded'],
                     index_col='id')

    # Drop constant columns
    df.drop(columns=['recorded_by'], inplace=True)

    # Create age feature
    df['pump_age'] = df['date_recorded'].dt.year - df['construction_year']
    df.drop(columns='date_recorded', inplace=True)

    # Drop HCCCs
    cutoff = 100
    drop_cols = [col for col in df.select_dtypes('object').columns
              if df[col].nunique() > cutoff]
    df.drop(columns=drop_cols, inplace=True)

    # Drop duplicate columns
    dupe_cols = [col for col in df.head(100).T.duplicated().index
               if df.head(100).T.duplicated()[col]]
    df.drop(columns=dupe_cols, inplace=True)      
    


    return df

In [3]:
# Importing the datasets and using the wrangle function on them
df = wrangle(fm_path='train_features.csv',
             tv_path='train_labels.csv')

X_test = wrangle(fm_path='test_features.csv')

In [4]:
# Splitting into my X matrix and y vector
# We want to predict what water pumps need to be repaired
target = 'status_group'
y = df[target]
X = df.drop(columns = target)

In [5]:
# Creating training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=.2, random_state =42)

In [6]:
# Establishing a baseline
#Checking to see what the current y counts look like.
y.value_counts()

# Turning the y counts into floats(percents)
y.value_counts(normalize=True)

# The baseline will always be the majority in categorical
baseline = y.value_counts(normalize=True).max()

print('Baseline:', baseline)

Baseline: 0.5429828068772491


In [7]:
### FIRST MODEL: Logistic Regression
### TUNING IS FURTHER DOWN

model_lr = make_pipeline(
            OneHotEncoder(use_cat_names=True),
            SimpleImputer(strategy='mean'),
            StandardScaler(),
            LogisticRegression()
            )
model_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['basin', 'region', 'public_meeting',
                                     'scheme_management', 'permit',
                                     'extraction_type', 'extraction_type_group',
                                     'extraction_type_class', 'management',
                                     'management_group', 'payment',
                                     'payment_type', 'water_quality',
                                     'quality_group', 'quantity', 'source',
                                     'source_type', 'source_class',
                                     'waterpoint_type',
                                     'waterpoint_type_group'],
                               use_cat_names=True)),
                ('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [8]:
### SECOND MODEL: Decision tree classifier
### TUNING IS FURTHER DOWN

model_dt = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(strategy='mean'),
            DecisionTreeClassifier(random_state=42, max_depth=16)
            )
model_dt.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['basin', 'region', 'public_meeting',
                                      'scheme_management', 'permit',
                                      'extraction_type',
                                      'extraction_type_group',
                                      'extraction_type_class', 'management',
                                      'management_group', 'payment',
                                      'payment_type', 'water_quality',
                                      'quality_group', 'quantity', 'source',
                                      'source_type', 'source_class',
                                      'waterpoint_type',
                                      'waterpoin...
communal standpipe             2
communal standpipe multiple    3
improved spring                4
other                          5
cattle trough                  6
dam                            7
NaN                           -

In [9]:
### THIRD MODEL: RandomForest
### TUNING IS FURTHER DOWN

model_rf = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(strategy='mean'),
            RandomForestClassifier(random_state=42,n_estimators=100,n_jobs=-1, max_depth=20)
            )
model_rf.fit(X_train, y_train)


print('Random Forest training accuracy:', model_rf.score(X_train, y_train))
print('Random Forest validation accuracy:', model_rf.score(X_val, y_val))
print()

Random Forest training accuracy: 0.9488885966066026
Random Forest validation accuracy: 0.8002946127946128



In [10]:
### TUNING RANDOMFOREST
clf = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier(n_estimators=25, random_state=42)
)

param_grid = {
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestclassifier__max_depth': range(5,40,5),
    'randomforestclassifier__n_estimators': range(25, 125, 25)
}

model_rfgs = GridSearchCV(
    clf,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=1
)

model_rfgs.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


KeyboardInterrupt: 

In [None]:
# Checking the best parameters
model_rfgs.best_params_

In [None]:
# Checking the best score

In [None]:
model_rfgs.best_score_

In [None]:
### Check Metrics

#Logistic Regresion Metrics

print('Logistic Regresion training accuracy:', model_lr.score(X_train, y_train))
print('Logistic Regresion validation accuracy:', model_lr.score(X_val, y_val))
print()
#Logistic Decision Tree

print('Decision Tree training accuracy:', model_dt.score(X_train, y_train))
print('Decision Tree validation accuracy:', model_dt.score(X_val, y_val))
print()

#LogisticRandom Forest

print('Random Forest training accuracy:', model_rf.score(X_train, y_train))
print('Random Forest validation accuracy:', model_rf.score(X_val, y_val))
print()

In [None]:
# DECICION TREE HYPERPARAMETER TUNING
depths = range(2,38,2)

train_acc = []
val_acc = []

for depth in depths:
  tree_model = make_pipeline(
      OrdinalEncoder(),
      SimpleImputer(strategy='mean'),
      DecisionTreeClassifier(max_depth=depth, random_state=42)
  )

  tree_model.fit(X_train, y_train)

  train_acc.append(tree_model.score(X_train, y_train))
  val_acc.append(tree_model.score(X_val, y_val))

In [None]:
# Creating a plot for the decision tree
plt.plot(depths, train_acc, color='blue', label='training')
plt.plot(depths, val_acc, color='orange', label='validation')

plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend();

In [None]:
# RANDOM HYPERPARAMETER TUNING
depths = range(2,38,2)

train_acc = []
val_acc = []

for depth in depths:
  tree_model = make_pipeline(
      OrdinalEncoder(),
      SimpleImputer(strategy='mean'),
      RandomForestClassifier(max_depth=depth, random_state=42,n_estimators=100, n_jobs=-1)
  )

  tree_model.fit(X_train, y_train)

  train_acc.append(tree_model.score(X_train, y_train))
  val_acc.append(tree_model.score(X_val, y_val))

In [None]:
# Creating a plot for the random forest
plt.plot(depths, train_acc, color='blue', label='training')
plt.plot(depths, val_acc, color='orange', label='validation')

plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend();

In [None]:
### Explain the decisiontree

coefficients = model_dt.named_steps['decisiontreeclassifier']
features = model_dt.named_steps['ordinalencoder'].get_feature_names()
importances = model_dt.named_steps['decisiontreeclassifier'].feature_importances_

feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp.tail(10).plot(kind='barh')
plt.title('Decision Tree Coefficients');

In [None]:
### Explain the decisiontree

bestimator = model_rfrs.best_estimator_
importances = bestimator.named_steps['randomforestclassifier'].feature_importances_
features = X_train.columns
feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp.tail(10).plot(kind='barh')
plt.xlabel('Reduction in Gini Impurity')

In [None]:
# PREPARE FOR KAGGLE SUBMISSION

submission = pd.DataFrame()
submission['id'] = X_test.index
submission['status_group'] = model_rf.predict(X_test)

submission.to_csv('submission2.csv', index=False)

In [None]:
y_pred = model_rfgs.predict(X_test)
submission = pd.DataFrame({'status_group':y_pred}, index=X_test.index)
datestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H%M_')
submission.to_csv(f'{datestamp}submission.csv')