In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report
import numpy as np
from numba import jit, cuda 
from timeit import default_timer as timer

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training data from disk into `df`.
df = pd.read_csv('train_data.csv')

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Split the frame into the label series and the predictor matrix.
y = df['default payment next month']
X = df.drop(columns=['default payment next month'])

# Train/Test Split (TTS)

In [None]:
# train test split .82
# Pin the shuffle seed so the held-out rows, and therefore every metric
# reported from them, are identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# fit a Random Forest Model
# Baseline forest trained on every column currently in X_train.
# random_state is fixed so the fitted trees are reproducible run to run.
rfc_baseline = RandomForestClassifier(max_depth=50, max_features=10,
                                      n_estimators=200, random_state=42)
rfc_baseline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split with the baseline forest.
y_preds = rfc_baseline.predict(X_test)

In [None]:
# Summarise the baseline's predictions against the held-out labels.
print(classification_report(y_test, y_preds))

In [None]:
# Rebuild the label series and predictor matrix from `df`.
y = df['default payment next month']
X = df.drop(columns=['default payment next month'])

In [None]:
# Columns to remove from the predictor matrix before modelling.
drop_columns = ['ID']
X = X.drop(columns=drop_columns)

In [None]:
# train test split
# Pin the shuffle seed so the held-out rows, and therefore every metric
# reported from them, are identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Forest trained on the current X_train (built after drop_columns were removed).
# random_state is fixed so the fitted trees are reproducible run to run.
rfc_drop = RandomForestClassifier(max_depth=50, max_features=10,
                                  n_estimators=200, random_state=42)
rfc_drop.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split with rfc_drop (overwrites y_preds).
y_preds = rfc_drop.predict(X_test)

In [None]:
# Summarise the current y_preds against the held-out labels.
print(classification_report(y_test, y_preds))

In [None]:
# Preview the training features to check which columns the model sees.
X_train.head()

In [None]:
def create_plot_of_feature_importances(model, X):
    '''
    Draw a horizontal bar chart of feature importances, largest at the top.

    Inputs:

    model: A trained ensemble model instance exposing `feature_importances_`.
        A fitted search wrapper that lacks that attribute but carries the
        refit model on `best_estimator_` (e.g. GridSearchCV) is unwrapped.
    X: a dataframe of the features used to train the model; only `X.columns`
        is read, to label the bars.

    Raises:

    ValueError: if the number of importances differs from the number of
        columns in X. zip() would otherwise truncate silently and the bars
        would carry the wrong labels.
    '''
    # Search objects do not carry importances themselves; fall through to the
    # refit estimator when one is available.
    if not hasattr(model, 'feature_importances_'):
        model = getattr(model, 'best_estimator_', model)

    feat_importances = model.feature_importances_
    if len(feat_importances) != len(X.columns):
        raise ValueError(
            'model has {} feature importances but X has {} columns'.format(
                len(feat_importances), len(X.columns)))

    # Rank (column, importance) pairs from most to least important.
    ranked = sorted(zip(X.columns, feat_importances),
                    key=lambda pair: pair[1], reverse=True)
    features = [name for name, _ in ranked]
    importances = [score for _, score in ranked]

    plt.figure(figsize=(10, 6))
    plt.barh(features, importances)
    # Flip the y axis so the first (most important) feature is drawn on top.
    plt.gca().invert_yaxis()
    plt.title('Feature Importances')
    plt.xlabel('importance')

In [None]:
# Plot which features rfc_drop relied on, labelled with X_train's columns.
create_plot_of_feature_importances(rfc_drop, X_train)

# GridSearchCV Section

In [None]:
rfc = RandomForestClassifier()

## Dictionary of hyper parameters
# - 'auto' was dropped from max_features: for a forest classifier it meant the
#   same thing as 'sqrt' (so it only repeated that candidate), and newer
#   scikit-learn releases no longer accept it.
# - min_impurity_split no longer exists in scikit-learn; min_impurity_decrease
#   is the supported control. Its meaning differs (minimum impurity reduction
#   required to split, not a node-impurity threshold), so these values are a
#   starting point to re-tune rather than a 1:1 translation.
rf_params = {'max_depth': [10, 15, 20],
            'max_features': ['sqrt', 5],
            'min_impurity_decrease': [0.0, .01]}

In [None]:
## Create a Gridsearch Object
# 5-fold search over rf_params, ranking candidates by F1.
rfc_grid = GridSearchCV(estimator=rfc, param_grid=rf_params, scoring='f1', cv=5)

In [None]:
## Fit Gridsearch to data
# The search is run on the training split only; X_test / y_test are not passed in.
rfc_grid.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the first grid search.
rfc_grid.best_params_

In [None]:
# Best cross-validated score found by the first grid search (rfc_grid).
rfc_grid.best_score_

In [None]:
# Keep a handle on the estimator the first grid search selected.
m_best = rfc_grid.best_estimator_

In [None]:
# Predict the held-out split with the selected estimator.
y_hat = m_best.predict(X_test)

In [None]:
# Summarise the selected estimator's predictions against the held-out labels.
print(classification_report(y_test, y_hat))

In [None]:
# Second search grid: deeper trees, larger feature subsets, fixed forest size.
rf_params = dict(
    max_depth=[30, 50, 70],
    max_features=[10, 15, 20],
    n_estimators=[200],
)

In [None]:
# Fresh default forest and a second 5-fold, F1-scored search over rf_params.
rfc = RandomForestClassifier()
rfc_grid2 = GridSearchCV(estimator=rfc, param_grid=rf_params, scoring='f1', cv=5)

In [None]:
# normal function to run on cpu 
def func(a):                                 
    """Fit `a` on the notebook-level X_train / y_train and return it.

    a: any object with a fit(X, y) method. The same object that was passed in
       is returned after fit has been called on it.
    """
    a.fit(X_train, y_train)
    return a

In [None]:
# Plain-Python fit helper (kept under its original name for the cells below).
# The @jit decorator was removed: numba's `jit` compiles numeric Python for the
# CPU, it is not a GPU switch, and it cannot compile a call to a scikit-learn
# estimator's .fit. On older numba it only fell back to object mode with
# warnings; on current numba it raises a compilation error.
def func2(a):
    """Fit `a` on the notebook-level X_train / y_train and return it.

    a: any object with a fit(X, y) method. The same object that was passed in
       is returned after fit has been called on it.
    """
    a.fit(X_train, y_train)
    return a

In [None]:
# Time one full fit of the second grid search. The result keeps the name `gpu`
# because later cells read it, but nothing in this notebook moves the fit onto
# a GPU, so the printed label reports plain wall-clock seconds instead of
# claiming GPU execution.
if __name__ == "__main__":
    start = timer()
    gpu = func2(rfc_grid2)
    print("GridSearchCV fit time (s):", timer() - start)

In [None]:
# Hyper-parameter combination selected by the second search (`gpu` is the fitted rfc_grid2).
gpu.best_params_

In [None]:
# Best cross-validated score found by the second search.
gpu.best_score_

In [None]:
# Rebind m_best to the estimator selected by the second search.
m_best = gpu.best_estimator_

In [None]:
# Predict the held-out split with the newly selected estimator.
y_hat = m_best.predict(X_test)

In [None]:
# Summarise the second search's best estimator against the held-out labels.
print(classification_report(y_test, y_hat))

In [None]:
# Plot importances of the refit best forest. The search object `gpu` has no
# feature_importances_ attribute of its own, so pass its best_estimator_.
create_plot_of_feature_importances(gpu.best_estimator_, X_train)

In [None]:
from sklearn import metrics

# List every built-in scorer name usable as GridSearchCV's `scoring=` value.
# `metrics.SCORERS` is gone from newer scikit-learn releases in favour of
# `metrics.get_scorer_names()`, so use the helper when it exists and fall back
# to the old dict on installations that predate it.
if hasattr(metrics, 'get_scorer_names'):
    scorer_names = metrics.get_scorer_names()
else:
    scorer_names = metrics.SCORERS.keys()
sorted(scorer_names)

# Test Model for Submission

In [None]:
# Load the features to predict on for the submission.
df_pred = pd.read_csv('test_features.csv')

In [None]:
# Preview the submission features as loaded.
df_pred.head()

In [None]:
# Alias, not a copy: X_pred and df_pred are the same DataFrame object, so any
# in-place change made through one name is visible through the other.
X_pred = df_pred

In [None]:
# Preview the training feature matrix, e.g. to compare its columns with the submission features.
X.head()

In [None]:
# Columns to strip from the submission features before predicting.
drop_columns = ['ID', 'Unnamed: 0']
# Build X_pred from df_pred with one non-mutating drop. The loaded frame stays
# intact and the cell can be re-run; an in-place drop would raise KeyError the
# second time and would also alter df_pred whenever the two names are aliases.
X_pred = df_pred.drop(columns=drop_columns)

In [None]:
# Train the submission model on every labelled row. A new forest is built from
# rfc_drop's hyper-parameters instead of calling rfc_drop.fit(...) directly,
# because .fit refits in place and would silently replace the model whose
# hold-out metrics were reported earlier.
full_fit = RandomForestClassifier(**rfc_drop.get_params()).fit(X, y)

In [None]:
# Predict labels for the submission features with the model fit on all of X, y.
predict = full_fit.predict(X_pred)

In [None]:
# Wrap the predicted labels in a DataFrame for export.
# NOTE(review): this rebinds `df`, which held the training data until now, so
# earlier cells that read `df` cannot be re-run afterwards — consider a
# distinct name (TODO confirm nothing relies on the reuse).
df = pd.DataFrame(predict)

In [None]:
# Count how many rows fall under each distinct value in column 0.
df[0].value_counts()

In [7]:
# Write the frame to CSV without the row index.
df.to_csv('credit_default_preds_kktcad.csv', index=False)

In [6]:
# Show the frame's row count, columns, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6001 entries, 0 to 6000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       6001 non-null   int64
dtypes: int64(1)
memory usage: 47.0 KB


In [3]:
# Read the saved predictions file back into `df`.
# NOTE(review): the execution count on this cell is lower than on the export
# cell above, so it appears to have been run out of order — TODO confirm it
# still belongs in a top-to-bottom run.
df = pd.read_csv('credit_default_preds_kktcad.csv')

In [5]:
# Remove the stray index column that an export written *with* the index would
# contain. A file written by to_csv(..., index=False) has no such column, so
# errors='ignore' keeps this cell from raising KeyError on a top-to-bottom run.
df = df.drop(columns='Unnamed: 0', errors='ignore')