# Baseline: Balancing and Ensemble learning
Notebook that contains the baseline settings.

### Import Libraries

In [1]:
# Import core libraries
import pandas as pd
import numpy as np

# Parallel computing
from joblib import effective_n_jobs

# Undersampling
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Preprocessing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Metrics computation
from IPython.core.display_functions import display
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix, precision_score, roc_curve, plot_precision_recall_curve, roc_auc_score, auc, precision_recall_curve, confusion_matrix

# Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Cross validation
from sklearn.model_selection import GridSearchCV

# plot
%matplotlib inline
import matplotlib.pyplot as plt

### Define Model Metrics

In [None]:
# Model evaluation metrics
model_eval = {
    'model': [],
    'recall': [],
    'f1_score': [],
    'accuracy_score': [],
    'precision_score': []
}

def add_model_eval(model, recall, f1_score, accuracy_score, precision_score):
    model_eval['model'].append(model)
    model_eval['recall'].append(f'{recall: .2f}')
    model_eval['f1_score'].append(f'{f1_score: .2f}')
    model_eval['accuracy_score'].append(f'{accuracy_score: .2f}')
    model_eval['precision_score'].append(f'{precision_score: .2f}')

def view_models_eval(sort=False):
    eval_df = pd.DataFrame(model_eval)

    if sort:
        eval_df = eval_df.sort_values(by=['recall', 'f1_score'], ascending=[False, False])

    # display(eval_df.style.hide_index)
    display(eval_df)

### Load In Data

In [1]:
# Set our RNG seed for reproducibility.
RANDOM_STATE_SEED = 123
np.random.seed(RANDOM_STATE_SEED)
# Load in data (transactions)
cc_transactions = pd.read_csv("../data/creditcard.csv", index_col = False, delimiter=",")

# Partition data into X and y
X_transactions = cc_transactions.drop(['Class'], axis=1)
y_transactions = cc_transactions['Class']

NameError: name 'pd' is not defined

### Undersampling

In [4]:
# Undersampling
rus = RandomUnderSampler(random_state=42)
rus.fit(X_transactions,
        y_transactions)
X_resampled, y_resampled = rus.fit_resample(X_transactions,
                                            y_transactions)

# Shape of old dataset vs. new dataset
print('Original dataset shape {}'.format(Counter(y_transactions)))
print('Resampled dataset shape {}'.format(Counter(y_resampled)))

Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 492, 1: 492})


#### Standardisation

In [5]:
# Transforming the features 'Time' and 'Amount' and applying this to all partitions of the dataset
std_feat = ['Time',
            'Amount']
std_pipeline = Pipeline([('std_scaler',
                          StandardScaler())])

full_pipeline = ColumnTransformer([
    ('std_feat',
     std_pipeline,
     std_feat)],
    remainder='passthrough')

X_resampled = full_pipeline.fit_transform(X_resampled)

### Data Partitioning (Training supplied by AL, test and validation by train test split)

In [6]:
# Without Stratified Shuffle Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

### Cross Validation

In [8]:
# Cross validation random forest
n_est = [int(x) for x in np.linspace(start = 100, stop = 400, num = 100)]
rfc = RandomForestClassifier()

# Parameter Grid
param_grid_rfc = {
    'n_estimators': n_est,
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': ['True']
}

# Initialise GridSearchCV
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=5)
CV_rfc.fit(X_val, y_val)
CV_rfc.best_params_

{'bootstrap': 'True',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 100}

### Modeling Ensemble Learners

In [9]:
# Initialize random forest and use GridSearchCV parameters to fill in parameters of random forest
n_thread =  effective_n_jobs(-1)
rfc = RandomForestClassifier(bootstrap = 'True', criterion ='gini', n_estimators = 100, n_jobs=n_thread, max_features = 'auto')

# Train Model
rfc.fit(X_train, y_train)

# Predict using test data
y_pred_rfc = rfc.predict(X_test)

# Show model performance
add_model_eval('Random Forest Classifier', recall_score(y_test, y_pred_rfc), f1_score(y_test, y_pred_rfc), accuracy_score(y_test, y_pred_rfc), precision_score(y_test, y_pred_rfc))
view_models_eval()

Unnamed: 0,model,recall,f1_score,accuracy_score,precision_score
0,Random Forest Classifier,0.9,0.93,0.93,0.96


In [10]:
# Boosting Classifier
gradientB = GradientBoostingClassifier()

param_grid_gbc = {
    'n_estimators': list(range(100, 150, 10)),
    'learning_rate': [0.01, 0.025, 0.05, 0.075],
    'max_features':['log2','sqrt', 'auto'],
    'subsample':[0.5, 0.85, 0.95]
}

# Initialise GridSearchCV
CV_gbc = GridSearchCV(estimator=gradientB,
                      param_grid=param_grid_gbc,
                      cv=5)

# Fit the data on model
CV_gbc.fit(X_train,
           y_train)
CV_gbc.best_params_

{'learning_rate': 0.075,
 'max_features': 'log2',
 'n_estimators': 110,
 'subsample': 0.85}

In [None]:
# Initialize Gradient Booster and use GridSearchCV parameters to fill in parameters of Gradient Booster
gradientB = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.05, max_features = 'sqrt', subsample =0.5)

# Train Model
gradientB.fit(X_train, y_train)

# Predict using test data
y_pred_gb = gradientB.predict(X_test)

# Show model performance
add_model_eval('Gradient Boosting Classifier', recall_score(y_test, y_pred_gb), f1_score(y_test, y_pred_gb), accuracy_score(y_test, y_pred_gb), precision_score(y_test, y_pred_gb))
view_models_eval()

### Model metrics

In [None]:
# Generate precision recall curve values: precision, recall, thresholds
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, y_pred_rfc)
auc_precision_recall_rf = auc(recall_rf, precision_rf)

# Plot ROC curve
plt.plot(precision_rf, recall_rf, label="AUC="+str(auc_precision_recall_rf))
plt.ylabel('Precision', fontsize=13)
plt.xlabel('Recall', fontsize=13)
plt.title('Random Forest Classifier', fontsize=10)
plt.suptitle('Area Under the Precision Recall Curve', fontsize=13)
plt.show()

In [None]:
# Generate precision recall curve values: precision, recall, thresholds
precision_gb, recall_gb, thresholds_gb = precision_recall_curve(y_test, y_pred_gb)
auc_precision_recall_gb = auc(recall_gb, precision_gb)

# Plot ROC curve
plt.plot(precision_gb, recall_gb,label="AUC="+str(auc_precision_recall_gb))
plt.ylabel('Precision', fontsize=13)
plt.xlabel('Recall', fontsize=13)
plt.title('Gradient Boost Classifier', fontsize=10)
plt.suptitle('Area Under the Precision Recall Curve', fontsize=13)
plt.show()

In [None]:
tn_rf, fp_rf, fn_rf, tp_rf = confusion_matrix(y_test, y_pred_rfc).ravel()
specificity_rfc = tn_rf / (tn_rf+fp_rf)
specificity_rfc

In [None]:
tn_gb, fp_gb, fn_gb, tp_gb = confusion_matrix(y_test, y_pred_gb).ravel()
specificity_gbc = tn_gb / (tn_gb+fp_gb)
specificity_gbc