# Machine Learning Engineer Nanodegree
## Capstone Project
### Credit Card Fraud Detection


### RandomForest Classifier Tuning Notebook

In [1]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection  import train_test_split,KFold, cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,\
recall_score,classification_report,accuracy_score,precision_score,f1_score,make_scorer,average_precision_score
#from imblearn.over_sampling import SMOTE
from time import time
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
#from xgboost import XGBClassifier
#import seaborn as sns
%matplotlib inline

In [2]:
### Belows CSVs are Pre-Processed in Main Notebook(Credit_Card_Fraud_Detection_Capstone_Proj)
X_resampled_train=pd.read_csv('x_train.csv')
y_resampled_train=pd.read_csv('y_train.csv')
X_test=pd.read_csv('x_test.csv')
y_test=pd.read_csv('y_test.csv')

In [3]:
print('---------------Resampled data statistics---------------')
normal_trans_perc=sum(y_resampled_train['Class']==0)/(sum(y_resampled_train['Class']==0)+sum(y_resampled_train['Class']==1))
fraud_trans_perc=1-normal_trans_perc
print('Total number of records : {} '.format(len(y_resampled_train)))
print('Total number of normal transactions : {}'.format(sum(y_resampled_train['Class']==0)))
print('Total number of  fraudulent transactions : {}'.format(sum(y_resampled_train['Class']==1)))
print('Percent of normal transactions is : {:.4f}%,  fraudulent transactions is : {:.4f}%'.format(normal_trans_perc*100,fraud_trans_perc*100))

---------------Resampled data statistics---------------
Total number of records : 202999 
Total number of normal transactions : 199019
Total number of  fraudulent transactions : 3980
Percent of normal transactions is : 98.0394%,  fraudulent transactions is : 1.9606%


### Model Building ###

In [4]:
def train_predict(learner, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on       
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
   
    start = time() # Get start time
    learner.fit(X_train, y_train)
    end = time() # Get end time
      
    results['train_time'] = end - start
        
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    
    predictions_test_prob = learner.predict_proba(X_test)[:,1]
    predictions_train_prob = learner.predict_proba(X_train)[:,1]
    
    
    end = time() # Get end time
        
    results['pred_time'] =end - start
            
    
    results['acc_train'] = accuracy_score(y_train, predictions_train)      
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    results['rec_train'] = recall_score(y_train, predictions_train)      
    results['rec_test'] = recall_score(y_test, predictions_test)
    
    results['prec_train'] = precision_score(y_train, predictions_train)      
    results['prec_test'] = precision_score(y_test, predictions_test)
    
    
    results['f1_train'] = f1_score(y_train, predictions_train)
    results['f1_test'] = f1_score(y_test, predictions_test)
    
    results['auc_train'] = average_precision_score(y_train, predictions_train_prob,average='weighted')
    results['auc_test'] = average_precision_score(y_test, predictions_test_prob,average='weighted')
    
    
       
    # Success
    print("{} trained in time {:.4f} ".format(learner.__class__.__name__,(end - start)))
        
    # Return the results
    return results

In [8]:
# Initialize and train the basic model
clf_rf = RandomForestClassifier(random_state=0)
#clf_xg = XGBClassifier()

# Collect results on the learners
results = {}
clf_name = clf_rf.__class__.__name__
results[clf_name] = {}
results[clf_name] = train_predict(clf_rf, X_resampled_train, y_resampled_train.values.ravel(), X_test, y_test.values.ravel())


RandomForestClassifier trained in time 0.5815 


In [9]:
rf_res=pd.DataFrame(results['RandomForestClassifier'],index=['RF'])


In [10]:
rf_res[['train_time','pred_time','acc_train','acc_test','rec_train','rec_test',\
         'prec_train','prec_test','f1_train','f1_test','auc_train','auc_test']]

Unnamed: 0,train_time,pred_time,acc_train,acc_test,rec_train,rec_test,prec_train,prec_test,f1_train,f1_test,auc_train,auc_test
RF,11.162182,0.581533,0.999926,0.999579,0.996482,0.809524,0.999748,0.937008,0.998112,0.868613,0.999997,0.836311


### RandomForecast Classifier Model Tuning 

In [11]:
def XGB_class_tune(clf,param_set1):
    start = time() # Get start time    
    scorer = make_scorer(recall_score)
    grid_obj = GridSearchCV(estimator=clf, param_grid=param_set1, scoring=scorer,cv=5)

    grid_fit = grid_obj.fit(X_resampled_train, y_resampled_train.values.ravel())

    # Get the estimator
    best_clf = grid_fit.best_estimator_

    # Make predictions using the unoptimized and model
    predictions = (clf.fit(X_resampled_train, y_resampled_train.values.ravel())).predict(X_test)
    predictions_prob = clf.predict_proba(X_test)[:,1]

    best_predictions = best_clf.predict(X_test)
    best_predictions_prob = best_clf.predict_proba(X_test)[:,1]


    # Report the before-and-afterscores
    print("Current model\n------")
    print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("F1-score on testing data: {:.4f}".format(f1_score(y_test, predictions)))
    print("recall on testing data: {:.4f}".format(recall_score(y_test, predictions)))
    print("precision score on testing data: {:.4f}".format(precision_score(y_test, predictions)))
    print("Precision recall auc on testing data: {:.4f}".\
          format(average_precision_score(y_test, predictions_prob,average='weighted')))

    print("\nNew Model\n------")
    print("accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("F1-score on the testing data: {:.4f}".format(f1_score(y_test, best_predictions)))
    print("Recall score on the testing data: {:.4f}".format(recall_score(y_test, best_predictions)))
    print("precision score on testing data: {:.4f}".format(precision_score(y_test, best_predictions)))
    print("Precision recall auc on testing data: {:.4f}".\
          format(average_precision_score(y_test, best_predictions_prob,average='weighted')))

    end = time() # Get end time
    # show best parameters
    print("\nBest Classifier\n------")
    print(best_clf)
    print('time taken',end-start)

In [12]:
clf = RandomForestClassifier(random_state=0)
param_set1 = {'max_depth': (15, 20)}
XGB_class_tune(clf,param_set1)

Current model
------
Accuracy score on testing data: 0.9996
F1-score on testing data: 0.8686
recall on testing data: 0.8095
precision score on testing data: 0.9370
Precision recall auc on testing data: 0.8363

New Model
------
accuracy score on the testing data: 0.9996
F1-score on the testing data: 0.8645
Recall score on the testing data: 0.8027
precision score on testing data: 0.9365
Precision recall auc on testing data: 0.8325

Best Classifier
------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
time taken 111.07553219795227


In [13]:
clf = RandomForestClassifier(random_state=0,,max_depth=20)
param_set1 = {'min_samples_split': (3,4)}
XGB_class_tune(clf,param_set1)

Current model
------
Accuracy score on testing data: 0.9996
F1-score on testing data: 0.8686
recall on testing data: 0.8095
precision score on testing data: 0.9370
Precision recall auc on testing data: 0.8363

New Model
------
accuracy score on the testing data: 0.9996
F1-score on the testing data: 0.8603
Recall score on the testing data: 0.7959
precision score on testing data: 0.9360
Precision recall auc on testing data: 0.8386

Best Classifier
------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
time taken 112.05515432357788


In [15]:
clf = RandomForestClassifier(random_state=0,max_depth=20)
param_set1 = {'min_samples_leaf': (4,5)}
XGB_class_tune(clf,param_set1)

Current model
------
Accuracy score on testing data: 0.9996
F1-score on testing data: 0.8645
recall on testing data: 0.8027
precision score on testing data: 0.9365
Precision recall auc on testing data: 0.8325

New Model
------
accuracy score on the testing data: 0.9995
F1-score on the testing data: 0.8521
Recall score on the testing data: 0.8231
precision score on testing data: 0.8832
Precision recall auc on testing data: 0.8323

Best Classifier
------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
time taken 113.91074967384338


In [16]:
clf = RandomForestClassifier(random_state=0,max_depth=20,min_samples_leaf=5)
param_set1 = {'n_estimators': (12,14,16)}
XGB_class_tune(clf,param_set1)

Current model
------
Accuracy score on testing data: 0.9995
F1-score on testing data: 0.8521
recall on testing data: 0.8231
precision score on testing data: 0.8832
Precision recall auc on testing data: 0.8323

New Model
------
accuracy score on the testing data: 0.9995
F1-score on the testing data: 0.8511
Recall score on the testing data: 0.8163
precision score on testing data: 0.8889
Precision recall auc on testing data: 0.8394

Best Classifier
------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=12, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
time taken 211.94233965873718


### Final Tuned RandomClassifier Model

In [17]:
clf = RandomForestClassifier(random_state=0,max_depth=20,min_samples_leaf=5) 

# Collect results on the learners
results = {}
clf_name = clf.__class__.__name__
results[clf_name] = {}
results[clf_name] = train_predict(clf, X_resampled_train, y_resampled_train.values.ravel(), X_test, y_test.values.ravel())

RandomForestClassifier trained in time 0.5917 


In [18]:
rf_res=pd.DataFrame(results['RandomForestClassifier'],index=['RF'])
rf_res[['train_time','pred_time','acc_train','acc_test','rec_train','rec_test',\
         'prec_train','prec_test','f1_train','f1_test','auc_train','auc_test']]

Unnamed: 0,train_time,pred_time,acc_train,acc_test,rec_train,rec_test,prec_train,prec_test,f1_train,f1_test,auc_train,auc_test
RF,11.612616,0.591732,0.999675,0.999508,0.987186,0.823129,0.996197,0.883212,0.991671,0.852113,0.999478,0.83234
