In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
# Third-party imports used throughout the analysis (duplicates collapsed).
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import (accuracy_score, cohen_kappa_score, confusion_matrix,
                             f1_score, recall_score)
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `sklearn.model_selection` exposes the same `train_test_split`.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle



In [3]:
# Load the credit-card transactions from the CSV file in the working directory.
csv_path = "creditcard.csv"
df = pd.read_csv(csv_path)
# Optional sanity check of the summary statistics: df.describe()

#### Feature description
'PCA' features - V1, V2, ... V28 <br>
'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. <br>
'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. <br>
'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise. <br>

There are 492 frauds out of 284,807 transactions. The dataset is highly imbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

In [4]:
# Class balance: how many rows carry each value of the 'Class' label
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
#creating time based features:

# separate transactions by day - 1 for day2 or 0 for day1
#df['day'] = df['Time'].map( lambda x: 1 if x > 3600 * 24 else 0)

## make transaction relative to day
#df['time_of_day'] = df.apply(lambda row: (row['Time']-86400) if(row['day']==1) else(row['Time']), axis=1)

#### shuffle the data so that it becomes truly random

In [None]:
#df = shuffle(df)

### We create the train-test split here, before any resampling, because in the real world test data is always unseen and therefore cannot be resampled.

In [4]:
# Separate the feature matrix (every column except 'Class') from the target column.
X, y = df.drop(labels=['Class'], axis=1), df['Class']

In [5]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `sklearn.model_selection` exposes the same `train_test_split`.
from sklearn.model_selection import train_test_split

# Whole dataset: hold out 30% as the test set, with a fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X,y,test_size = 0.30, random_state = 999)

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

Number transactions train dataset:  199364
Number transactions test dataset:  85443
Total number of transactions:  284807


### Re-sampling techniques

#### Technique 1 - SMOTEENN

In [6]:
from imblearn.combine import SMOTEENN

# Resample the TRAINING split only; the test split stays untouched.
# Seeded (same value as the train/test split) so the synthetic samples are repeatable.
sm1 = SMOTEENN(random_state=999)

# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and later removed;
# use whichever method the installed version provides.
_resample = sm1.fit_resample if hasattr(sm1, "fit_resample") else sm1.fit_sample
X_SMOTEENN, Y_SMOTEENN = _resample(X_train, Y_train)

# Keep the original feature names so the training frame lines up with X_test by column name.
df_SMOTEENN = pd.DataFrame(X_SMOTEENN, columns=X_train.columns)

#### Technique 2 - SMOTETOMEK

In [None]:
from imblearn.combine import SMOTETomek

# Resample the TRAINING split only; the test split stays untouched.
# Seeded (same value as the train/test split) so the synthetic samples are repeatable.
sm2 = SMOTETomek(random_state=999)

# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and later removed;
# use whichever method the installed version provides.
_resample = sm2.fit_resample if hasattr(sm2, "fit_resample") else sm2.fit_sample
X_SMOTETomek, Y_SMOTETomek = _resample(X_train, Y_train)

# Keep the original feature names so the training frame lines up with X_test by column name.
df_SMOTETomek = pd.DataFrame(X_SMOTETomek, columns=X_train.columns)
#df_SMOTETomek = df_SMOTETomek.assign(Class=Y_SMOTETomek)

### ML algorithms

In [7]:
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV

### Select the training data (currently the SMOTEENN-resampled set)

In [14]:
# Training inputs: the SMOTEENN-resampled features and labels.
input_data, input_labels = df_SMOTEENN, Y_SMOTEENN
# Evaluation inputs: the held-out split, which is not resampled.
test_data, test_labels = X_test, Y_test

### Shuffled Data

In [10]:
#input_data = shuffle(df_SMOTEENN)

### With resampled Data

In [None]:
# Grid-search each candidate classifier on the selected training data (optimising recall),
# report held-out metrics for the refit best estimator, and pickle that estimator to disk.
import pickle  # used by the model dump below; no other cell imports it

# Candidate estimators, keyed by the short name that is also used for the output file.
model = {
    # liblinear is pinned because the grid searches penalty='l1', which the lbfgs default
    # of scikit-learn >= 0.22 rejects; liblinear was the default before that release.
    'lr' : LogisticRegression(solver='liblinear'),
    'rfc': RandomForestClassifier(),
    'gbc': GradientBoostingClassifier()
}

# Hyper-parameter grid per estimator (keys match `model`).
# NOTE(review): recent scikit-learn releases appear to reject loss='deviance' and
# criterion='mae' for GradientBoostingClassifier - confirm against the installed version.
model_parameters = {
        'rfc':{
            'n_estimators': [50 ,100, 30], 
            'max_depth': [None, 1, 2, 3, 5], 
            'min_samples_split': [2, 3, 5]
        },
        'gbc':{
            'loss':['deviance', 'exponential'],
            'n_estimators': [50, 100, 150], 
            'criterion':['friedman_mse','mae'],
            'max_depth': [None, 1, 2, 3, 5], 
            'min_samples_split': [2, 3, 5]
        },
        'lr':{
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'penalty':['l1','l2']
        }
}

# The same 5-fold stratified splitter is shared by every grid search.
skf = StratifiedKFold(n_splits=5)
for m in model:
    print("Executing model - ",m)
    clf = GridSearchCV(estimator = model[m], param_grid = model_parameters[m], cv=skf, scoring='recall')
    clf.fit(input_data, input_labels)
    print(clf.best_params_)

    # Predictions come from the best estimator found by the search.
    test_pred = clf.predict(test_data)

    #print("Grid scores: ", clf.grid_scores_)
    #print("Best score on training set: ", clf.best_score_)
    print('Accuracy: ', str(np.round(accuracy_score(test_labels, test_pred),5)))
    print('Cohen Kappa: ' + str(np.round(cohen_kappa_score(test_labels, test_pred),5)))
    print('Recall: ' + str(np.round(recall_score(test_labels, test_pred),5)))
    #print('F1: ' + str(np.round(f1_score(test_labels, test_pred),5)))

    # model dump
    print('Writing best estimator to file for: ', m)
    fname = m + '-model.pkl'
    # The `with` block closes the file; the stray `f.close()-` was a syntax error.
    with open(fname,'wb') as f:
        pickle.dump(clf.best_estimator_, f)
    

### Results for each dataset

#### for original data
Executing model -  lr
Best score on training set:  0.470703964588
Accuracy:  0.99877
Cohen Kappa: 0.57675
Recall: 0.45652


Executing model -  rfc
Best score on training set:  0.594077266101
Accuracy:  0.99916
Cohen Kappa: 0.72145
Recall: 0.59239

Executing model -  gbc
Best score on training set:  0.743152877901
Accuracy:  0.99906
Cohen Kappa: 0.7181
Recall: 0.65217


#### for resampled data - SMOTEENN
Executing model -  lr
Accuracy:  0.98976
Cohen Kappa: 0.23847
Recall: 0.88043

Executing model -  rfc
Accuracy:  0.99964
Cohen Kappa: 0.89006
Recall: 0.89024

Executing model -  gbc
Accuracy:  0.99466
Cohen Kappa: 0.35888
Recall: 0.91463

#### for resampled data - SMOTETOMEK
Executing model -  lr
Accuracy:  0.98948
Cohen Kappa: 0.22473
Recall: 0.93902

Executing model -  rfc
Accuracy:  0.99964
Cohen Kappa: 0.88939
Recall: 0.88415

Executing model -  gbc
Accuracy:  0.99365
Cohen Kappa: 0.3197
Recall: 0.91463

### Without new features
Executing model -  lr
Accuracy:  0.99059
Cohen Kappa: 0.24503
Recall: 0.93617

Executing model -  rfc
Accuracy:  0.99964
Cohen Kappa: 0.8895
Recall: 0.88652

Executing model -  gbc
Accuracy:  0.99393
Cohen Kappa: 0.33024
Recall: 0.91489

### SMOTETomek method, with no additional features and without shuffling the data
Executing model -  lr
Accuracy:  0.98425
Cohen Kappa: 0.15936
Recall: 0.92199

Executing model -  rfc
Accuracy:  0.9996
Cohen Kappa: 0.88008
Recall: 0.88652

Executing model -  gbc
Accuracy:  0.99022
Cohen Kappa: 0.23781
Recall: 0.93617

#### As we can see, the resampled data (SMOTETomek method) performs best, with a recall of about 94% for simple logistic regression and for gradient boosted trees<br>

### Evaluation in imbalanced domains
Traditionally, the accuracy rate has been the most commonly used empirical measure.
However in case of imbalanced data, it cannot distinguish between the number of correctly classified examples of different classes. Comparing some of the metrics below : 
1. The <b> Area Under the ROC curve (AUC)</b> is equal to the probability that a random positive example will be ranked above a random negative example.
2. The <b> F1 Score</b> is the harmonic mean of precision and recall. 
3. <b> Cohen’s Kappa</b> is an evaluation statistic that takes into account how much agreement would be expected by chance
4. <b> Recall/Sensitivity </b> is the true positive rate: the proportion of actual positives that the model correctly identifies.
5. <b> Precision Recall curve </b> The precision-recall curve shows the tradeoff between precision and recall for different thresholds. A high area under the curve represents both high recall and high precision, where high precision relates to a low false positive rate, and high recall relates to a low false negative rate. High scores for both show that the classifier is returning accurate results (high precision), as well as returning a majority of all positive results (high recall).

I have used <b>Recall</b> as the metric to evaluate the model, as it helps identify how well the model performs on the more important but less frequent class (fraudulent transactions). The PR curve can also be used; however, it is ineffective when the data is re-sampled, which is the case for this analysis.

### Forward chaining cross validation technique for Time Series Data

Time-series data is problematic for cross-validation because if a pattern emerges in year 3 and persists through year 5, the model should be able to pick up on it, even though it wasn't part of years 1 & 2. <br>
For this we can use the forward chaining technique: <br>
fold 1 : training [1], test [2] <br>
fold 2 : training [1 2], test [3] <br>
fold 3 : training [1 2 3], test [4] <br>
fold 4 : training [1 2 3 4], test [5] <br>
<br>
This would be a possible approach for this problem if we had some continuity in the data or if we were modelling transactions over a period of time. However, the question doesn't explicitly state the exact time of transactions (it is relative), and hence this technique will not be explored further.
<br>