In [1]:
#import section
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, f1_score

In [3]:
data = pd.read_csv("data/creditcard.csv")

In [9]:
train, test = train_test_split(data, test_size = 0.4, stratify=data['Class'], random_state = 42)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [10]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
print(train.columns)
print(test.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


Let's get some info about data:

In [11]:
X = data.drop(['Class'], axis = 1)
y = data['Class']
print("Num of X col:", X.shape[1])
print("Num of data col:", data.shape[1])
print("Num of X rows:", X.shape[0])
print("_____________________________")
print("Target class distriburion:", "\n", y.value_counts()) # Data are extremely unbalanced
print("Frauds percentage:", round((y.value_counts()[1]/len(y) * 100), 2), " %")
print("_____________________________")
print(X.describe()) # "Time" and "Amount" features is not scaled
print("_____________________________")
print(X.info()) # Data do not contain any missing or categorical values

Num of X col: 30
Num of data col: 31
Num of X rows: 284807
_____________________________
Target class distriburion: 
 0    284315
1       492
Name: Class, dtype: int64
Frauds percentage: 0.17  %
_____________________________
                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  3.919560e-15  5.688174e-16 -8.769071e-15  2.782312e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7       

As result of first look at the data we can conclude that the data do not contain missing or categorical values. "Time" and "Amount" features are not scaled. Thus we don't need to bother about imputing and encoding of the data, however maybe we need to scale couple of columns. 
Another important conclusion is that this dataset is extremely unbalanced, which means that, probably, we'll nedd to use resample techniques for these data.


Now we need to create test dataset. We'll use train_test_split for this purpose with "stratify = y" in order that test dataset will contain the same amount of fraud observations as a train dataset.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,stratify = y, random_state = 42)
print("Train frauds percentage:", y_train.value_counts()[1]/len(y_train) * 100, " %")
print("Test frauds percentage:", y_test.value_counts()[1]/len(y_test) * 100, " %")

Train frauds percentage: 0.17254870488152324  %
Test frauds percentage: 0.17321489179921118  %


Let's do some naive predictions assuming that all transactions with credit card were valid and there were no any frauds.
And let's check how good this prediction on test set. Since these data are very unbalanced accuracy_score is not the best metric, but still it's very representative. So we are gonna use accuracy_score, f1_score, and confusion matrix for our predictions performance. 

In [17]:
naive_pred_list = [0]*len(y_test)
naive_pred = pd.Series(naive_pred_list)
print("Accuracy_score:", accuracy_score(naive_pred, y_test))
print("F1_score:", f1_score(naive_pred, y_test))
print("Confusion_matrix:", "\n", confusion_matrix(naive_pred, y_test))


Accuracy_score: 0.9982678510820079
F1_score: 0.0
Cconfusion_matrix: 
 [[85295   148]
 [    0     0]]


As expected we obtained extremely good accuracy score, since most of the data contain "0" as target. However, if we want to predict fraud cases, such model is not suitable. Since our prediction doesn't contain any "1", we get f1_score equal to zero.
So this isn't useful for prediction, however it's useful as baseline model. Thus we can compare any further model with this one. 

Let's create a bunch of simple models and check what model works best on the raw data.

**So from now we will be use X_train and y_train for our model learning.**

In [22]:
# Let's create a bunch of simple models

models = [RandomForestClassifier(),
          KNeighborsClassifier(),
         DecisionTreeClassifier(),
         SVC(),
         RidgeClassifier()]

def model_perform(X_train, y_train, X_test, y_test):
    for model in models:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        print("___________________________________")
        print(model.__class__.__name__ + " accuracy_score:", accuracy_score(pred, y_test))
        print(model.__class__.__name__ + " f1_score:", f1_score(pred, y_test))
        print(model.__class__.__name__ + " confusion matrix:", confusion_matrix(pred, y_test))
        print("___________________________________")

model_perform(X_train, y_train, X_test, y_test)

___________________________________
RandomForestClassifier accuracy_score: 0.9995318516437859
RandomForestClassifier f1_score: 0.849624060150376
RandomForestClassifier confusion matrix: [[85290    35]
 [    5   113]]
___________________________________
___________________________________
KNeighborsClassifier accuracy_score: 0.9983029622087239
KNeighborsClassifier f1_score: 0.03973509933774835
KNeighborsClassifier confusion matrix: [[85295   145]
 [    0     3]]
___________________________________
___________________________________
DecisionTreeClassifier accuracy_score: 0.9991573329588147
DecisionTreeClassifier f1_score: 0.7428571428571429
DecisionTreeClassifier confusion matrix: [[85267    44]
 [   28   104]]
___________________________________
___________________________________
SVC accuracy_score: 0.9982678510820079
SVC f1_score: 0.0
SVC confusion matrix: [[85295   148]
 [    0     0]]
___________________________________
___________________________________
RidgeClassifier accuracy_s

As we can see, RandomForest perform much better on the raw data then any other model. Only 42 mistakes in RF prediction, so the highest f1_score obtained. 
Now, let's try to scale "Amount" column and drop "Time" column (it doesn't seems useful, because it shows time between current operation and the first operation in dataset)

In [17]:
# Drop "Time" column
X_train = X_train.drop(["Time"], axis = 1)
X_test = X_test.drop(["Time"], axis = 1)

# MinMax scaling of "Amount" column
X_train["Amount"] = (X_train["Amount"] - X_train["Amount"].min()) / (X_train["Amount"].max() - X_train["Amount"].min())
X_test["Amount"] = (X_test["Amount"] - X_train["Amount"].min()) / (X_train["Amount"].max() - X_train["Amount"].min())


Let's check now how different models perform on preprocessed data:

In [24]:
model_perform(X_train, y_train, X_test, y_test)


___________________________________
RandomForestClassifier accuracy_score: 0.9995435553526912
RandomForestClassifier f1_score: 0.8539325842696629
RandomForestClassifier confusion matrix: [[85290    34]
 [    5   114]]
___________________________________
___________________________________
KNeighborsClassifier accuracy_score: 0.999403110845827
KNeighborsClassifier f1_score: 0.8089887640449438
KNeighborsClassifier confusion matrix: [[85284    40]
 [   11   108]]
___________________________________
___________________________________
DecisionTreeClassifier accuracy_score: 0.9992158515033414
DecisionTreeClassifier f1_score: 0.7615658362989324
DecisionTreeClassifier confusion matrix: [[85269    41]
 [   26   107]]
___________________________________
___________________________________
SVC accuracy_score: 0.9986891846026006
SVC f1_score: 0.4105263157894737
SVC confusion matrix: [[85292   109]
 [    3    39]]
___________________________________
___________________________________
RidgeClassif

As we can see scaling "Amount" feature and drop "Time" feature lead to improvement of most models performance.
Still the best model is RandomForest, however it's performance almost the same as for the raw data.
RidgeClassifier performs much worse with preprocessed data. So from now for predictions we are gonna use only RandomForest model.

Now let's try sampling technique for the data and check whether will it be better. Here we are gonna use oversampling in order to keep high enough observations.

In [25]:
from sklearn.utils import resample

# define func that will provide oversampling to the dataset
def resampling(X_train, y_train, n):
    X_conc = pd.concat([X_train, y_train], axis=1)
    #Seperate minority and majority class:
    not_fraud = X_conc[X_conc['Class'] == 0]
    fraud = X_conc[X_conc['Class'] == 1]
    
    #Now use the oversampling techniques
    random_sampling = resample(fraud,
                               replace=True,
                               n_samples = int(n * len(not_fraud)),
                               random_state = 42
                              )
    
    #combine minority and upsample data
    upsample = pd.concat([not_fraud,random_sampling])
    #Check new values are balances for the both classes or not
    print(upsample['Class'].value_counts())
    new_y_train = upsample.Class
    new_x_train = upsample.drop('Class', axis=1)
    return new_x_train, new_y_train


In [30]:
# Here let's check a couple of oversampled datasets

j = [0.1, 0.3, 0.5] # fraud to non-fraud ratios
for n in j:
    RF_model = RandomForestClassifier()
    new_x_train, new_y_train = resampling(X_train, y_train, n)
    RF_model.fit(new_x_train, new_y_train)
    pred = RF_model.predict(X_test)
    print("___________________________________")
    print(" accuracy_score:", accuracy_score(pred, y_test))
    print(" f1_score:", f1_score(pred, y_test))
    print(" confusion matrix:", confusion_matrix(pred, y_test))
    print("___________________________________")

Total sample which are not fraud : 199020
Total Fraud samples : 344
0    199020
1     19902
Name: Class, dtype: int64
___________________________________
 accuracy_score: 0.9994850368081645
 f1_score: 0.8307692307692307
 confusion matrix: [[85291    40]
 [    4   108]]
___________________________________
Total sample which are not fraud : 199020
Total Fraud samples : 344
0    199020
1     59706
Name: Class, dtype: int64
___________________________________
 accuracy_score: 0.9994733330992591
 f1_score: 0.8262548262548263
 confusion matrix: [[85291    41]
 [    4   107]]
___________________________________
Total sample which are not fraud : 199020
Total Fraud samples : 344
0    199020
1     99510
Name: Class, dtype: int64
___________________________________
 accuracy_score: 0.9994499256814484
 f1_score: 0.8171206225680935
 confusion matrix: [[85291    43]
 [    4   105]]
___________________________________


As we can see, oversampling technique does not improve performance of model. The more samples we create the worse performance we obtain. Thus for the final predictions we are not gonna use it.

So for now we defined that RandomForest works best for these data. Sampling technique do not improve performance.
Thus let's tune a little pur RandomForest model, maybe we can improve our model further. We'll use GridSearchCV for this purpose.

In [18]:
grid_params = dict(n_estimators = [100, 300, 500],
                  min_samples_leaf = [1, 2, 4])
#                  max_features = ["auto", "sqrt", "log2"])
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=grid_params, scoring='f1', cv=4)
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [36]:
print('Best GridSearh f1_score: ', grid_search.best_score_)

0.8494239848668931

In [37]:
grid_pred = grid_search.predict(X_test)
print(grid_pred)
grid_search.best_params_

[0 0 0 ... 0 0 0]


{'n_estimators': 300}

In [38]:
#roc_auc_score(pred, y_test)
print("accuracy_score:", accuracy_score(grid_pred, y_test))
print("roc_auc_score:", roc_auc_score(grid_pred, y_test))
print("f1_score:", f1_score(grid_pred, y_test))
print(confusion_matrix(grid_pred, y_test))
print("___________________________________")

accuracy_score: 0.9995318516437859
roc_auc_score: 0.9748007571229329
f1_score: 0.8507462686567164
[[85289    34]
 [    6   114]]
___________________________________
