In [3]:
import pandas as pd 
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC 
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier,
                             ExtraTreesClassifier)
import d2l 
import mxnet as mx
from mxnet import gluon, np, npx 
from mxnet.gluon import nn
npx.set_np()


In [4]:
data_path = r'C:\Users\Arnaud wanet\Documents\Machine_learning_book_2nd_playground\data'
data_path = data_path + '/creditcard.csv'

df = pd.read_csv(data_path)

In [5]:
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [7]:
len(df)

284807

In [8]:
df.shape

(284807, 31)

In [9]:
df['Class'].unique()

array([0, 1], dtype=int64)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26  

In [15]:
features = df.drop(df['Class'],inplace=True)
labels = df['Class']

In [21]:
features_columns = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

features = df[features_columns]
labels = df['Class']

In [22]:
features.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(features, labels,
                                                   test_size=0.2, random_state=42)

### Solution sequence
        - Resample the dataset
        - split the dataset 
        - Perform GRIDsearch to find the best classifier
        - extra: use a Conv1D net to make the classification

In [30]:
# We first start by spot checking some algorithms to 
# see if the model won't be impacted by the imabalanced 
# nature of our dataset 
from sklearn.model_selection import cross_val_score

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

results = []
test_set_results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state=42)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    test_score = model.fit(X_train, Y_train).score(X_test, Y_test)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    test_msg = "%s score on the test set %f" % (name, test_score)
    print(msg)
    print(test_msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LR: 0.999017 (0.000196)
LR score on the test set 0.998859
LDA: 0.999416 (0.000206)
LDA score on the test set 0.999315




KNN: 0.998345 (0.000256)
KNN score on the test set 0.998473




CART: 0.999091 (0.000176)
CART score on the test set 0.999157




NB: 0.993421 (0.000652)
NB score on the test set 0.992855




SVM: 0.998253 (0.000239)
SVM score on the test set 0.998350


"""Accuracy is not a good measure of classifier accuracy on imbalanced datasets
these models are visibly learning for the majority class lets resample the dataset"""

In [31]:
import imblearn

Using MXNet backend


In [35]:
from imblearn.over_sampling import SMOTE

In [36]:
sm = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(features, labels)

In [39]:
X_res.shape

(568626, 30)

In [40]:
Y_res.shape

(568626,)

In [41]:
X_train_res, X_test_res, Y_train_res, Y_test_res = train_test_split(X_res, Y_res,
                                                                   test_size=.2,
                                                                   random_state=42)

In [None]:
results = []
test_set_results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)
    cv_results = cross_val_score(model, X_train_res, Y_train_res, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    test_score = model.fit(X_train_res, Y_train_res).score(X_test, Y_test)
    test_score_res = model.fit(X_train_res, Y_train_res).score(X_test_res, Y_test_res)
    msg = "%s score on the resampled data and std: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    test_msg = "%s score on the test set %f" % (name, test_score)
    test_msg_res = "%s score on the resampled test set %f" % (name, test_score_res)
    print(msg)
    print(test_msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LR score on the resampled data and std: 0.969558 (0.006052)
LR score on the test set 0.981584
LDA score on the resampled data and std: 0.933199 (0.001054)
LDA score on the test set 0.988185
KNN score on the resampled data and std: 0.959624 (0.000910)
KNN score on the test set 0.959832
CART score on the resampled data and std: 0.998391 (0.000334)
CART score on the test set 0.999526
NB score on the resampled data and std: 0.868123 (0.000942)
NB score on the test set 0.991801
