In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [2]:
base = pd.read_csv('creditcard.csv')

In [3]:
base.Amount = base.Amount / base.Amount.max()
base.Time = base.Time / base.Time.max()

In [4]:
X = base.drop('Class',axis=1)
y = base.Class

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=22, stratify = y)

## UNDER SAMPLER

In [6]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=22)
X_res_under, y_res_under = rus.fit_resample(X_train, y_train)

In [7]:
from sklearn.linear_model import LogisticRegression
clf_RL = LogisticRegression(random_state=22).fit(X_res_under, y_res_under)
y_pred_RL = clf_RL.predict(X_test)
y_pred_proba_RL = clf_RL.predict_proba(X_test)[:,1]

In [8]:
precision_RL, recall_RL, thresholds_RL = metrics.precision_recall_curve(y_test, y_pred_RL)

In [9]:
print(metrics.auc(recall_RL,precision_RL))

0.4782144609062044


In [10]:
from sklearn.metrics import recall_score

In [11]:
recall_score(y_test, y_pred_RL)

0.9197530864197531

In [12]:
confusion_matrix(y_test, y_pred_RL)

array([[89896,  3929],
       [   13,   149]], dtype=int64)

## CLUSTER CENTROIDS

In [13]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=42)
X_resCC, y_resCC = cc.fit_resample(X_train, y_train)



In [14]:
# SVC

from sklearn.svm import SVC

parametros = {
    'C': [0.01, 0.1, 0.2, 0.5,1, 10, 20,50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto' ]
}

SVC_GS = SVC(random_state=0,probability=True)
clf_GS = GridSearchCV(SVC_GS, parametros, scoring = 'recall').fit(X_resCC, y_resCC)
y_pred_GS = clf_GS.predict(X_test)
confusion_matrix(y_test, y_pred_GS)

array([[87059,  6766],
       [   14,   148]], dtype=int64)

In [15]:
metrics.recall_score(y_test, y_pred_GS)

0.9135802469135802

In [16]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier
RF_GS = RandomForestClassifier(random_state=0)

parametros = {
#     'n_estimators' : [200,220, 300,500],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [3,5,7,8,9,10,11,22]
}

clf_GS_FO = GridSearchCV(RF_GS, parametros, scoring = 'recall').fit(X_resCC, y_resCC)
y_pred_GS_FO = clf_GS_FO.predict(X_test)
confusion_matrix(y_test, y_pred_GS_FO)

array([[68807, 25018],
       [    8,   154]], dtype=int64)

In [17]:
metrics.recall_score(y_test, y_pred_GS_FO)

0.9506172839506173

## NEAR MISS

In [18]:
import warnings
warnings.filterwarnings('ignore')

## GRIDSEARCH CV

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
parametros = {
    'C': [0.1],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet', None],
#     'max_iter': [220]
}

In [21]:
from sklearn.linear_model import LogisticRegression
LogReg = LogisticRegression(random_state=42)

In [22]:
clf_GS = GridSearchCV(LogReg, parametros,scoring=['recall','precision'],refit='recall')

In [23]:
clf_GS =  clf_GS.fit(X_res_under, y_res_under)

In [24]:
clf_GS.best_params_

{'C': 0.1, 'penalty': None, 'solver': 'newton-cg'}

In [25]:
y_pred_GS = clf_GS.predict(X_test)

In [26]:
confusion_matrix(y_test, y_pred_GS)

array([[87231,  6594],
       [   10,   152]], dtype=int64)

In [27]:
metrics.precision_score(y_test, y_pred_GS)

0.022531870738215238

In [28]:
# metrics.recall_score(y_test, y_pred_GS)
#0.9324324324324325

In [29]:
metrics.recall_score(y_test, y_pred_GS)

0.9382716049382716

In [30]:
# pd.DataFrame(clf_GS.cv_results_)

## SUPPORT VECTOR MACHINE (SVC)

In [31]:
parametros = {
    'C': [0.01, 0.1, 0.2, 0.5,1, 10, 20,50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto' ]
}

In [32]:
from sklearn.svm import SVC

SVC_GS = SVC(random_state=0,probability=True)

# y_pred_SVC = SVC_GS.predict(X_test)
# y_pred_proba_SVC = clf_SVC.predict_proba(X_test)[:,1]

In [33]:
clf_GS = GridSearchCV(SVC_GS, parametros, scoring = 'recall')

In [34]:
clf_GS.fit(X_res_under, y_res_under)

In [35]:
clf_GS.best_params_

{'C': 20, 'gamma': 'auto', 'kernel': 'rbf'}

In [36]:
y_pred_GS = clf_GS.predict(X_test)

In [37]:
confusion_matrix(y_test, y_pred_GS)

array([[85961,  7864],
       [   10,   152]], dtype=int64)

In [38]:
metrics.recall_score(y_test, y_pred_GS)

0.9382716049382716

## RANDOM FOREST

In [39]:
from sklearn.ensemble import RandomForestClassifier
RF_GS = RandomForestClassifier(random_state=0)

In [40]:
parametros = {
#     'n_estimators' : [200,220, 300,500],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [3,5,7,8,9,10,11,22]
}

In [41]:
clf_GS_FO = GridSearchCV(RF_GS, parametros, scoring = 'recall')

In [42]:
clf_GS_FO = clf_GS_FO.fit(X_res_under, y_res_under)

In [43]:
clf_GS_FO.best_params_

{'criterion': 'gini', 'max_depth': 9}

In [44]:
y_pred_GS_FO = clf_GS_FO.predict(X_test)

In [45]:
confusion_matrix(y_test, y_pred_GS_FO)

array([[89956,  3869],
       [   17,   145]], dtype=int64)

In [46]:
metrics.recall_score(y_test, y_pred_GS_FO)

0.8950617283950617

## KNN

In [47]:
from sklearn.neighbors import KNeighborsClassifier
KNN_gs = KNeighborsClassifier()

In [48]:
parametros = {
        'n_neighbors' : [3,5,7,9,11,21,77],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [49]:
clf_GS_knn = GridSearchCV(KNN_gs, parametros, scoring = 'recall')

In [50]:
clf_GS_knn = clf_GS_knn.fit(X_res_under, y_res_under)

In [51]:
clf_GS_knn.best_params_

{'algorithm': 'auto', 'n_neighbors': 3}

In [52]:
y_pred_GS_knn = clf_GS_knn.predict(X_test)

In [53]:
confusion_matrix(y_test, y_pred_GS_knn)

array([[90837,  2988],
       [   17,   145]], dtype=int64)

In [54]:
metrics.recall_score(y_test, y_pred_GS_knn)

0.8950617283950617