In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
# Load the dataset from creditcard.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [62]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [63]:

# Feature matrix: a five-column subset of the frame.
X = df.loc[:, ['V10', 'V12', 'V14', 'V17', 'Amount']]
# Target vector: the binary `Class` column.
y = df['Class']

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [65]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
def get_metrics(y, y_pred, y_proba):
    """Compute a fixed set of classification scores.

    Parameters
    ----------
    y
        True labels.
    y_pred
        Predicted labels; used for accuracy, precision, recall and F1.
    y_proba
        Scores passed to ``roc_auc_score`` (callers in this notebook pass
        the second column of ``predict_proba``).

    Returns
    -------
    dict
        Mapping with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'roc_auc'``, in that order.
    """
    return {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_proba),
    }

# LogisticRegression

In [67]:
from sklearn.linear_model import LogisticRegression

In [68]:
# Fit a logistic regression on the scaled training split (`fit` returns the
# estimator itself, so construction and fitting can be chained).
logres = LogisticRegression(random_state=42).fit(X_train, y_train)
logres

In [69]:
# Test split: hard labels, and column 1 of predict_proba as the score.
y_test_pred, y_test_proba = logres.predict(X_test), logres.predict_proba(X_test)[:, 1]

# Training split: the same two outputs, used to compare against the test scores.
y_train_pred, y_train_proba = logres.predict(X_train), logres.predict_proba(X_train)[:, 1]

In [70]:
# Score the logistic regression on the training split; four decimals per metric.
metrics = get_metrics(y_train, y_train_pred, y_train_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}_train:{metric_value:.4f}')

accuracy_train:0.9992
precision_train:0.8841
recall_train:0.5988
f1_train:0.7140
roc_auc_train:0.9590


In [71]:
# Score the logistic regression on the held-out split; four decimals per metric.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}:{metric_value:.4f}')

accuracy:0.9991
precision:0.8469
recall:0.5608
f1:0.6748
roc_auc:0.9362


In [72]:
# Per-class precision/recall/F1 for the logistic regression on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.85      0.56      0.67       148

    accuracy                           1.00     85443
   macro avg       0.92      0.78      0.84     85443
weighted avg       1.00      1.00      1.00     85443



# DecisionTree

In [73]:
from sklearn.tree import DecisionTreeClassifier

In [74]:
# Fit a decision tree with default (unrestricted) growth settings and a fixed seed.
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
tree

In [75]:
# Test split: hard labels, and column 1 of predict_proba as the score.
y_test_pred, y_test_proba = tree.predict(X_test), tree.predict_proba(X_test)[:, 1]

# Training split: the same two outputs, used to compare against the test scores.
y_train_pred, y_train_proba = tree.predict(X_train), tree.predict_proba(X_train)[:, 1]

In [76]:
# Score the decision tree on the training split; four decimals per metric.
metrics = get_metrics(y_train, y_train_pred, y_train_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}_train:{metric_value:.4f}')

accuracy_train:1.0000
precision_train:1.0000
recall_train:1.0000
f1_train:1.0000
roc_auc_train:1.0000


In [77]:
# Score the decision tree on the held-out split; four decimals per metric.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}:{metric_value:.4f}')

accuracy:0.9990
precision:0.7153
recall:0.6622
f1:0.6877
roc_auc:0.8309


In [78]:
# Per-class precision/recall/F1 for the decision tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.72      0.66      0.69       148

    accuracy                           1.00     85443
   macro avg       0.86      0.83      0.84     85443
weighted avg       1.00      1.00      1.00     85443



# RandomForest

In [79]:
from sklearn.ensemble import RandomForestClassifier

In [80]:
# Random forest with default hyperparameters. The seed is fixed (matching the
# other models in this notebook) so that bootstrap sampling and feature
# selection, and therefore the reported metrics, are reproducible across runs.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

In [81]:
# Test split: hard labels, and column 1 of predict_proba as the score.
y_test_pred, y_test_proba = forest.predict(X_test), forest.predict_proba(X_test)[:, 1]

# Training split: the same two outputs, used to compare against the test scores.
y_train_pred, y_train_proba = forest.predict(X_train), forest.predict_proba(X_train)[:, 1]

In [82]:
# Score the random forest on the training split; four decimals per metric.
metrics = get_metrics(y_train, y_train_pred, y_train_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}_train:{metric_value:.4f}')

accuracy_train:1.0000
precision_train:1.0000
recall_train:1.0000
f1_train:1.0000
roc_auc_train:1.0000


In [83]:
# Score the random forest on the held-out split; four decimals per metric.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}:{metric_value:.4f}')

accuracy:0.9996
precision:0.9508
recall:0.7838
f1:0.8593
roc_auc:0.9211


In [84]:
# Per-class precision/recall/F1 for the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.95      0.78      0.86       148

    accuracy                           1.00     85443
   macro avg       0.98      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



# CatBoost

In [85]:
from catboost import CatBoostClassifier

In [86]:
# CatBoost classifier with hand-picked settings. `scale_pos_weight` up-weights
# the positive class to counter the class imbalance; no categorical features
# are declared.
cat_boost = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.03,
    depth=8,
    scale_pos_weight=5,
    random_state=42,
    cat_features=[],
)
# verbose=100 prints a training-progress line every 100 iterations.
cat_boost.fit(X_train, y_train, verbose=100)

0:	learn: 0.5896264	total: 40.8ms	remaining: 1m 21s
100:	learn: 0.0070329	total: 3.67s	remaining: 1m 8s
200:	learn: 0.0050723	total: 6.38s	remaining: 57.1s
300:	learn: 0.0039056	total: 8.99s	remaining: 50.8s
400:	learn: 0.0029882	total: 11.7s	remaining: 46.6s
500:	learn: 0.0023708	total: 13.9s	remaining: 41.7s
600:	learn: 0.0019245	total: 16.2s	remaining: 37.6s
700:	learn: 0.0015888	total: 18.5s	remaining: 34.3s
800:	learn: 0.0013451	total: 20.7s	remaining: 31s
900:	learn: 0.0011646	total: 23s	remaining: 28.1s
1000:	learn: 0.0010906	total: 24.6s	remaining: 24.6s
1100:	learn: 0.0009641	total: 26.8s	remaining: 21.8s
1200:	learn: 0.0008779	total: 28.8s	remaining: 19.2s
1300:	learn: 0.0007984	total: 30.9s	remaining: 16.6s
1400:	learn: 0.0007381	total: 32.9s	remaining: 14.1s
1500:	learn: 0.0007092	total: 34.7s	remaining: 11.5s
1600:	learn: 0.0006525	total: 37.5s	remaining: 9.36s
1700:	learn: 0.0006149	total: 40.6s	remaining: 7.14s
1800:	learn: 0.0005822	total: 49.6s	remaining: 5.48s
1900:	l

<catboost.core.CatBoostClassifier at 0x162eef572c0>

In [87]:
# Test split: hard labels, and column 1 of predict_proba as the score.
y_test_pred, y_test_proba = cat_boost.predict(X_test), cat_boost.predict_proba(X_test)[:, 1]

# Training split: the same two outputs, used to compare against the test scores.
y_train_pred, y_train_proba = cat_boost.predict(X_train), cat_boost.predict_proba(X_train)[:, 1]

In [88]:
# Score the CatBoost model on the training split; four decimals per metric.
metrics = get_metrics(y_train, y_train_pred, y_train_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}_train:{metric_value:.4f}')

accuracy_train:0.9999
precision_train:0.9556
recall_train:1.0000
f1_train:0.9773
roc_auc_train:1.0000


In [89]:
# Score the CatBoost model on the held-out split; four decimals per metric.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
for metric_name, metric_value in metrics.items():
    print(f'{metric_name}:{metric_value:.4f}')

accuracy:0.9995
precision:0.9200
recall:0.7770
f1:0.8425
roc_auc:0.9542


In [90]:
# Per-class precision/recall/F1 for the CatBoost model on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.92      0.78      0.84       148

    accuracy                           1.00     85443
   macro avg       0.96      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443



# Hyperparameter optimization

In [36]:
# Unfitted base estimator handed to the hyperparameter search; training logs are silenced.
# NOTE(review): this rebinds `cat_boost`, the same name used for the fitted
# model earlier in the notebook — when cells run top to bottom, later uses of
# `cat_boost` refer to this unfitted estimator.
cat_boost = CatBoostClassifier(
    loss_function='Logloss',
    random_state=42,
    verbose=0,
)

In [37]:
# Candidate values for each CatBoost hyperparameter explored by the search.
cat_param_grid = dict(
    iterations=list(range(1000, 3001, 500)),
    depth=list(range(6, 11)),
    learning_rate=[0.01, 0.02, 0.03, 0.04, 0.05],
    l2_leaf_reg=[1, 3, 5, 7, 10],
    border_count=[32, 64, 128, 256],
    scale_pos_weight=[3, 5, 7, 10],  # used to balance the classes
)

In [38]:
from sklearn.model_selection import RandomizedSearchCV

In [39]:
# Randomized search: 20 sampled configurations, 3-fold CV, ranked by F1.
# `random_state` is fixed so the same 20 candidates are drawn on every run;
# without it the search result is not reproducible.
search = RandomizedSearchCV(
    cat_boost,
    param_distributions=cat_param_grid,
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

In [40]:
# Run the search on the training split (60 model fits with the settings shown
# in the cell output) and report the winning hyperparameter combination.
search.fit(X_train, y_train)
best_parameters = search.best_params_
print(f'Best parameters for our CatBoost model are: {best_parameters}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [91]:
# Persist a trained CatBoost model. When the notebook runs top to bottom,
# `cat_boost` has been rebound to the unfitted base estimator of the search
# (the search fits clones, not the object itself), and saving it would fail.
# Save `cat_boost` only if it is actually fitted; otherwise fall back to the
# refitted best estimator from the search.
model_to_save = cat_boost if cat_boost.is_fitted() else search.best_estimator_
model_to_save.save_model('catboost_model.cbm')
