In [2]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv("../datasets/creditcard_2023.csv")

In [24]:
features = np.array(df.columns[:-1])
label = np.array(df.columns[-1])
data = df.values
X = data[: , :-1]
y = data[: , -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [5]:
kf = KFold(n_splits = 5)

### Старт Ансамбля

In [6]:
def scores(clf, x_tr, y_tr, x_te, y_te):
    clf.fit(x_tr, y_tr)
    pred = clf.predict(x_te)
    print('Precision:',precision_score(y_te, pred))
    print('Recall:',recall_score(y_te, pred))
    print('F1:',f1_score(y_te, pred))
    print('Confusion Matrix (tn, fp, fn, tp):',confusion_matrix(y_te, pred).ravel())

In [7]:
# function to return metafeatures for classifier clf, data x, labels y with kf k-fold
def get_metafeatures(clf, x, y, kf):
    meta_feat = np.zeros((len(x),))
    for train_index, test_index in kf.split(x, y):
        X_tr, X_te = x[train_index], x[test_index]
        y_tr, y_te = y[train_index], y[test_index]
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        meta_feat[test_index] = y_pred
    return meta_feat.reshape(-1, 1)

In [8]:
# best KNN classifier after hyperparameter tuning based on F1 score
b_knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
pred_train_knn = get_metafeatures(b_knn, X_train, y_train, kf)
pred_test_knn = get_metafeatures(b_knn, X_test, y_test, kf)
scores(b_knn, X_train, y_train, X_test, y_test)

found 0 physical cores < 1
  File "C:\Users\Redmi\Documents\Programming\Python\MachineLearning\venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Precision: 0.9980193169838244
Recall: 0.9983807910687844
F1: 0.998200021301523
Confusion Matrix (tn, fp, fn, tp): [93589   186   152 93721]


In [15]:
# best RF classifier after hyperparameter tuning based on F1 score
b_rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=42, verbose=0,
            warm_start=False)
pred_train_rf = get_metafeatures(b_rf, X_train, y_train, kf)
pred_test_rf = get_metafeatures(b_rf, X_test, y_test, kf)
scores(b_rf, X_train, y_train, X_test, y_test)

Precision: 0.9997230330016831
Recall: 0.9997336827415764
F1: 0.9997283578432677
Confusion Matrix (tn, fp, fn, tp): [93749    26    25 93848]


In [18]:
# best XGB classifier after hyperparameter tuning based on F1 score
b_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=0, n_estimators=210,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
pred_train_xgb = get_metafeatures(b_xgb, X_train, y_train, kf)
pred_test_xgb = get_metafeatures(b_xgb, X_test, y_test, kf)
scores(b_xgb, X_train, y_train, X_test, y_test)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



Precision: 0.9997762768201479
Recall: 0.9997017246705655
F1: 0.9997389993554883
Confusion Matrix (tn, fp, fn, tp): [93754    21    28 93845]


In [14]:
# best LR classifier after hyperparameter tuning based on F1 score
b_lr = LogisticRegression(C=0.11, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=78, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
pred_train_lr = get_metafeatures(b_lr, X_train, y_train, kf)
pred_test_lr = get_metafeatures(b_lr, X_test, y_test, kf)
scores(b_lr, X_train, y_train, X_test, y_test)

Precision: 0.9984210470165257
Recall: 0.99693202518296
F1: 0.9976759805123504
Confusion Matrix (tn, fp, fn, tp): [93627   148   288 93585]


In [19]:
X_train = np.hstack((X_train, pred_train_knn))
X_test = np.hstack((X_test, pred_test_knn))
print(np.shape(X_train), np.shape(X_test))

(380982, 31) (187648, 31)


In [20]:
X_train = np.hstack((X_train, pred_train_rf))
X_test = np.hstack((X_test, pred_test_rf))
print(np.shape(X_train), np.shape(X_test))

(380982, 32) (187648, 32)


In [21]:
X_train = np.hstack((X_train, pred_train_xgb))
X_test = np.hstack((X_test, pred_test_xgb))
print(np.shape(X_train), np.shape(X_test))

(380982, 33) (187648, 33)


In [22]:
X_train = np.hstack((X_train, pred_train_lr))
X_test = np.hstack((X_test, pred_test_lr))
print(np.shape(X_train), np.shape(X_test))

(380982, 34) (187648, 34)


In [None]:
###

In [23]:
ens_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=0, n_estimators=210,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
scores(ens_xgb, X_train, y_train, X_test, y_test)

Parameters: { "silent" } are not used.



Precision: 0.9998934515311015
Recall: 0.9996910719802286
F1: 0.9997922515141668
Confusion Matrix (tn, fp, fn, tp): [93765    10    29 93844]
