Data from: https://www.kaggle.com/datasets/whenamancodes/fraud-detection

In [2]:
import numpy as np
import pandas as pd

df_train = pd.read_csv("../data/creditcard_0_train.csv")
df_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1387.0,-0.247635,0.367930,0.853269,-0.727001,-0.107909,-0.220377,1.064465,-0.000631,-0.751445,...,-0.268825,-1.040942,0.556685,0.004071,-0.876036,0.444537,-0.065031,0.024997,126.85,0
1,16632.0,1.125704,0.060368,0.693077,1.418097,-0.218836,0.469390,-0.461809,0.268230,1.777269,...,-0.385354,-0.789926,0.055323,-0.434480,0.339990,-0.528869,0.010824,0.000923,8.62,0
2,205.0,1.182399,0.059489,0.310673,0.828817,0.341209,1.112593,-0.292778,0.287394,0.268307,...,-0.337818,-0.776840,-0.056486,-1.356986,0.475790,-0.546519,0.063035,0.008047,12.99,0
3,5164.0,-0.915487,-3.399440,-1.126437,1.612170,-1.190341,-0.070311,1.488157,-0.434074,1.182271,...,0.487133,-0.819884,-1.135484,0.044055,0.218116,0.401155,-0.300773,0.175775,1129.10,0
4,3377.0,-0.347934,0.633134,0.541116,-2.526809,1.148422,0.087046,1.016614,-0.114287,0.546241,...,0.059253,0.594505,-0.456344,-1.315068,0.177886,-0.745462,0.254809,-0.039400,1.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9867,20658.0,1.061662,-0.465585,1.226568,0.253467,-0.753072,0.919268,-1.025993,0.427126,2.303447,...,-0.275893,-0.305272,0.136545,-0.275365,-0.086951,0.945988,-0.036805,-0.008126,20.00,0
9868,4949.0,1.031677,-0.006787,0.568099,1.583664,-0.348881,0.030754,-0.214084,0.147572,1.621096,...,-0.187458,-0.287500,-0.047609,0.114755,0.495479,-0.377993,-0.011818,0.003753,48.31,0
9869,5345.0,-0.914827,0.834317,2.130190,0.935591,-0.460889,0.029705,0.331374,-0.503894,1.797079,...,0.010262,0.640675,-0.307134,0.445520,-0.420362,0.451446,-0.660535,-0.044895,59.90,0
9870,654.0,-0.833568,0.606174,-0.051329,-2.091447,0.968764,-0.030220,0.887288,0.099009,0.834622,...,0.320782,1.239559,0.192074,-0.621025,-0.843584,-0.835690,0.164384,0.274361,9.90,0


In [3]:
df_train.groupby("Class").agg(count=("V1", "count"))

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,9823
1,49


In [4]:
from imblearn.combine import SMOTEENN

X_train = df_train.drop(columns=["Time", "Class"])
y_train = df_train["Class"]
X_resampled, y_resampled = SMOTEENN(random_state=42).fit_resample(X_train, y_train)

X_resampled.shape, y_resampled.shape

((19497, 29), (19497,))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

param_grids = [
    {
        "algorithm": LogisticRegression(), 
        "params": [
            {
                "penalty": ["l1"],
                "solver": ["liblinear", "saga"],
                "C": np.logspace(-4, 4, 5)
            },
            {
                "penalty": ["l2"],
                "solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
                "C": np.logspace(-4, 4, 5),
            },
            {
                "penalty": ["elasticnet"],
                "solver": ["saga"],
                "l1_ratio": [0.1, 0.5, 0.9], 
                "C": np.logspace(-4, 4, 5),
            }
        ]
    }, 
    {
        "algorithm": DecisionTreeClassifier(), 
        "params": [{
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"],
            "max_depth": [10, 30, 50],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "max_features": ["sqrt", "log2"],
            "random_state": [42]
        }]
    }, 
    {
        "algorithm": RandomForestClassifier(), 
        "params": [{
            "n_estimators": [100, 200, 500],
            "max_depth": [10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }]
    }, 
    {
        "algorithm": SVC(), 
        "params": [
            {"kernel": ["linear"], "C": [0.1, 1, 10, 100]},
            {"kernel": ["rbf"], "C": [0.1, 1, 10, 100], "gamma": [1, 0.1, 0.01, 0.001, 0.0001]},
            {"kernel": ["poly"], "C": [0.1, 1, 10, 100], "degree": [2, 3, 4], "gamma": ["scale"]}
        ]
    }, 
    {
        "algorithm": GradientBoostingClassifier(), 
        "params": [{
            "n_estimators": [50, 100, 200], 
            "learning_rate": [0.01, 0.1, 0.2], 
            "max_depth": [3, 4, 5], 
            "min_samples_split": [2, 5, 10], 
            "min_samples_leaf": [1, 2, 4], 
            "subsample": [0.8, 1.0], 
            "max_features": ["sqrt", None]
        }]
    }, 
    {
        "algorithm": XGBClassifier(), 
        "params": [{
            "n_estimators": [100, 200, 500],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 4, 5, 6],
            "min_child_weight": [1, 5, 10],
            "gamma": [0.5, 1, 2],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
            "reg_alpha": [0, 0.1, 0.5, 1], 
            "reg_lambda": [0, 0.1, 0.5, 1]
        }]
    }
]

In [7]:
from itertools import product
from sklearn.model_selection import cross_validate

for pg in param_grids:
    for p in pg["params"]:
        keys = p.keys()
        values = p.values()
        product_values = list(product(*values))
        for pv in product_values:
            grid = dict(zip(keys, pv))
            mdinst = pg["algorithm"].set_params(**grid)
            params = {"model_algorithm": mdinst.__class__.__name__}
            params.update(mdinst.get_params())

            cv_results = cross_validate(mdinst, X_resampled, y_resampled, scoring=["roc_auc", "accuracy", "precision", "recall", "f1"], cv=3)
            metrics = {k: v.mean() for k, v in cv_results.items()}
            print(metrics)

{'fit_time': np.float64(0.23224457105000815), 'score_time': np.float64(0.012665828069051107), 'test_roc_auc': np.float64(0.9996361477115282), 'test_accuracy': np.float64(0.9979484023183054), 'test_precision': np.float64(0.9986667913462504), 'test_recall': np.float64(0.9972341733251383), 'test_f1': np.float64(0.9979496952534804)}
{'fit_time': np.float64(0.15740116437276205), 'score_time': np.float64(0.01294271151224772), 'test_roc_auc': np.float64(0.9996388941239086), 'test_accuracy': np.float64(0.9979996922603478), 'test_precision': np.float64(0.9987694185455912), 'test_recall': np.float64(0.9972341733251383), 'test_f1': np.float64(0.9980008747248785)}
{'fit_time': np.float64(0.2044849395751953), 'score_time': np.float64(0.011842489242553711), 'test_roc_auc': np.float64(0.9996367790706961), 'test_accuracy': np.float64(0.9979996922603478), 'test_precision': np.float64(0.9987694185455912), 'test_recall': np.float64(0.9972341733251383), 'test_f1': np.float64(0.9980008747248785)}
{'fit_tim

KeyboardInterrupt: 

In [18]:
df_test = pd.read_csv("../data/creditcard_0_test.csv")
X_test = df_test.drop(columns=["Time", "Class"])
y_test = df_test["Class"]
y_pred = LogisticRegression().fit(X_resampled, y_resampled).predict(X_test)

y_test, y_pred

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0       0
 1       0
 2       0
 3       0
 4       0
        ..
 2463    0
 2464    0
 2465    0
 2466    0
 2467    0
 Name: Class, Length: 2468, dtype: int64,
 array([0, 0, 0, ..., 0, 0, 0], shape=(2468,)))