In [67]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    recall_score, precision_score, f1_score
)
from scipy.stats import loguniform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint

In [68]:
df = pd.read_csv("creditcard.csv")


In [69]:
df.shape

(284807, 31)

In [70]:
df = df.sample(n = 100000, random_state = 42, replace = False)


In [71]:
X = df.drop(columns=['Class'])
y = df['Class'].astype(int)

In [72]:
# ---- 2) Podział train/test (zachowujemy proporcje klas) ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [73]:
oversample = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)


In [74]:
results = []
def evaluation(model, X_train, X_test, y_train, y_test):
    # Predykcje
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("=" * 50)
    print("CONFUSION MATRIX - TRAIN")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))
    print("Accuracy of train data:", round(accuracy_score(y_train, y_train_pred), 4))
    print("F1-score of train data:", round(f1_score(y_train, y_train_pred, average='macro', zero_division=0), 4))
    print("=" * 50)

    print("CONFUSION MATRIX - TEST")
    print(confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='macro', zero_division=0)
    precision = precision_score(y_test, y_test_pred, zero_division=0)
    recall = recall_score(y_test, y_test_pred, zero_division=0)

    print("Accuracy of test data:", round(acc, 4))
    print("F1-score of test data:", round(f1, 4))
    print("Precision of test data:", round(precision, 4))
    print("Recall of test data:", round(recall, 4))
    print("=" * 50)

    results.append({
    "Model": model,
    "Precision": precision,
    "Recall": recall,
    "Accuracy": acc,
    "F1": f1

    })


    # Zwróć metryki
    return {
        "Model": model.__class__.__name__,
        "Precision": precision,
        "Recall": recall,
        "Accuracy": acc,
        "F1": f1,
    }


In [47]:
random_search = RandomizedSearchCV(
    LogisticRegression(max_iter=1500, solver='saga'),
    param_distributions={
        'C': loguniform(0.001, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'class_weight': [None, 'balanced'],
        'l1_ratio': [0, 0.5, 1]
    },
    n_iter=30,
    scoring='recall',
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_resampled, y_train_resampled)
print("Best parameters:", random_search.best_params_)

Najlepsze parametry: {'C': np.float64(0.2537815508265665), 'class_weight': 'balanced', 'l1_ratio': 1, 'penalty': 'l2'}


In [78]:
best_lr = Pipeline([
    ("lr", LogisticRegression(
        C=0.25378,                 # z wyniku
        penalty="l2",
        class_weight="balanced",
        solver="lbfgs",            # szybki i poprawny dla L2
        max_iter=800,
        random_state=42
    ))
])
best_lr.fit(X_train, y_train)

0,1,2
,steps,"[('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.25378
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,800


In [79]:
evaluation(best_lr, X_train, X_test, y_train, y_test)


CONFUSION MATRIX - TRAIN
[[77357  2514]
 [   12   117]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     79871
           1       0.04      0.91      0.08       129

    accuracy                           0.97     80000
   macro avg       0.52      0.94      0.53     80000
weighted avg       1.00      0.97      0.98     80000

Accuracy of train data: 0.9684
F1-score of train data: 0.5344
CONFUSION MATRIX - TEST
[[19339   629]
 [    1    31]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     19968
           1       0.05      0.97      0.09        32

    accuracy                           0.97     20000
   macro avg       0.52      0.97      0.54     20000
weighted avg       1.00      0.97      0.98     20000

Accuracy of test data: 0.9685
F1-score of test data: 0.5368
Precision of test data: 0.047
Recall of test data: 0.9688


{'Model': 'Pipeline',
 'Precision': 0.04696969696969697,
 'Recall': 0.96875,
 'Accuracy': 0.9685,
 'F1': 0.5367840519601829}

In [65]:
# --- KNN + SMOTE + StandardScaler + GridSearchCV (bez leaku) ---

# Pipeline: SMOTE (tylko na foldach train) -> KNN
knn_pipe = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42, sampling_strategy=0.1)),
    ("knn", KNeighborsClassifier())
])

# Siatka hiperparametrów
knn_param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11, 13],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["minkowski"],
    "knn__p": [1, 2],
}

# Walidacja skratyfikowana
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Kilka metryk; model końcowy refit po PR-AUC
scoring = {
    "pr_auc": "average_precision",   # Precision-Recall AUC
    "recall": "recall",
    "precision": "precision",
    "f1": "f1",
    "roc_auc": "roc_auc"
}

knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_param_grid,
    scoring=scoring,
    refit="recall",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Uczymy na oryginalnym train (SMOTE wykona się wewnątrz foldów)
knn_grid.fit(X_train, y_train)

best_knn = knn_grid.best_estimator_
print("Najlepsze parametry:", knn_grid.best_params_)
print("CV best PR-AUC:", knn_grid.best_score_)
bi = knn_grid.best_index_
cvres = knn_grid.cv_results_
print("CV (dla best): Recall={:.4f}, F1={:.4f}, Precision={:.4f}, ROC-AUC={:.4f}".format(
    cvres["mean_test_recall"][bi],
    cvres["mean_test_f1"][bi],
    cvres["mean_test_precision"][bi],
    cvres["mean_test_roc_auc"][bi],
))



Fitting 5 folds for each of 24 candidates, totalling 120 fits
Najlepsze parametry: {'knn__metric': 'minkowski', 'knn__n_neighbors': 3, 'knn__p': 1, 'knn__weights': 'uniform'}
CV best PR-AUC: 0.4036923076923077
CV (dla best): Recall=0.4037, F1=0.1377, Precision=0.0833, ROC-AUC=0.7392


In [80]:
# Ewaluacja na teście
evaluation(best_knn, X_train, X_test, y_train, y_test)


CONFUSION MATRIX - TRAIN
[[79600   271]
 [    3   126]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     79871
           1       0.32      0.98      0.48       129

    accuracy                           1.00     80000
   macro avg       0.66      0.99      0.74     80000
weighted avg       1.00      1.00      1.00     80000

Accuracy of train data: 0.9966
F1-score of train data: 0.7387
CONFUSION MATRIX - TEST
[[19821   147]
 [   20    12]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     19968
           1       0.08      0.38      0.13        32

    accuracy                           0.99     20000
   macro avg       0.54      0.68      0.56     20000
weighted avg       1.00      0.99      0.99     20000

Accuracy of test data: 0.9917
F1-score of test data: 0.5607
Precision of test data: 0.0755
Recall of test data: 0.375


{'Model': 'Pipeline',
 'Precision': 0.07547169811320754,
 'Recall': 0.375,
 'Accuracy': 0.99165,
 'F1': 0.5607297094937226}

In [58]:
# === Decision Tree + SMOTE + RandomizedSearchCV (Recall-focused, fast) ===

# Pipeline: SMOTE -> DecisionTree
dt_pipe = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42, sampling_strategy=0.1)),
    ("dt", DecisionTreeClassifier(random_state=42))
])

# Losowe rozkłady hiperparametrów (sensowne zakresy, szybkie przeszukiwanie)
param_dist = {
    "dt__criterion": ["gini", "entropy", "log_loss"],
    "dt__max_depth": randint(4, 20),          # ograniczamy przeuczenie i czas
    "dt__min_samples_split": randint(2, 25),
    "dt__min_samples_leaf": randint(1, 15),
    "dt__max_features": [None, "sqrt", "log2"],
    "dt__class_weight": [None, "balanced"],
}

# CV i metryki – optymalizujemy pod Recall (minimizacja FN),
# ale logujemy też PR-AUC, F1, Precision, ROC-AUC
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    "recall": "recall",
    "pr_auc": "average_precision",
    "f1": "f1",
    "precision": "precision",
    "roc_auc": "roc_auc",
}

dt_random = RandomizedSearchCV(
    estimator=dt_pipe,
    param_distributions=param_dist,
    n_iter=50,
    scoring=scoring,
    refit="recall",            # model końcowy: maksymalny Recall
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Trenuj na ORYGINALNYM train (SMOTE wewnątrz foldów)
dt_random.fit(X_train, y_train)

best_dtc = dt_random.best_estimator_
print("Najlepsze parametry:", dt_random.best_params_)
print("CV best Recall:", dt_random.best_score_)

# Pokaż metryki CV dla najlepszego
bi = dt_random.best_index_
cvres = dt_random.cv_results_
print("CV (best): PR-AUC={:.4f} | F1={:.4f} | Precision={:.4f} | ROC-AUC={:.4f}".format(
    cvres["mean_test_pr_auc"][bi],
    cvres["mean_test_f1"][bi],
    cvres["mean_test_precision"][bi],
    cvres["mean_test_roc_auc"][bi],
))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Najlepsze parametry: {'dt__class_weight': 'balanced', 'dt__criterion': 'entropy', 'dt__max_depth': 9, 'dt__max_features': 'sqrt', 'dt__min_samples_leaf': 12, 'dt__min_samples_split': 22}
CV best Recall: 0.8529230769230768
CV (best): PR-AUC=0.6756 | F1=0.1259 | Precision=0.0682 | ROC-AUC=0.9265


In [81]:
# Ewaluacja na TEŚCIE
evaluation(best_dtc, X_train, X_test, y_train, y_test)

CONFUSION MATRIX - TRAIN
[[78383  1488]
 [    4   125]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     79871
           1       0.08      0.97      0.14       129

    accuracy                           0.98     80000
   macro avg       0.54      0.98      0.57     80000
weighted avg       1.00      0.98      0.99     80000

Accuracy of train data: 0.9813
F1-score of train data: 0.567
CONFUSION MATRIX - TEST
[[19579   389]
 [    4    28]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     19968
           1       0.07      0.88      0.12        32

    accuracy                           0.98     20000
   macro avg       0.53      0.93      0.56     20000
weighted avg       1.00      0.98      0.99     20000

Accuracy of test data: 0.9804
F1-score of test data: 0.5574
Precision of test data: 0.0671
Recall of test data: 0.875


{'Model': 'Pipeline',
 'Precision': 0.0671462829736211,
 'Recall': 0.875,
 'Accuracy': 0.98035,
 'F1': 0.5573925329642611}

In [82]:
results_df = pd.DataFrame(results).sort_values(by="Recall", ascending=False)

In [83]:
round(results_df, 10 )

Unnamed: 0,Model,Precision,Recall,Accuracy,F1
1,"(LogisticRegression(C=0.25378, class_weight='b...",0.04697,0.96875,0.9685,0.536784
3,"(SMOTE(random_state=42, sampling_strategy=0.1)...",0.067146,0.875,0.98035,0.557393
0,"(SMOTE(random_state=42, sampling_strategy=0.1)...",0.075472,0.375,0.99165,0.56073
2,"(SMOTE(random_state=42, sampling_strategy=0.1)...",0.075472,0.375,0.99165,0.56073


In [38]:
# Podsumowanie:
# Model o najwyższej wartości Recall wykrywa największą liczbę rzeczywistych przypadków oszustw (fraudów),
# nawet kosztem większej liczby fałszywych alarmów (False Positives).
# Ponieważ w tym zadaniu priorytetem jest minimalizacja liczby pominiętych fraudów (False Negatives),
# kluczową metryką oceny modeli jest właśnie Recall.
# Wyniki modeli zostały posortowane według tej miary (malejąco),
# dlatego jako model końcowy należy wybrać ten, który znajduje się na pierwszym miejscu w tabeli wyników.
