In [1]:
# ==========================================
# EXPERIMENT stage 3 – Test models with data augmentation (ADASYN)
# ==========================================

from datetime import date

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

from imblearn.over_sampling import ADASYN

# MODELS
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Load data from CSV file
df = pd.read_csv("creditcard_dataset.csv")
print("Rozkład klas oryginalny:", Counter(df['Class']))

Rozkład klas oryginalny: Counter({0: 284315, 1: 492})


In [3]:
# Data separation to features and labels
X = df.drop('Class', axis=1)
y = df['Class']

In [4]:
# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Rozkład klas w zbiorze treningowym:", Counter(y_train))

Rozkład klas w zbiorze treningowym: Counter({0: 227451, 1: 394})


In [None]:
# 4️⃣ ADASYN
adasyn = ADASYN(random_state=42, n_neighbors=5)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)

print("Rozmiar danych po zastosowaniu ADASYN:")
print("Rozkład klas po SMOTE:", Counter(y_train_res))

found 0 physical cores < 1
  File "c:\Users\lzloc\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Rozmiar danych po zastosowaniu ADASYN:


NameError: name 'y_resampled' is not defined

In [None]:
# Models definition
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42, solver='lbfgs'),
    "Linear SVC": LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)
}

In [None]:
# Training and model validation after ADASYN
results = {}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    print(f"=== Results for model {name} ===")
    print(confusion_matrix(y_test, y_pred))
    report = classification_report(y_test, y_pred, target_names=["Correct", "Frauds"])
    print(report)
    
    results[name] = report


Model: Random Forest
=== Results for model Random Forest ===
[[56847    17]
 [   19    79]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.82      0.81      0.81        98

    accuracy                           1.00     56962
   macro avg       0.91      0.90      0.91     56962
weighted avg       1.00      1.00      1.00     56962


Model: XGBoost
=== Results for model XGBoost ===
[[56844    20]
 [   15    83]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.81      0.85      0.83        98

    accuracy                           1.00     56962
   macro avg       0.90      0.92      0.91     56962
weighted avg       1.00      1.00      1.00     56962


Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


=== Results for model Logistic Regression ===
[[56284   580]
 [    9    89]]
              precision    recall  f1-score   support

     Correct       1.00      0.99      0.99     56864
      Frauds       0.13      0.91      0.23        98

    accuracy                           0.99     56962
   macro avg       0.57      0.95      0.61     56962
weighted avg       1.00      0.99      0.99     56962


Model: Linear SVC
=== Results for model Linear SVC ===
[[55824  1040]
 [    9    89]]
              precision    recall  f1-score   support

     Correct       1.00      0.98      0.99     56864
      Frauds       0.08      0.91      0.15        98

    accuracy                           0.98     56962
   macro avg       0.54      0.94      0.57     56962
weighted avg       1.00      0.98      0.99     56962



In [None]:
# Saving results to file
dataset_description = "ADASYN"
today = date.today()
file_name = "v3_results_" + dataset_description + ".txt"
with open(file_name, "w", encoding="utf-8") as f:
    f.write(f"Report created at : {today}\n")
    f.write(f"Comment           : ADASYN augmentation\n")
    f.write(f"Rozkład klas w zbiorze treningowym (ADASYN): {Counter(y_train_res)}\n")
    for name, report in results.items():
        f.write(f"\n=== {name} ===\n")
        f.write(report)
        f.write("\n" + "="*60 + "\n")

print("\n Experiment with dataset: " + dataset_description + " finished - see results in file: " + file_name)