In [34]:
# ==========================================
# EXPERIMENT stage 2 – Test models with data augmentation (Random Undersampling Implementation, RUS)
# ==========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

# MODELS
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [35]:
# Load data from CSV file
df = pd.read_csv("creditcard_dataset.csv")
print("Rozkład klas oryginalny:", Counter(df['Class']))

Rozkład klas oryginalny: Counter({0: 284315, 1: 492})


In [36]:
# Data separation to features and labels
X = df.drop('Class', axis=1)
y = df['Class']

In [37]:
# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Rozkład klas w zbiorze treningowym:", Counter(y_train))

Rozkład klas w zbiorze treningowym: Counter({0: 227451, 1: 394})


In [38]:
# Random Undersampling Implementation, RUS
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X, y)

print("\nRozkład klas po undersamplingu:")
print(y_train_res.value_counts())


Rozkład klas po undersamplingu:
Class
0    492
1    492
Name: count, dtype: int64


In [39]:
# Models definition
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=10000, solver='saga',class_weight='balanced', random_state=42),
    "Linear SVC": LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)
}

In [40]:
# Training and model validation after SMOTE
results = {}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    print(f"=== Results for model {name} ===")
    print(confusion_matrix(y_test, y_pred))
    report = classification_report(y_test, y_pred, target_names=["Correct", "Frauds"])
    print(report)
    
    results[name] = report


Model: Random Forest
=== Results for model Random Forest ===
[[55465  1399]
 [    0    98]]
              precision    recall  f1-score   support

     Correct       1.00      0.98      0.99     56864
      Frauds       0.07      1.00      0.12        98

    accuracy                           0.98     56962
   macro avg       0.53      0.99      0.56     56962
weighted avg       1.00      0.98      0.99     56962


Model: XGBoost
=== Results for model XGBoost ===
[[54929  1935]
 [    0    98]]
              precision    recall  f1-score   support

     Correct       1.00      0.97      0.98     56864
      Frauds       0.05      1.00      0.09        98

    accuracy                           0.97     56962
   macro avg       0.52      0.98      0.54     56962
weighted avg       1.00      0.97      0.98     56962


Model: Logistic Regression
=== Results for model Logistic Regression ===
[[49030  7834]
 [   56    42]]
              precision    recall  f1-score   support

     Correct

In [41]:
# Saving results to file
with open("results_RUS.txt", "w", encoding="utf-8") as f:
    for name, report in results.items():
        f.write(f"\n=== {name} ===\n")
        f.write(report)
        f.write("\n" + "="*60 + "\n")

print("\n✅ Experiment with RUS data augmentation finished - see results_RUS.txt")


✅ Experiment with RUS data augmentation finished - see results_RUS.txt
