In [21]:
# ==========================================
# EXPERIMENT stage 2 – Test models no data augmentation
# ==========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter

# MODELS
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [22]:
# Load data from CSV file
df = pd.read_csv("creditcard_dataset.csv")
print("Rozkład klas oryginalny:", Counter(df['Class']))

Rozkład klas oryginalny: Counter({0: 284315, 1: 492})


In [23]:
# Data separation to features and labels
X = df.drop('Class', axis=1)
y = df['Class']

In [24]:
# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Rozkład klas w zbiorze treningowym:", Counter(y_train))

Rozkład klas w zbiorze treningowym: Counter({0: 227451, 1: 394})


In [25]:
# Models definition
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42),
    "Linear SVC": LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)
}

In [26]:
# Training and model validation after SMOTE
results = {}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"=== Results for model {name} ===")
    print(confusion_matrix(y_test, y_pred))
    report = classification_report(y_test, y_pred, target_names=["Correct", "Frauds"])
    print(report)
    
    results[name] = report


Model: Random Forest
=== Results for model Random Forest ===
[[56859     5]
 [   18    80]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962


Model: XGBoost
=== Results for model XGBoost ===
[[56852    12]
 [   20    78]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.87      0.80      0.83        98

    accuracy                           1.00     56962
   macro avg       0.93      0.90      0.91     56962
weighted avg       1.00      1.00      1.00     56962


Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


=== Results for model Logistic Regression ===
[[55435  1429]
 [    8    90]]
              precision    recall  f1-score   support

     Correct       1.00      0.97      0.99     56864
      Frauds       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962


Model: Linear SVC
=== Results for model Linear SVC ===
[[55695  1169]
 [    8    90]]
              precision    recall  f1-score   support

     Correct       1.00      0.98      0.99     56864
      Frauds       0.07      0.92      0.13        98

    accuracy                           0.98     56962
   macro avg       0.54      0.95      0.56     56962
weighted avg       1.00      0.98      0.99     56962



In [27]:
# Saving results to file
with open("results_no_augmentation.txt", "w", encoding="utf-8") as f:
    for name, report in results.items():
        f.write(f"\n=== {name} ===\n")
        f.write(report)
        f.write("\n" + "="*60 + "\n")

print("\n Experiment WITHOUT data augmentation finished - see results_no_augmentation.txt")


 Experiment WITHOUT data augmentation finished - see results_no_augmentation.txt
