In [10]:
# ==========================================
# EXPERIMENT check models on delivered dataset
# ==========================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

# MODELS
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [11]:
def set_dataset(method: str):
    """
    Loads the appropriate dataset and sets the dataset description
    based on the specified augmentation method.

    Parameters
    ----------
    method : str
        One of ["original", "smote", "adasyn", "rus", "easyensemble"]

    Returns
    -------
    df : pd.DataFrame
        The loaded dataset.
    dataset_description : str
        A description of the dataset.
    """

    method = method.lower()

    dataset_map = {
        "original": ("original_dataset", "creditcard_dataset.csv"),
        "smote": ("SMOTE_augmentation", "creditcard_dataset_smote.csv"),
        "adasyn": ("ADASYN_augmentation", "creditcard_dataset_adasyn.csv"),
        "rus": ("RUS_augmentation", "creditcard_dataset_rus.csv"),
        "easyensemble": ("EasyEnsemble_augmentation", "creditcard_dataset_easy_ensemble.csv")
    }

    if method not in dataset_map:
        raise ValueError(f"Unknown method '{method}'. Expected one of: {list(dataset_map.keys())}")

    dataset_description, filename = dataset_map[method]
    df = pd.read_csv(filename)

    return df, dataset_description

In [12]:
# Choose dataset by parameter
# original, smote, adasyn, rus, easyensemble

df, dataset_description = set_dataset("original")
print("Loaded: ", dataset_description)

Loaded:  original_dataset


In [13]:
# See classes in dataset
print("Rozkład klas oryginalny:", Counter(df['Class']))

Rozkład klas oryginalny: Counter({0: 284315, 1: 492})


In [14]:
# Data separation to features and labels
X = df.drop('Class', axis=1)
y = df['Class']

In [15]:
# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Rozkład klas w zbiorze treningowym:", Counter(y_train))

Rozkład klas w zbiorze treningowym: Counter({0: 227451, 1: 394})


In [16]:
# Models definition
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42, solver='lbfgs'),
    "Linear SVC": LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)
}

In [17]:
# Training and model validation after SMOTE
results = {}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"=== Results for model {name} ===")
    print(confusion_matrix(y_test, y_pred))
    report = classification_report(y_test, y_pred, target_names=["Correct", "Frauds"])
    print(report)
    
    results[name] = report


Model: Random Forest
=== Results for model Random Forest ===
[[56859     5]
 [   18    80]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962


Model: XGBoost
=== Results for model XGBoost ===
[[56852    12]
 [   20    78]]
              precision    recall  f1-score   support

     Correct       1.00      1.00      1.00     56864
      Frauds       0.87      0.80      0.83        98

    accuracy                           1.00     56962
   macro avg       0.93      0.90      0.91     56962
weighted avg       1.00      1.00      1.00     56962


Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


=== Results for model Logistic Regression ===
[[55435  1429]
 [    8    90]]
              precision    recall  f1-score   support

     Correct       1.00      0.97      0.99     56864
      Frauds       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962


Model: Linear SVC
=== Results for model Linear SVC ===
[[55695  1169]
 [    8    90]]
              precision    recall  f1-score   support

     Correct       1.00      0.98      0.99     56864
      Frauds       0.07      0.92      0.13        98

    accuracy                           0.98     56962
   macro avg       0.54      0.95      0.56     56962
weighted avg       1.00      0.98      0.99     56962



In [21]:
# Saving results to 
file_name = "v2_results_" + dataset_description + ".txt"
with open(file_name, "w", encoding="utf-8") as f:
    for name, report in results.items():
        f.write(f"\n=== {name} ===\n")
        f.write(report)
        f.write("\n" + "="*60 + "\n")

print("\n Experiment with dataset: " + dataset_description + " finished - see results in file: " + file_name)


 Experiment with dataset: original_dataset finished - see results in file: v2_results_original_dataset.txt
