In [20]:
# Implementation of different ML models
# Random Forest (supervised)
# Gradient boosting - XGBoost Classifier (supervised)
# Linear SVC (supervised)
# Logistic Regression (supervised)

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression

In [22]:
# Read the whole dataset from the CSV file (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer="creditcard_dataset.csv")

In [23]:
# Prepare the modelling sets by separating the labels from the features:
#   y -> the 'Class' column (labels)
#   X -> every remaining column (features)
y = dataset["Class"]
X = dataset.drop("Class", axis=1)

In [24]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so that it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [25]:
# ML MODELS #

In [26]:
# RANDOM FOREST
# Build a 100-tree forest (the number of trees can be adjusted) with a fixed
# seed and fit it on the training split; fit() hands back the fitted estimator.
model_RF = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out test split and compute the overall accuracy
y_pred_RF = model_RF.predict(X_test)
accuracy_RF = accuracy_score(y_test, y_pred_RF)

# Report accuracy, the per-class metrics and the confusion matrix
print(f"Model accuracy: {accuracy_RF:.4f}")
print("\nClassification report:", classification_report(y_test, y_pred_RF), sep="\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_RF))

Model accuracy: 0.9996

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56859     5]
 [   18    80]]


In [27]:
# XGBoost Classifier
# Initialize an XGBoost classifier model:
#   100 boosting rounds, trees of depth <= 6, learning rate 0.1,
#   row subsampling ratio 0.8 and per-tree column subsampling ratio 0.8,
#   with a fixed seed for reproducibility.
model_XGB = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Train the model
model_XGB.fit(X_train, y_train)

# Make predictions
y_pred_XGB = model_XGB.predict(X_test)

# Evaluate performance: overall accuracy, per-class metrics, confusion matrix
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
print(f"Model Accuracy: {accuracy_XGB:.4f}")
print("\nRaport klasyfikacji:")
print(classification_report(y_test, y_pred_XGB))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_XGB))

Dokładność modelu: 0.9996

Raport klasyfikacji:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.90      0.84      0.87        98

    accuracy                           1.00     56962
   macro avg       0.95      0.92      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56855     9]
 [   16    82]]


In [28]:
# Linear SVC
# Scale features for better performance (important for SVM)
scaler = StandardScaler()

# Initialize the model as a pipeline: standardization followed by a Linear SVC
model_Lsvc = make_pipeline(scaler, LinearSVC(C=1.0, max_iter=1000, random_state=42))

# Train the model (the scaler is fitted on the training split only)
model_Lsvc.fit(X_train, y_train)

# Make predictions
y_pred_Lsvc = model_Lsvc.predict(X_test)

# Evaluate performance
accuracy_Lsvc = accuracy_score(y_test, y_pred_Lsvc)
# Four decimals, matching the Random Forest / XGBoost cells: with so few
# positive samples, two decimals round the accuracy up to 1.00 and hide errors.
print(f"Model Accuracy: {accuracy_Lsvc:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_Lsvc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_Lsvc))

Model Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.59      0.69        98

    accuracy                           1.00     56962
   macro avg       0.91      0.80      0.85     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56852    12]
 [   40    58]]


In [29]:
# Logistic Regression
# Scale features for better performance
scaler = StandardScaler()

# Initialize the model as a pipeline: standardization followed by a logistic
# regression that uses the liblinear solver and balanced class weights
model_LR = make_pipeline(scaler, LogisticRegression(solver="liblinear", class_weight="balanced", random_state=42))

# Train the model (the scaler is fitted on the training split only)
model_LR.fit(X_train, y_train)

# Make predictions
y_pred_LR = model_LR.predict(X_test)

# Evaluate performance
accuracy_LR = accuracy_score(y_test, y_pred_LR)
# Four decimals, matching the Random Forest / XGBoost cells, so the accuracies
# of all models can be compared at the same precision.
print(f"Model Accuracy: {accuracy_LR:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_LR))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_LR))

Model Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

Confusion Matrix:
 [[55479  1385]
 [    8    90]]
