## prepro v1.5

In [1]:

import numpy as np # linear algebra
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Librariest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, auc, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedShuffleSplit


2025-03-13 11:56:50.571500: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data1 = pd.read_csv('../raw_data/creditcard.csv')


In [3]:
df = data1.copy()
df['Hour'] = (df['Time'] // 3600) % 24

In [4]:
# Separate features and target variable
X = df.drop(columns=['Class'])
y = df['Class']

# Split data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(sampling_strategy=0.2, random_state=42)  # Adjust ratio if needed
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [5]:
# Initialize RobustScaler
scaler = RobustScaler()

# Apply scaling only to 'Time' and 'Amount'
X_train_smote[['Time', 'Amount']] = scaler.fit_transform(X_train_smote[['Time', 'Amount']])
X_test[['Time', 'Amount']] = scaler.transform(X_test[['Time', 'Amount']])

In [6]:
# Log transform the 'Amount' column to reduce skewness
X_train_smote['Log_Amount'] = np.log1p(X_train_smote['Amount'])
X_test['Log_Amount'] = np.log1p(X_test['Amount'])

# Drop the original 'Amount' column if needed
X_train_smote.drop(columns=['Amount'], inplace=True)
X_test.drop(columns=['Amount'], inplace=True)


In [7]:

# Apply cyclical transformation
X_train_smote["Hour_sin"] = np.sin(2 * np.pi * X_train_smote["Hour"] / 24)
X_train_smote["Hour_cos"] = np.cos(2 * np.pi * X_train_smote["Hour"] / 24)

X_test["Hour_sin"] = np.sin(2 * np.pi * X_test["Hour"] / 24)
X_test["Hour_cos"] = np.cos(2 * np.pi * X_test["Hour"] / 24)

In [8]:
X_train_smote.drop(columns=["Hour"], inplace=True)
X_test.drop(columns=["Hour"], inplace=True)


In [9]:
X_train_smote['Class'] = y_train_smote
# Compute correlation matrix
correlation_matrix = X_train_smote.corr()

In [10]:
corr = X_train_smote.corr()['Class'].sort_values(ascending=False)


In [11]:
low_corr_features = ['V26', 'V22', 'V25', 'V23', 'V13', 'Time']
X_train_smote.drop(columns=low_corr_features, inplace=True)
X_test.drop(columns=low_corr_features, inplace=True)

In [12]:

# Compute the absolute correlation with the target column
target_corr = correlation_matrix['Class'].abs()

# Select upper triangle of correlation matrix to avoid redundancy
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Find pairs of features with correlation greater than 0.85
high_corr_pairs = []
for column in upper.columns:
    high_corr_pairs += [(column, other) for other in upper.index if upper[column][other] > 0.85]

# For each pair of highly correlated features, drop the one with lower correlation to the target
columns_to_drop = []
for feature1, feature2 in high_corr_pairs:
    if abs(target_corr[feature1]) < abs(target_corr[feature2]):
        columns_to_drop.append(feature1)
    else:
        columns_to_drop.append(feature2)

# Drop the selected columns from X_train_smote
X_train_smote.drop(columns=columns_to_drop, inplace=True)
X_train_smote.drop(columns=['Class'], inplace=True)
X_test.drop(columns=columns_to_drop, inplace=True)



In [39]:
# Model LogisticRegression PreproV1.5

In [17]:
from sklearn.linear_model import LogisticRegression

# Train the model
model_logreg_prepro15 = LogisticRegression(
    class_weight='balanced',  # Handle imbalance class_weight='balanced'    automatically compensates for class imbalance.

    max_iter=1000,
    random_state=42
)
model_logreg_prepro15.fit(X_train_smote, y_train_smote)

In [13]:
X_train_smote.shape

(272941, 23)

In [14]:
X_test.shape

(56962, 23)

In [24]:
#  Compute Recallfrom
from sklearn.metrics import recall_score, precision_recall_curve, auc

# Predict labels
y_pred = model_logreg_prepro15.predict(X_test)
recall_logreg_prepro15 = recall_score(y_test, y_pred)

# Compute PR AUC (Precision-Recall AUC)
y_probs = model_logreg_prepro15.predict_proba(X_test)[:, 1]
precision, recall_curve, _ = precision_recall_curve(y_test, y_probs)
pr_auc = auc(recall_curve, precision)

# Print results
print(f"Recall(LogReg V1.5): {recall_logreg_prepro15:.4f}")  # Исправлено
print(f"PR AUC(LogReg V1.5): {pr_auc:.4f}")

Recall(LogReg V1.5): 0.9082
PR AUC(LogReg V1.5): 0.7220


In [21]:
# Create the XGBoost model PrePro V15
import xgboost as xgb

# Create the XGBoost model
model_xgb_preprov15 = xgb.XGBClassifier(
    objective="binary:logistic",  # This is for binary classification (yes/no)
    scale_pos_weight=len(y_train_smote[y_train_smote == 0]) / len(y_train_smote[y_train_smote == 1]),  
    # This helps when we have more examples of one class than the other (class imbalance)
    
    eval_metric="logloss",  # This checks how good the model is (lower is better)
    random_state=42,  # This makes sure we get the same results every time we run the model
    use_label_encoder=False  # This removes a warning message
)

# Train (fit) the model with the training data
model_xgb_preprov15.fit(X_train_smote, y_train_smote)


In [23]:
# Make predictions on the test data
y_pred_xgb = model_xgb_preprov15.predict(X_test)

# Calculate Recall
recall_xgb = recall_score(y_test, y_pred_xgb)
# Recall tells us how many positive cases we found correctly

# Get probabilities for PR AUC calculation
y_probs_xgb = model_xgb_preprov15.predict_proba(X_test)[:, 1]
# predict_proba() gives probabilities for both classes
# [:, 1] means we take only the probability for the positive class (1)

# Calculate Precision-Recall Curve
precision, recall_curve, _ = precision_recall_curve(y_test, y_probs_xgb)

# Compute PR AUC (Area Under the Precision-Recall Curve)
pr_auc_xgb = auc(recall_curve, precision)

# Print results
print(f"Recall (XGBoost V1.5): {recall_xgb:.4f}")
# Shows the recall score (higher is better)

print(f"PR AUC (XGBoost V1.5): {pr_auc_xgb:.4f}")
# Shows PR AUC (higher is better, closer to 1.0)

Recall (XGBoost V1.5): 0.8571
PR AUC (XGBoost V1.5): 0.8736


In [31]:
#Random Forest Model for Classification

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_recall_curve, auc

# Create the model
model_rf_prepro15 = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    class_weight="balanced",  # Adjust for class imbalance
    random_state=42  # Ensure reproducibility
)

# Train the model
model_rf_prepro15.fit(X_train_smote, y_train_smote)

# Predict on test data
y_pred_rf_prepro15 = model_rf_prepro15.predict(X_test)

# Compute recall
recall_rf_prepro15 = recall_score(y_test, y_pred_rf_prepro15)

# Compute PR AUC
y_probs_rf_prepro15 = model_rf_prepro15.predict_proba(X_test)[:, 1]
precision, recall_curve, _ = precision_recall_curve(y_test, y_probs_rf_prepro15)
pr_auc_rf_prepro15 = auc(recall_curve, precision)

# Print results
print(f"Recall (Random Forest prepro15): {recall_rf_prepro15:.4f}")
print(f"PR AUC (Random Forest prepro15): {pr_auc_rf_prepro15:.4f}")


Recall (Random Forest prepro15): 0.8061
PR AUC (Random Forest prepro15): 0.8682


In [27]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import recall_score, precision_recall_curve, auc
import numpy as np

# Create the TabNet model with name prepro15
model_tabnet_prepro15 = TabNetClassifier(
    optimizer_params=dict(lr=0.02),  # Learning rate (how fast the model learns)
    seed=42  # Fix the random seed for the same results every time
)

# Convert data to NumPy (TabNet works with NumPy, not Pandas)
X_train_np = X_train_smote.to_numpy()
y_train_np = y_train_smote.to_numpy().ravel()  # Fix the shape issue
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy().ravel()  # Fix the shape issue

# Train the model
model_tabnet_prepro15.fit(
    X_train_np, y_train_np,
    eval_set=[(X_test_np, y_test_np)],  # Check model performance on test data
    eval_metric=['logloss'],  # Use log loss to check errors
    max_epochs=5,  # Train for 100 rounds
    patience=5,  # Stop early if no improvement for 10 rounds
    batch_size=1024,  # Number of examples in each training step
    virtual_batch_size=128,  # For faster learning
    num_workers=0  # Number of CPU cores used (0 means auto)
)

epoch 0  | loss: 0.10512 | val_0_logloss: 0.02136 |  0:00:26s
epoch 1  | loss: 0.04604 | val_0_logloss: 0.01717 |  0:00:55s
epoch 2  | loss: 0.03811 | val_0_logloss: 0.02178 |  0:01:16s
epoch 3  | loss: 0.03072 | val_0_logloss: 0.01046 |  0:01:43s
epoch 4  | loss: 0.02583 | val_0_logloss: 0.01754 |  0:02:10s
Stop training because you reached max_epochs = 5 with best_epoch = 3 and best_val_0_logloss = 0.01046


In [None]:
# Make predictions on test data
y_pred_tabnet_prepro15 = model_tabnet_prepro15.predict(X_test_np)

# Calculate Recall
recall_tabnet_prepro15 = recall_score(y_test_np, y_pred_tabnet_prepro15)
# Recall tells us how many positive cases we found correctly

# Get probabilities for PR AUC calculation
y_probs_tabnet_prepro15 = model_tabnet_prepro15.predict_proba(X_test_np)[:, 1]
# predict_proba() gives probabilities for both classes
# [:, 1] means we take only the probability for the positive class (1)

# Calculate Precision-Recall Curve
precision, recall_curve, _ = precision_recall_curve(y_test_np, y_probs_tabnet_prepro15)

# Compute PR AUC (Area Under the Precision-Recall Curve)
pr_auc_tabnet_prepro15 = auc(recall_curve, precision)

# Print results
print(f"Recall (TabNet prepro15): {recall_tabnet_prepro15:.4f}")
print(f"PR AUC (TabNet prepro15): {pr_auc_tabnet_prepro15:.4f}")


Recall (TabNet prepro15): 0.8776
PR AUC (TabNet prepro15): 0.8058


In [33]:
print(f"Recall(LogReg V1.5): {recall_logreg_prepro15:.4f}")  # Исправлено
print(f"PR AUC(LogReg V1.5): {pr_auc:.4f}")
print(f"Recall (XGBoost V1.5): {recall_xgb:.4f}")
print(f"PR AUC (XGBoost V1.5): {pr_auc_xgb:.4f}")
print(f"Recall (Random Forest prepro15): {recall_rf_prepro15:.4f}")
print(f"PR AUC (Random Forest prepro15): {pr_auc_rf_prepro15:.4f}")

print(f"Recall (TabNet prepro15): {recall_tabnet_prepro15:.4f}")
print(f"PR AUC (TabNet prepro15): {pr_auc_tabnet_prepro15:.4f}")



Recall(LogReg V1.5): 0.9082
PR AUC(LogReg V1.5): 0.7220
Recall (XGBoost V1.5): 0.8571
PR AUC (XGBoost V1.5): 0.8736
Recall (Random Forest prepro15): 0.8061
PR AUC (Random Forest prepro15): 0.8682
Recall (TabNet prepro15): 0.8776
PR AUC (TabNet prepro15): 0.8058
