In [2]:

import numpy as np # linear algebra
import pandas as pd


# Classifier Libraries
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, auc, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedShuffleSplit


In [12]:
from anomguard.ml_logic.preprocessing  import preprocessing_smote

In [8]:
# Read the raw credit-card CSV into a DataFrame (path is relative to the notebook's working directory).
data1 = pd.read_csv(filepath_or_buffer='../raw_data/creditcard.csv')

In [10]:
# Work on an independent copy so that columns added to `df` later do not
# silently appear on the raw frame `data1` as well; the cell can then be
# re-run from `data1` and always start from the same state.
df = data1.copy()

In [28]:
# Derive an hour-of-day feature: whole multiples of 3600 in 'Time', wrapped to the range 0-23.
# NOTE(review): this assumes 'Time' is measured in seconds - confirm against the dataset description.
df['Hour'] = df['Time'].floordiv(3600).mod(24)

In [32]:
# Separate features and target variable
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set. Stratifying on y preserves the class
# proportions in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [68]:
# Shape checks. Only the last expression of a cell is echoed, so the value of
# the first line is evaluated but not shown.
# NOTE(review): X_train_transformed is not assigned in any visible cell -
# presumably it is produced by preprocessing_smote; confirm.
X_train_transformed.shape
y_test.shape

(56962,)

In [69]:
# Shape (rows, columns) of X_test_transformed.
# NOTE(review): X_test_transformed is not assigned in any visible cell - confirm where it is created.
X_test_transformed.shape

(56962, 32)

In [70]:
# Shape of the training labels that the models below are fitted on.
# NOTE(review): y_train_smote is not assigned in any visible cell - confirm where it is created.
y_train_smote.shape

(272941,)

In [38]:
# Shape (rows, columns) of the training features that the models below are fitted on.
X_train_transformed.shape

(272941, 32)

In [72]:
# Shape (rows, columns) of the test features exactly as returned by train_test_split.
X_test.shape

(56962, 31)

In [73]:
# Logistic-regression model fitted on the prepared training data.
# class_weight='balanced' weights each class inversely to its frequency in the
# training labels; max_iter raises the solver's iteration limit to 1000.
model_logreg_prepro1 = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
)
model_logreg_prepro1.fit(X=X_train_transformed, y=y_train_smote)

In [None]:
# List the feature columns of the training frame the models are fitted on.
X_train_transformed.columns

Index(['Time', 'Log_Amount', 'Hour_sin', 'Hour_cos', 'V1', 'V2', 'V3', 'V4',
       'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15',
       'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
       'V26', 'V27', 'V28'],
      dtype='object')

In [66]:
# Recall and PR AUC for the logistic-regression model.
# NOTE: both metrics are computed on the data the model was fitted on
# (X_train_transformed / y_train_smote), not on the held-out test split.
from sklearn.metrics import recall_score, precision_recall_curve, auc

# Positive-class probabilities -> precision-recall curve -> area under that curve
y_probs = model_logreg_prepro1.predict_proba(X_train_transformed)[:, 1]
precision, recall_curve, _ = precision_recall_curve(y_train_smote, y_probs)
pr_auc = auc(x=recall_curve, y=precision)

# Hard class labels -> recall
y_pred = model_logreg_prepro1.predict(X_train_transformed)
recall_logreg_prepro1 = recall_score(y_true=y_train_smote, y_pred=y_pred)

# Report both metrics
print(f"Recall(LogReg V1.0): {recall_logreg_prepro1:.4f}")
print(f"PR AUC(LogReg V1.0): {pr_auc:.4f}")


Recall(LogReg V1.0): 0.9703
PR AUC(LogReg V1.0): 0.9929


In [67]:
import xgboost as xgb

# XGBoost binary classifier fitted on the prepared training data.
model_xgb_preprov1 = xgb.XGBClassifier(
    objective="binary:logistic",
    # Ratio of negative to positive labels in the training target,
    # passed so that the positive class is weighted up accordingly.
    scale_pos_weight=int((y_train_smote == 0).sum()) / int((y_train_smote == 1).sum()),
    eval_metric="logloss",  # log loss as the evaluation metric (lower is better)
    random_state=42,  # fixed seed for repeatable results
    # Originally set to silence a label-encoder warning.
    # NOTE(review): this flag is deprecated in newer xgboost releases - confirm for the installed version.
    use_label_encoder=False,
)

# Fit the classifier on the training features and labels
model_xgb_preprov1.fit(X=X_train_transformed, y=y_train_smote)

In [74]:
# Training-set metrics for the XGBoost model: the inputs here are the same
# data the model was fitted on (X_train_transformed / y_train_smote).

# Positive-class probabilities -> precision-recall curve -> area under that curve
y_probs_xgb = model_xgb_preprov1.predict_proba(X_train_transformed)[:, 1]
precision, recall_curve, _ = precision_recall_curve(y_train_smote, y_probs_xgb)
pr_auc_xgb = auc(x=recall_curve, y=precision)

# Hard class labels -> recall
y_pred_xgb = model_xgb_preprov1.predict(X_train_transformed)
recall_xgb = recall_score(y_true=y_train_smote, y_pred=y_pred_xgb)

# Report both metrics
print(f"Recall (XGBoost V1.0): {recall_xgb:.4f}")
print(f"PR AUC (XGBoost V1.0): {pr_auc_xgb:.4f}")

Recall (XGBoost V1.0): 1.0000
PR AUC (XGBoost V1.0): 1.0000


In [78]:
# Recall of the XGBoost model on the held-out test split, printed next to the
# training recall so the gap between the two is visible.
y_pred_xgb_test = model_xgb_preprov1.predict(X_test_transformed)
recall_xgb_test = recall_score(y_true=y_test, y_pred=y_pred_xgb_test)

print(
    f"Train Recall xgb: {recall_xgb:.2f}",
    f"Test Recall xgb: {recall_xgb_test:.2f}",
    sep="\n",
)

Train Recall xgb: 1.00
Test Recall xgb: 0.85


In [79]:
# PR AUC of the XGBoost model on the held-out test split, printed next to the
# training PR AUC for comparison.
y_probs_xgb_test = model_xgb_preprov1.predict_proba(X_test_transformed)[:, 1]
precision_test, recall_curve_test, _ = precision_recall_curve(y_test, y_probs_xgb_test)
pr_auc_xgb_test = auc(x=recall_curve_test, y=precision_test)

print(
    f"Train PR-AUC xgb: {pr_auc_xgb:.2f}",
    f"Test PR-AUC xgb: {pr_auc_xgb_test:.2f}",
    sep="\n",
)

Train PR-AUC xgb: 1.00
Test PR-AUC xgb: 0.87


In [75]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier fitted on the prepared training data and scored on
# the held-out test split.
model_rf_prepro1 = RandomForestClassifier(
    n_estimators=70,  # number of trees
    class_weight="balanced",  # weight classes inversely to their frequency
    random_state=42,  # fixed seed for repeatable results
    n_jobs=-1,  # build trees on all CPU cores; the fitted forest is the same for a fixed seed
)

# Train the model
model_rf_prepro1.fit(X_train_transformed, y_train_smote)

# Predict on the test split. The model was fitted on the transformed feature
# set, so the test features must be the transformed ones as well
# (raw X_test has a different set of columns).
y_pred_rf_prepro1 = model_rf_prepro1.predict(X_test_transformed)

# Recall against the test labels, so labels and predictions cover the same rows
recall_rf_prepro1 = recall_score(y_test, y_pred_rf_prepro1)

# PR AUC from the positive-class probabilities on the same test split
y_probs_rf_prepro1 = model_rf_prepro1.predict_proba(X_test_transformed)[:, 1]
precision, recall_curve, _ = precision_recall_curve(y_test, y_probs_rf_prepro1)
pr_auc_rf_prepro1 = auc(recall_curve, precision)

# Print results
print(f"Recall (Random Forest prepro1.0): {recall_rf_prepro1:.4f}")
print(f"PR AUC (Random Forest prepro1.0): {pr_auc_rf_prepro1:.4f}")

KeyboardInterrupt: 

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import recall_score, precision_recall_curve, auc
import numpy as np

# TabNet classifier fitted on the same prepared data as the other models.
model_tabnet_prepro1 = TabNetClassifier(
    optimizer_params=dict(lr=0.02),  # learning rate handed to the optimizer
    seed=42  # fixed seed for repeatable results
)

# TabNet is fed NumPy arrays here rather than pandas objects. Use the same
# training features that are paired with y_train_smote for the other models,
# and the transformed test features so both sets share the same columns.
# np.asarray accepts either pandas objects or arrays.
X_train_np = np.asarray(X_train_transformed)
y_train_np = np.asarray(y_train_smote).ravel()  # flatten labels to 1-D
X_test_np = np.asarray(X_test_transformed)
y_test_np = np.asarray(y_test).ravel()  # flatten labels to 1-D

# Train the model.
# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so test metrics computed afterwards are not fully independent of training -
# consider a separate validation split.
model_tabnet_prepro1.fit(
    X_train_np, y_train_np,
    eval_set=[(X_test_np, y_test_np)],  # data monitored during training
    eval_metric=['logloss'],  # monitor log loss on the evaluation set
    max_epochs=15,  # train for at most 15 epochs
    patience=5,  # stop early after 5 epochs without improvement
    batch_size=1024,  # number of examples per training step
    virtual_batch_size=128,  # size of the sub-batches used for ghost batch normalization
    num_workers=0  # DataLoader worker processes; 0 loads data in the main process
)

In [None]:
# Recall and PR AUC for the TabNet model on the arrays X_test_np / y_test_np.

# predict_proba returns one column per class; column 1 is the positive class.
y_probs_tabnet_prepro1 = model_tabnet_prepro1.predict_proba(X_test_np)[:, 1]

# Precision-recall curve and the area underneath it
precision, recall_curve, _ = precision_recall_curve(y_test_np, y_probs_tabnet_prepro1)
pr_auc_tabnet_prepro1 = auc(x=recall_curve, y=precision)

# Recall: the share of actual positive cases that the predicted labels recover
y_pred_tabnet_prepro1 = model_tabnet_prepro1.predict(X_test_np)
recall_tabnet_prepro1 = recall_score(y_true=y_test_np, y_pred=y_pred_tabnet_prepro1)

# Report both metrics
print(f"Recall (TabNet prepro1.0): {recall_tabnet_prepro1:.4f}")
print(f"PR AUC (TabNet prepro1.0): {pr_auc_tabnet_prepro1:.4f}")


In [1]:
# Recap of the metrics computed in the model cells.
# Every name printed here must already exist in the kernel, so the model cells
# have to be run first; on a fresh kernel this cell raises NameError.
print(f"Recall(LogReg V1.0): {recall_logreg_prepro1:.4f}")  # Fixed
print(f"PR AUC(LogReg V1.0): {pr_auc:.4f}")
print(f"Recall (XGBoost V1.0): {recall_xgb:.4f}")
print(f"PR AUC (XGBoost V1.0): {pr_auc_xgb:.4f}")
print(f"Recall (Random Forest prepro15): {recall_rf_prepro1:.4f}")
print(f"PR AUC (Random Forest prepro15): {pr_auc_rf_prepro1:.4f}")

print(f"Recall (TabNet prepro15): {recall_tabnet_prepro1:.4f}")
print(f"PR AUC (TabNet prepro15): {pr_auc_tabnet_prepro1:.4f}")


NameError: name 'recall_logreg_prepro1' is not defined