In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [76]:
def _read_data_csv(filename):
    """Read one CSV file from the './Data' directory into a DataFrame."""
    return pd.read_csv(os.path.join('./Data', filename))


# Three extracted-feature training tables (undersampled, oversampled, unresampled),
# followed by the full train and test tables.
extracted_features_undersampled_train_df = _read_data_csv("extracted_features_undersampled_train_data.csv")
extracted_features_oversampled_train_df = _read_data_csv("extracted_features_oversampled_train_data.csv")
extracted_features_train_df = _read_data_csv("extracted_features_train_data.csv")
train_df = _read_data_csv("train_data.csv")
test_df = _read_data_csv("test_data.csv")

In [77]:
# Columns of test_df that are fed to the models trained on extracted features.
target_features = [
    'V3', 'V4', 'V7', 'V10', 'V11',
    'V12', 'V14', 'V16', 'V17', 'V18',
]

# Test inputs restricted to target_features, and the 'Class' column as labels.
X_test = test_df.loc[:, target_features].values
y_test = test_df.loc[:, 'Class'].values

In [78]:
def test_model(name, model, X_test, y_test):
    """Print a classification report and a confusion matrix for a fitted model.

    Args:
        name: Label shown in the ``--- name ---`` header line.
        model: Fitted estimator; only its ``predict`` method is called.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels that the predictions are compared against.

    Returns:
        None. Everything is written to stdout.
    """
    predictions = model.predict(X_test)

    header = f"--- {name} ---"
    print(header)
    report_text = classification_report(y_test, predictions)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

In [None]:
def plot_roc_curves(models_proba, y_test):
    """Draw one ROC curve per model on a single figure and display it.

    Args:
        models_proba: Mapping of display label -> scores that are passed to
            ``roc_curve`` as the second argument.
        y_test: True labels passed to ``roc_curve`` as the first argument.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    for label, scores in models_proba.items():
        false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, scores)
        area = auc(false_pos_rate, true_pos_rate)
        ax.plot(
            false_pos_rate,
            true_pos_rate,
            lw=2,
            label=f'{label} (AUC = {area:.2f})',
        )

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1.0
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Comparison of ROC Curves')
    ax.legend(loc='lower right')
    ax.grid()

    fig.tight_layout()
    plt.show()

# Oversampled Models

In [79]:
# Training inputs: every column of the oversampled table except 'Class'; labels: 'Class'.
# NOTE(review): X_test is built from target_features, so this presumably relies on the
# remaining columns here being those same features in the same order — confirm.
X_train = extracted_features_oversampled_train_df.drop(columns='Class').values
y_train = extracted_features_oversampled_train_df.loc[:, 'Class'].values

In [80]:
# Fit five classifiers on the current X_train / y_train (oversampled data).

# Decision tree: Gini splits, with depth and node-size limits to curb overfitting.
dtree = DecisionTreeClassifier(
    random_state=42,
    criterion='gini',
    max_depth=15,
    min_samples_split=50,
    min_samples_leaf=20,
)
oversampled_dtree_model = dtree.fit(X_train, y_train)

# k-nearest neighbours: 9 neighbours, distance-weighted votes, p=2 (Euclidean).
knn = KNeighborsClassifier(
    n_neighbors=9,
    p=2,
    weights='distance',
    algorithm='auto',
)
oversampled_knn_model = knn.fit(X_train, y_train)

# Gradient-boosted trees: 200 rounds, depth 10, learning rate 0.1, log-loss metric.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    eval_metric='logloss',
)
oversampled_xgb_model = xgb.fit(X_train, y_train)

# Logistic regression: L2 penalty; C is the inverse regularisation strength.
log_reg = LogisticRegression(
    random_state=42,
    solver='liblinear',
    penalty='l2',
    C=0.001,
)
oversampled_log_reg_model = log_reg.fit(X_train, y_train)

# Random forest: 100 trees of depth <= 5 with node-size limits of 50.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
)
oversampled_rf_model = rf.fit(X_train, y_train)

In [81]:
# Print test-set metrics for each oversampled model, one after another.
for label, fitted in (
    ("Decision Tree", oversampled_dtree_model),
    ("KNN", oversampled_knn_model),
    ("XGBoost", oversampled_xgb_model),
    ("Logistic Regression", oversampled_log_reg_model),
    ("Random Forest", oversampled_rf_model),
):
    test_model(label, fitted, X_test, y_test)

--- Decision Tree ---
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.12      0.85      0.21        98

    accuracy                           0.99     56962
   macro avg       0.56      0.92      0.60     56962
weighted avg       1.00      0.99      0.99     56962

Confusion Matrix:
[[56246   618]
 [   15    83]]
--- KNN ---
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56864
           1       0.22      0.89      0.36        98

    accuracy                           0.99     56962
   macro avg       0.61      0.94      0.68     56962
weighted avg       1.00      0.99      1.00     56962

Confusion Matrix:
[[56559   305]
 [   11    87]]
--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.48      0.89      0.62        98

    accuracy                      

In [None]:
# Column 1 of each model's predict_proba output on X_test, keyed by display label.
models_proba = {
    label: fitted.predict_proba(X_test)[:, 1]
    for label, fitted in (
        ("Logistic Regression", oversampled_log_reg_model),
        ("Random Forest", oversampled_rf_model),
        ("Decision Tree", oversampled_dtree_model),
        ("KNN", oversampled_knn_model),
        ("XGBoost", oversampled_xgb_model),
    )
}
plot_roc_curves(models_proba, y_test)

# Undersampled Models

In [82]:
# Training inputs: every column of the undersampled table except 'Class'; labels: 'Class'.
# NOTE(review): X_test is built from target_features, so this presumably relies on the
# remaining columns here being those same features in the same order — confirm.
X_train = extracted_features_undersampled_train_df.drop(columns='Class').values
y_train = extracted_features_undersampled_train_df.loc[:, 'Class'].values

In [83]:
# Fit six classifiers on the current X_train / y_train (undersampled data).

# Decision tree: Gini splits, with depth and node-size limits to curb overfitting.
dtree = DecisionTreeClassifier(
    random_state=42,
    criterion='gini',
    max_depth=15,
    min_samples_split=50,
    min_samples_leaf=10,
)
undersampled_dtree_model = dtree.fit(X_train, y_train)

# k-nearest neighbours: 7 neighbours, distance-weighted votes, p=2 (Euclidean).
knn = KNeighborsClassifier(
    n_neighbors=7,
    p=2,
    weights='distance',
    algorithm='auto',
)
undersampled_knn_model = knn.fit(X_train, y_train)

# Gradient-boosted trees: 200 rounds, depth 10, learning rate 0.1, log-loss metric.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    eval_metric='logloss',
)
undersampled_xgb_model = xgb.fit(X_train, y_train)

# Logistic regression: L2 penalty; C is the inverse regularisation strength.
log_reg = LogisticRegression(
    random_state=42,
    solver='liblinear',
    penalty='l2',
    C=0.01,
)
undersampled_log_reg_model = log_reg.fit(X_train, y_train)

# Random forest: 100 trees of depth <= 5 with node-size limits of 50.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
)
undersampled_rf_model = rf.fit(X_train, y_train)

# Linear-kernel SVM with C=10.
# probability=True turns on probability estimates so predict_proba is available.
# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels per the scikit-learn
# docs, so it has no effect with kernel='linear'.
svm = SVC(
    random_state=42,
    kernel='linear',
    C=10,
    gamma='scale',
    probability=True,
)
undersampled_svm_model = svm.fit(X_train, y_train)

In [84]:
# Print test-set metrics for each undersampled model, one after another.
for label, fitted in (
    ("Decision Tree", undersampled_dtree_model),
    ("KNN", undersampled_knn_model),
    ("XGBoost", undersampled_xgb_model),
    ("Logistic Regression", undersampled_log_reg_model),
    ("Random Forest", undersampled_rf_model),
    ("SVM", undersampled_svm_model),
):
    test_model(label, fitted, X_test, y_test)

--- Decision Tree ---
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     56864
           1       0.03      0.90      0.05        98

    accuracy                           0.95     56962
   macro avg       0.51      0.92      0.51     56962
weighted avg       1.00      0.95      0.97     56962

Confusion Matrix:
[[53754  3110]
 [   10    88]]
--- KNN ---
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.05      0.91      0.10        98

    accuracy                           0.97     56962
   macro avg       0.53      0.94      0.54     56962
weighted avg       1.00      0.97      0.98     56962

Confusion Matrix:
[[55287  1577]
 [    9    89]]
--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      0.95      0.98     56864
           1       0.03      0.93      0.06        98

    accuracy                      

In [None]:
# Column 1 of each model's predict_proba output on X_test, keyed by display label.
models_proba = {
    label: fitted.predict_proba(X_test)[:, 1]
    for label, fitted in (
        ("Logistic Regression", undersampled_log_reg_model),
        ("Random Forest", undersampled_rf_model),
        ("Decision Tree", undersampled_dtree_model),
        ("KNN", undersampled_knn_model),
        ("XGBoost", undersampled_xgb_model),
        ("SVM", undersampled_svm_model),
    )
}
plot_roc_curves(models_proba, y_test)

# Original Models

In [85]:
# Training inputs: every column of the unresampled extracted-feature table except
# 'Class'; labels: 'Class'.
# NOTE(review): X_test is built from target_features, so this presumably relies on the
# remaining columns here being those same features in the same order — confirm.
X_train = extracted_features_train_df.drop(columns='Class').values
y_train = extracted_features_train_df.loc[:, 'Class'].values

In [86]:
# Fit five classifiers on the current X_train / y_train (extracted features, no resampling).

# Decision tree: Gini splits, with depth and node-size limits to curb overfitting.
dtree = DecisionTreeClassifier(
    random_state=42,
    criterion='gini',
    max_depth=15,
    min_samples_split=50,
    min_samples_leaf=20,
)
dtree_model = dtree.fit(X_train, y_train)

# k-nearest neighbours: 9 neighbours, distance-weighted votes, p=2 (Euclidean).
knn = KNeighborsClassifier(
    n_neighbors=9,
    p=2,
    weights='distance',
    algorithm='auto',
)
knn_model = knn.fit(X_train, y_train)

# Gradient-boosted trees: 200 rounds, depth 10, learning rate 0.1, log-loss metric.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    eval_metric='logloss',
)
xgb_model = xgb.fit(X_train, y_train)

# Logistic regression: L2 penalty; C is the inverse regularisation strength.
log_reg = LogisticRegression(
    random_state=42,
    solver='liblinear',
    penalty='l2',
    C=0.001,
)
log_reg_model = log_reg.fit(X_train, y_train)

# Random forest: 100 trees of depth <= 5 with node-size limits of 50.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
)
rf_model = rf.fit(X_train, y_train)

In [87]:
# Print test-set metrics for each model fitted on the unresampled extracted features.
for label, fitted in (
    ("Decision Tree", dtree_model),
    ("KNN", knn_model),
    ("XGBoost", xgb_model),
    ("Logistic Regression", log_reg_model),
    ("Random Forest", rf_model),
):
    test_model(label, fitted, X_test, y_test)

--- Decision Tree ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.71      0.82      0.76        98

    accuracy                           1.00     56962
   macro avg       0.86      0.91      0.88     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56832    32]
 [   18    80]]
--- KNN ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.71      0.88      0.79        98

    accuracy                           1.00     56962
   macro avg       0.86      0.94      0.89     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56829    35]
 [   12    86]]
--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.71      0.83      0.76        98

    accuracy                      

In [None]:
# Column 1 of each model's predict_proba output on X_test, keyed by display label.
models_proba = {
    label: fitted.predict_proba(X_test)[:, 1]
    for label, fitted in (
        ("Logistic Regression", log_reg_model),
        ("Random Forest", rf_model),
        ("Decision Tree", dtree_model),
        ("KNN", knn_model),
        ("XGBoost", xgb_model),
    )
}
plot_roc_curves(models_proba, y_test)

In [88]:
# Switch to all columns of train_df / test_df except 'Class' as model inputs.
# X_test and y_test are rebound here: X_test no longer holds only target_features,
# so re-running any earlier evaluation cell after this one would score those models
# on a differently shaped matrix.
# NOTE(review): presumably train_df and test_df share the same column order — confirm.
X_train = train_df.drop(columns='Class').values
y_train = train_df.loc[:, 'Class'].values
X_test = test_df.drop(columns='Class').values
y_test = test_df.loc[:, 'Class'].values

In [89]:
# Fit five classifiers on the current X_train / y_train (all non-'Class' columns).
# The *_model names below are rebound, replacing any models previously stored in them.

# Decision tree: Gini splits, with depth and node-size limits to curb overfitting.
dtree = DecisionTreeClassifier(
    random_state=42,
    criterion='gini',
    max_depth=15,
    min_samples_split=50,
    min_samples_leaf=20,
)
dtree_model = dtree.fit(X_train, y_train)

# k-nearest neighbours: 9 neighbours, distance-weighted votes, p=2 (Euclidean).
knn = KNeighborsClassifier(
    n_neighbors=9,
    p=2,
    weights='distance',
    algorithm='auto',
)
knn_model = knn.fit(X_train, y_train)

# Gradient-boosted trees: 200 rounds, depth 10, learning rate 0.1, log-loss metric.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    eval_metric='logloss',
)
xgb_model = xgb.fit(X_train, y_train)

# Logistic regression: L2 penalty; C is the inverse regularisation strength.
log_reg = LogisticRegression(
    random_state=42,
    solver='liblinear',
    penalty='l2',
    C=0.001,
)
log_reg_model = log_reg.fit(X_train, y_train)

# Random forest: 100 trees of depth <= 5 with node-size limits of 50.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
)
rf_model = rf.fit(X_train, y_train)

In [90]:
# Print test-set metrics for each model fitted on the full feature set.
for label, fitted in (
    ("Decision Tree", dtree_model),
    ("KNN", knn_model),
    ("XGBoost", xgb_model),
    ("Logistic Regression", log_reg_model),
    ("Random Forest", rf_model),
):
    test_model(label, fitted, X_test, y_test)

--- Decision Tree ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.82      0.72      0.77        98

    accuracy                           1.00     56962
   macro avg       0.91      0.86      0.88     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56848    16]
 [   27    71]]
--- KNN ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.93      0.77      0.84        98

    accuracy                           1.00     56962
   macro avg       0.96      0.88      0.92     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56858     6]
 [   23    75]]
--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.93      0.77      0.84        98

    accuracy                      

In [None]:
# Column 1 of each model's predict_proba output on X_test, keyed by display label.
models_proba = {
    label: fitted.predict_proba(X_test)[:, 1]
    for label, fitted in (
        ("Logistic Regression", log_reg_model),
        ("Random Forest", rf_model),
        ("Decision Tree", dtree_model),
        ("KNN", knn_model),
        ("XGBoost", xgb_model),
    )
}
plot_roc_curves(models_proba, y_test)