In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [7]:
df = pd.read_csv('raw_data/train.csv')
print(f'Shape: {df.shape[0]} rows, {df.shape[1]} columns')

Shape: 215258 rows, 122 columns


## **Random Forest**

In [55]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from imblearn.over_sampling import SMOTENC

df = df.copy()
df = df.dropna()
print(f'shape: {df.shape[0]}')
# Features and target variable
X = df.drop('TARGET', axis=1)  # Features
y = df['TARGET']  # Target variable

for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].astype('category')

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Apply SMOTE to balance the training data
cat_features = [i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == 'category']
smote = SMOTENC(categorical_features=cat_features, random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(y_train_smote.value_counts())

model = xgb.XGBClassifier(
    enable_categorical=True,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    random_state=42
)

# Train
model.fit(X_train_smote, y_train_smote)

# ---------------------------
# 3. Predictions
# ---------------------------
# Train predictions
y_train_pred = model.predict(X_train)
y_train_proba = model.predict_proba(X_train)[:, 1]

# Test predictions
y_test_pred = model.predict(X_test)
# Convert to pandas Series to use value_counts()
y_test_pred_series = pd.Series(y_test_pred)

print("\nðŸ”¢ Prediction Counts:")
print(y_test_pred_series.value_counts())
y_test_proba = model.predict_proba(X_test)[:, 1]

# ---------------------------
# 4. Evaluation
# ---------------------------
train_acc = accuracy_score(y_train, y_train_pred)
test_acc  = accuracy_score(y_test, y_test_pred)

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc  = roc_auc_score(y_test, y_test_proba)

print("ðŸ“Œ XGBoost Results")
print("----------------------------")
print(f"Train Accuracy : {train_acc:.4f}")
print(f"Test Accuracy  : {test_acc:.4f}")
print(f"Train ROC-AUC  : {train_auc:.4f}")
print(f"Test ROC-AUC   : {test_auc:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

shape: 5961
Class distribution after SMOTE:
TARGET
0    4481
1    4481
Name: count, dtype: int64

ðŸ”¢ Prediction Counts:
0    1181
1      12
Name: count, dtype: int64
ðŸ“Œ XGBoost Results
----------------------------
Train Accuracy : 0.9830
Test Accuracy  : 0.9413
Train ROC-AUC  : 0.9989
Test ROC-AUC   : 0.6519

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1131
           1       0.17      0.03      0.05        62

    accuracy                           0.94      1193
   macro avg       0.56      0.51      0.51      1193
weighted avg       0.91      0.94      0.92      1193



## **XGBoost**

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import pandas as pd

# ---------------------------
# 1. Prepare data
# ---------------------------
df= pd.read_csv('train.csv')
df = df.copy()

X = df.drop(columns=['TARGET'])
y = df['TARGET']
# Convert all object columns to category
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].astype('category')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# 2. XGBoost Model
# ---------------------------
model = xgb.XGBClassifier(
    enable_categorical=True,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    random_state=42
)

# Train
model.fit(X_train, y_train)

# ---------------------------
# 3. Predictions
# ---------------------------
# Train predictions
y_train_pred = model.predict(X_train)
y_train_proba = model.predict_proba(X_train)[:, 1]

# Test predictions
y_test_pred = model.predict(X_test)
# Convert to pandas Series to use value_counts()
y_test_pred_series = pd.Series(y_test_pred)

print("\nðŸ”¢ Prediction Counts:")
print(y_test_pred_series.value_counts())
y_test_proba = model.predict_proba(X_test)[:, 1]

# ---------------------------
# 4. Evaluation
# ---------------------------
train_acc = accuracy_score(y_train, y_train_pred)
test_acc  = accuracy_score(y_test, y_test_pred)

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc  = roc_auc_score(y_test, y_test_proba)

print("ðŸ“Œ XGBoost Results")
print("----------------------------")
print(f"Train Accuracy : {train_acc:.4f}")
print(f"Test Accuracy  : {test_acc:.4f}")
print(f"Train ROC-AUC  : {train_auc:.4f}")
print(f"Test ROC-AUC   : {test_auc:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))



ðŸ”¢ Prediction Counts:
0    42930
1      122
Name: count, dtype: int64
ðŸ“Œ XGBoost Results
----------------------------
Train Accuracy : 0.9213
Test Accuracy  : 0.9194
Train ROC-AUC  : 0.8228
Test ROC-AUC   : 0.7552

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     39576
           1       0.52      0.02      0.04      3476

    accuracy                           0.92     43052
   macro avg       0.72      0.51      0.50     43052
weighted avg       0.89      0.92      0.88     43052



## **Logistic Regression**

In [57]:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# ---------------------------
# 1. Prepare data
# ---------------------------
df = pd.read_csv('train.csv')
df = df.copy()
df = df.dropna()

X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Ensure categorical dtype for object columns (optional)
for col in categorical_cols:
    X[col] = X[col].astype('category')
X = X[numeric_cols]
# Train-test split (stratify to preserve target ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
smote = SMOTE( random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# ---------------------------
# 2. Preprocessing + Logistic Regression Pipeline
# ---------------------------

clf = Pipeline(steps=[
    ('clf', LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1, class_weight='balanced'))
])

# Train the model
clf.fit(X_train_smote, y_train_smote)

# ---------------------------
# 3. Predictions
# ---------------------------
y_train_pred = clf.predict(X_train)
y_train_proba = clf.predict_proba(X_train)[:, 1]

y_test_pred = clf.predict(X_test)
y_test_proba = clf.predict_proba(X_test)[:, 1]

# ---------------------------
# 4. Evaluation
# ---------------------------
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)


print("\n==============================")
print("ðŸ“Œ MODEL EVALUATION RESULTS")
print("==============================")

# ----- Train Metrics -----
print("\nðŸ”¹ TRAIN PERFORMANCE")
print(f"Accuracy      : {train_acc:.4f}")
print(f"ROC-AUC       : {train_auc:.4f}")

print("\n  â€¢ Class 0 Metrics")
print(f"Precision (0) : {precision_score(y_train, y_train_pred, pos_label=0):.4f}")
print(f"Recall (0)    : {recall_score(y_train, y_train_pred, pos_label=0):.4f}")
print(f"F1 Score (0)  : {f1_score(y_train, y_train_pred, pos_label=0):.4f}")

print("\n  â€¢ Class 1 Metrics")
print(f"Precision (1) : {precision_score(y_train, y_train_pred, pos_label=1):.4f}")
print(f"Recall (1)    : {recall_score(y_train, y_train_pred, pos_label=1):.4f}")
print(f"F1 Score (1)  : {f1_score(y_train, y_train_pred, pos_label=1):.4f}")

# ----- Test Metrics -----
print("\nðŸ”¹ TEST PERFORMANCE")
print(f"Accuracy      : {test_acc:.4f}")
print(f"ROC-AUC       : {test_auc:.4f}")

print("\n  â€¢ Class 0 Metrics")
print(f"Precision (0) : {precision_score(y_test, y_test_pred, pos_label=0):.4f}")
print(f"Recall (0)    : {recall_score(y_test, y_test_pred, pos_label=0):.4f}")
print(f"F1 Score (0)  : {f1_score(y_test, y_test_pred, pos_label=0):.4f}")

print("\n  â€¢ Class 1 Metrics")
print(f"Precision (1) : {precision_score(y_test, y_test_pred, pos_label=1):.4f}")
print(f"Recall (1)    : {recall_score(y_test, y_test_pred, pos_label=1):.4f}")
print(f"F1 Score (1)  : {f1_score(y_test, y_test_pred, pos_label=1):.4f}")

# ----- Confusion Matrix -----
print("\nðŸ”¹ CONFUSION MATRIX (Test Set)")
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

# ----- Classification Report -----
print("\nðŸ”¹ CLASSIFICATION REPORT (Test Set)")
print(classification_report(y_test, y_test_pred))


ðŸ“Œ MODEL EVALUATION RESULTS

ðŸ”¹ TRAIN PERFORMANCE
Accuracy      : 0.6013
ROC-AUC       : 0.6110

  â€¢ Class 0 Metrics
Precision (0) : 0.9556
Recall (0)    : 0.6046
F1 Score (0)  : 0.7406

  â€¢ Class 1 Metrics
Precision (1) : 0.0794
Recall (1)    : 0.5484
F1 Score (1)  : 0.1386

ðŸ”¹ TEST PERFORMANCE
Accuracy      : 0.5767
ROC-AUC       : 0.6115

  â€¢ Class 0 Metrics
Precision (0) : 0.9478
Recall (0)    : 0.5824
F1 Score (0)  : 0.7215

  â€¢ Class 1 Metrics
Precision (1) : 0.0676
Recall (1)    : 0.4857
F1 Score (1)  : 0.1187

ðŸ”¹ CONFUSION MATRIX (Test Set)
[[654 469]
 [ 36  34]]

ðŸ”¹ CLASSIFICATION REPORT (Test Set)
              precision    recall  f1-score   support

           0       0.95      0.58      0.72      1123
           1       0.07      0.49      0.12        70

    accuracy                           0.58      1193
   macro avg       0.51      0.53      0.42      1193
weighted avg       0.90      0.58      0.69      1193



In [58]:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, precision_score, recall_score
import pandas as pd
import numpy as np

# ---------------------------
# 1. Prepare data
# ---------------------------
df = pd.read_csv('train.csv')
df = df.copy()
df = df.dropna()

X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Ensure categorical dtype for object columns (optional)
for col in categorical_cols:
    X[col] = X[col].astype('category')

# Train-test split (stratify to preserve target ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
smote = SMOTENC(categorical_features=categorical_cols, random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# ---------------------------
# 2. Preprocessing + Logistic Regression Pipeline
# ---------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
)

clf = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1, class_weight='balanced'))
])

# Train the model
clf.fit(X_train_smote, y_train_smote)

# ---------------------------
# 3. Predictions
# ---------------------------
y_train_pred = clf.predict(X_train)
y_train_proba = clf.predict_proba(X_train)[:, 1]

y_test_pred = clf.predict(X_test)
y_test_proba = clf.predict_proba(X_test)[:, 1]

# ---------------------------
# 4. Evaluation
# ---------------------------
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)


print("\n==============================")
print("ðŸ“Œ MODEL EVALUATION RESULTS")
print("==============================")

# ----- Train Metrics -----
print("\nðŸ”¹ TRAIN PERFORMANCE")
print(f"Accuracy      : {train_acc:.4f}")
print(f"ROC-AUC       : {train_auc:.4f}")

print("\n  â€¢ Class 0 Metrics")
print(f"Precision (0) : {precision_score(y_train, y_train_pred, pos_label=0):.4f}")
print(f"Recall (0)    : {recall_score(y_train, y_train_pred, pos_label=0):.4f}")
print(f"F1 Score (0)  : {f1_score(y_train, y_train_pred, pos_label=0):.4f}")

print("\n  â€¢ Class 1 Metrics")
print(f"Precision (1) : {precision_score(y_train, y_train_pred, pos_label=1):.4f}")
print(f"Recall (1)    : {recall_score(y_train, y_train_pred, pos_label=1):.4f}")
print(f"F1 Score (1)  : {f1_score(y_train, y_train_pred, pos_label=1):.4f}")

# ----- Test Metrics -----
print("\nðŸ”¹ TEST PERFORMANCE")
print(f"Accuracy      : {test_acc:.4f}")
print(f"ROC-AUC       : {test_auc:.4f}")

print("\n  â€¢ Class 0 Metrics")
print(f"Precision (0) : {precision_score(y_test, y_test_pred, pos_label=0):.4f}")
print(f"Recall (0)    : {recall_score(y_test, y_test_pred, pos_label=0):.4f}")
print(f"F1 Score (0)  : {f1_score(y_test, y_test_pred, pos_label=0):.4f}")

print("\n  â€¢ Class 1 Metrics")
print(f"Precision (1) : {precision_score(y_test, y_test_pred, pos_label=1):.4f}")
print(f"Recall (1)    : {recall_score(y_test, y_test_pred, pos_label=1):.4f}")
print(f"F1 Score (1)  : {f1_score(y_test, y_test_pred, pos_label=1):.4f}")

# ----- Confusion Matrix -----
print("\nðŸ”¹ CONFUSION MATRIX (Test Set)")
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

# ----- Classification Report -----
print("\nðŸ”¹ CLASSIFICATION REPORT (Test Set)")
print(classification_report(y_test, y_test_pred))


ðŸ“Œ MODEL EVALUATION RESULTS

ðŸ”¹ TRAIN PERFORMANCE
Accuracy      : 0.8612
ROC-AUC       : 0.6902

  â€¢ Class 0 Metrics
Precision (0) : 0.9516
Recall (0)    : 0.8982
F1 Score (0)  : 0.9241

  â€¢ Class 1 Metrics
Precision (1) : 0.1394
Recall (1)    : 0.2652
F1 Score (1)  : 0.1827

ðŸ”¹ TEST PERFORMANCE
Accuracy      : 0.8466
ROC-AUC       : 0.6273

  â€¢ Class 0 Metrics
Precision (0) : 0.9493
Recall (0)    : 0.8842
F1 Score (0)  : 0.9156

  â€¢ Class 1 Metrics
Precision (1) : 0.1156
Recall (1)    : 0.2429
F1 Score (1)  : 0.1567

ðŸ”¹ CONFUSION MATRIX (Test Set)
[[993 130]
 [ 53  17]]

ðŸ”¹ CLASSIFICATION REPORT (Test Set)
              precision    recall  f1-score   support

           0       0.95      0.88      0.92      1123
           1       0.12      0.24      0.16        70

    accuracy                           0.85      1193
   macro avg       0.53      0.56      0.54      1193
weighted avg       0.90      0.85      0.87      1193

