In [1]:
import pandas as pd
import numpy as np
# For visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.display.max_rows = None
pd.options.display.max_columns = None
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import recall_score

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/HumayDS/Big-data-analysis/main/Churn_Modelling.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,2,3,15619304,Onio,502,,Female,42,8,159660.8,3,1,0,113931.57,1
3,3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
Customer_id = df['CustomerId']

In [4]:
#Drop redundant columns
df = df.drop(['Unnamed: 0' , 'RowNumber' , 'CustomerId','Surname'] , axis = 1)

In [5]:
##Fill categoric column with mode
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Geography'] = df['Geography'].fillna(df['Geography'].mode()[0])

In [6]:
#Creating dummy variables(One hot encoding)
#Make sure you run it once
categorical_cols = df.select_dtypes(include='object').columns
df_dummies = pd.get_dummies(df[categorical_cols], drop_first=True, dtype=int)
df = df.drop(columns=categorical_cols)
df = pd.concat([df, df_dummies], axis=1)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [7]:
## FOR OVERFITTING CHECK WITH LOGISTIC REGRESSION
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# üéØ Define features (X) and target (y)
X = df.drop(columns=['Exited'])
y = df['Exited']

# 1Ô∏è‚É£ Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2Ô∏è‚É£ Logistic Regression
log_reg = LogisticRegression(
    max_iter=200,
    penalty='l2',
    solver='lbfgs'
)
log_model = log_reg.fit(X_train, y_train)

# 3Ô∏è‚É£ Predictions
pred_train = log_model.predict(X_train)
pred_test = log_model.predict(X_test)

# 4Ô∏è‚É£ Evaluate
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

# ---- CUSTOM CONFUSION MATRIX FORMAT ----
def custom_cm(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    cm_custom = pd.DataFrame(
        [[tp, fp],
         [fn, tn]],
        index=['Predicted 1', 'Predicted 0'],
        columns=['Actual 1', 'Actual 0']
    )
    return cm_custom

# 5Ô∏è‚É£ Print results
print(f"üîπ Train Accuracy: {acc_train:.4f}")
print(f"üîπ Test Accuracy:  {acc_test:.4f}")

print("\n=== Train Classification Report ===")
print(classification_report(y_train, pred_train))

print("\nTrain Confusion Matrix:")
print(custom_cm(y_train, pred_train))

print("\n=== Test Classification Report ===")
print(classification_report(y_test, pred_test))

print("\nTest Confusion Matrix:")
print(custom_cm(y_test, pred_test))

üîπ Train Accuracy: 0.8077
üîπ Test Accuracy:  0.8193

=== Train Classification Report ===
              precision    recall  f1-score   support

           0       0.82      0.96      0.89      5547
           1       0.61      0.21      0.31      1453

    accuracy                           0.81      7000
   macro avg       0.72      0.59      0.60      7000
weighted avg       0.78      0.81      0.77      7000


Train Confusion Matrix:
             Actual 1  Actual 0
Predicted 1       302       195
Predicted 0      1151      5352

=== Test Classification Report ===
              precision    recall  f1-score   support

           0       0.83      0.97      0.90      2416
           1       0.61      0.20      0.30       584

    accuracy                           0.82      3000
   macro avg       0.72      0.58      0.60      3000
weighted avg       0.79      0.82      0.78      3000


Test Confusion Matrix:
             Actual 1  Actual 0
Predicted 1       116        74
Predicte

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Random Forest

In [8]:
# FOR OVERFITTING
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# üéØ Define features (X) and target (y)
X = df.drop(columns=['Exited'])   # all other columns are features
y = df['Exited']                  # target variable

# 1Ô∏è‚É£ Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2Ô∏è‚É£ Initialize and train Random Forest model
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)
rf_model = rf.fit(X_train, y_train)

# 3Ô∏è‚É£ Make predictions for both train and test
predictions_train = rf_model.predict(X_train)
predictions_test = rf_model.predict(X_test)

# ---- CUSTOM CONFUSION MATRIX FORMAT ----
def custom_cm(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    cm_custom = pd.DataFrame(
        [[tp, fp],
         [fn, tn]],
        index=['Predicted 1', 'Predicted 0'],
        columns=['Actual 1', 'Actual 0']
    )
    return cm_custom

# 4Ô∏è‚É£ Evaluate model performance
acc_train = accuracy_score(y_train, predictions_train)
acc_test = accuracy_score(y_test, predictions_test)

print(f"üîπ Train Accuracy: {acc_train:.4f}")
print(f"üîπ Test Accuracy:  {acc_test:.4f}")

print("\n=== Train Classification Report ===")
print(classification_report(y_train, predictions_train))

print("\nTrain Confusion Matrix:")
print(custom_cm(y_train, predictions_train))

print("\n=== Test Classification Report ===")
print(classification_report(y_test, predictions_test))

print("\nTest Confusion Matrix:")
print(custom_cm(y_test, predictions_test))

üîπ Train Accuracy: 0.9053
üîπ Test Accuracy:  0.8673

=== Train Classification Report ===
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      5547
           1       0.92      0.60      0.72      1453

    accuracy                           0.91      7000
   macro avg       0.91      0.79      0.83      7000
weighted avg       0.91      0.91      0.90      7000


Train Confusion Matrix:
             Actual 1  Actual 0
Predicted 1       869        79
Predicted 0       584      5468

=== Test Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      2416
           1       0.77      0.46      0.57       584

    accuracy                           0.87      3000
   macro avg       0.82      0.71      0.75      3000
weighted avg       0.86      0.87      0.85      3000


Test Confusion Matrix:
             Actual 1  Actual 0
Predicted 1       266        80
Predicte

#Imbalanced dataset

mbalanced data means that in a dataset, one class is much more or much less frequent compared to the other. In other words, the classes in the target variable are not evenly distributed.

This can lead to a model that fails to correctly predict the minority class.

Result: high accuracy, but a useless model.

üí° Therefore, in imbalanced datasets, accuracy alone is not enough.

Problems caused by imbalanced data:

Biased model: The model learns the dominant class and fails to recognize the minority class correctly.

Poor recall / F1-score: It becomes difficult to correctly predict the minority class.

Critical class may be missed: In important fields like healthcare or fraud detection, failing to predict the minority class can cause serious problems.

Rule of thumb: If one class has >70‚Äì80% of the observations, the dataset may be imbalanced. The occurrence of class 1 (or minority class) is considered a rare event.


In [9]:
#Check
y.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Exited,Unnamed: 1_level_1
0,0.7963
1,0.2037


#Solutions ‚Äî A) Data-Level B) Algorithm-Level

A) Resampling (Data-Level Solutions)

Oversampling ‚Üí Increase the minority class (e.g., SMOTE, ADASYN)

Undersampling ‚Üí Reduce the dominant class

Synthetic data generation

SMOTE: Creates synthetic examples for the minority class

B) Algorithm-Level Solutions

If the imbalance is very severe, recall may still remain low (e.g., scale_pos_weight=4).

Set class weights

In sklearn models: class_weight='balanced'

Gives more importance to the minority class ‚Üí increases recall, but may decrease accuracy.

Problem: The model learns the dominant class and fails to predict the minority class correctly.
Solution: Use resampling, class weights, or specialized evaluation metrics.

#Xgboost
##It‚Äôs an optimized implementation of Gradient Boosting, designed for speed and performance.
###It‚Äôs widely used in Kaggle competitions, industry projects, and research, because it delivers:
###High accuracy
###Fast training speed
###Good handling of missing data and outliers
###Built-in regularization to prevent overfitting

###XGBoost stands for Extreme Gradient Boosting.
###Boosting = building a strong model by combining many weak models (usually decision trees).
###Each new tree is trained to fix the errors made by the previous ones.
###Gradient Boosting specifically uses gradient descent to minimize errors.
#XGBoost builds trees step-by-step, and at each step:
###It looks at where the model makes mistakes.
###It builds a new tree to correct those mistakes.
###It combines all trees for the final prediction.

Let‚Äôs say you‚Äôre predicting house prices.

Start with a simple model (like predicting the average price).

Calculate errors (difference between predicted and actual prices).

Build a small decision tree that predicts those errors.

Add the new tree‚Äôs predictions to improve the old model.

Repeat steps 2‚Äì4 many times (each tree fixes the previous model‚Äôs mistakes).

Combine all trees for the final result.

This is ‚Äúboosting.‚Äù

ü™µ Random Forest = Many independent trees that vote together ‚Üí stable, simple, and fast.

‚ö° XGBoost = Many dependent trees that learn from each other‚Äôs mistakes ‚Üí more accurate, but more complex.

Random Forest = ‚ÄúA classroom of students answering the same question independently, then taking a majority vote.‚Äù

XGBoost = ‚ÄúA classroom where each student learns from the previous one‚Äôs mistakes to improve the final answer.‚Äù

In Random Forest, trees are independent
IN XGboost, trees are dependent

XGBoost adds learning_rate, regularization (L1/L2), and gamma, which don‚Äôt exist in Random Forest.

XGBOOST ‚úÖ Handles missing values automatically. During training, it learns the best direction (left or right) to take when a feature is missing.
Less robust to outliers, because boosting focuses on correcting errors, and outliers create large errors that can distort learning
RANDOM FOREST ‚ùå Does not handle missing values automatically. You must fill or impute them before training (e.g., with mean, median, or mode).
Fairly robust to outliers, because Random Forest averages many trees ‚Üí single outlier doesn‚Äôt influence much.


###üîπ Random Forest avoids overfitting through randomness and averaging.
###üîπ XGBoost can overfit because it learns sequentially, but offers strong regularization tools to control it.

# Hyperparameters of Xgboost

max_depth ‚Üí Maximum depth of each decision tree (how many splits a tree can make).
üîπ Higher = more complex model ‚Üí risk of overfitting.
üîπ Lower = simpler model ‚Üí might underfit.
üîπ Usually between 3‚Äì10.
üîπ Start around 5‚Äì6.

learning_rate ‚Üí Controls how much each new tree contributes to the model. This is one of the most important XGBoost parameters ‚Äî sometimes it determines 70% of model performance.

Small ‚Üí slower learning but more accurate.

Large ‚Üí faster learning but riskier.
üîπ Usually between 0.01‚Äì0.3.
üîπ Start with 0.1; reduce if overfitting occurs.

0.05‚Äì0.1 ‚Üí most stable and accurate models

0.01 ‚Üí for very large datasets

0.2‚Äì0.3 ‚Üí faster results, medium accuracy

0.3 ‚Üí generally not recommended

n_estimators ‚Üí Number of boosting rounds (trees).
üîπ More trees = better performance (up to a limit).
üîπ Too many trees ‚Üí longer training, overfitting risk (if learning_rate is high).
üîπ Usually 100‚Äì1000.
üîπ Use early stopping to find the optimal number.

subsample ‚Üí Percentage of observations (rows) used for each tree.

Each tree sees a different subset ‚Üí more robust model.

1.0 ‚Üí 100% of data

0.8 ‚Üí 80% of data

0.5 ‚Üí 50% of data
üîπ Adds randomness.
üîπ Lower = reduces overfitting.
üîπ Too low = underfitting.
üîπ Usually 0.5‚Äì1.0, 0.8 is a good start.

Example: subsample = 0.5, n_estimators = 1000:

Tree 1 ‚Üí randomly selects 500 rows

Tree 2 ‚Üí another random 500 rows

Tree 3 ‚Üí another random 500 rows
‚û° Each tree sees different points ‚Üí model doesn‚Äôt memorize points ‚Üí overall performance improves

colsample_bytree ‚Üí Percentage of features (columns) used for each tree.
üîπ Reduces correlation between trees.
üîπ Lower = reduces overfitting.
üîπ Too low = underfitting.
üîπ Usually 0.5‚Äì1.0, often 0.8.

reg_alpha (L1 penalty) ‚Üí Higher = stronger regularization ‚Üí reduces overfitting.

Encourages sparsity (some features ignored).

Range: 0‚Äì5

reg_lambda (L2 penalty) ‚Üí Penalizes large weights but does not make them zero.

Range: 1‚Äì10

Regularization = mechanism to prevent overfitting.

If model is too complex ‚Üí overfitting occurs

Regularization ‚Üí adds a ‚Äúpenalty‚Äù ‚Üí simplifies model

In XGBoost, two ways to regularize:

L2 penalty ‚Üí reg_lambda ‚Üí makes model more ‚Äúcautious‚Äù, reduces overfitting

1 ‚Üí normal regularization

5 ‚Üí stronger penalty ‚Üí simpler model

0 ‚Üí no penalty ‚Üí risky, overfitting possible

L1 penalty ‚Üí reg_alpha ‚Üí can zero out unnecessary leaves, simplifies model

Useful for datasets with many features

Analogy:

L1 (alpha) ‚Üí ‚Äúcut some‚Äù

L2 (lambda) ‚Üí ‚Äúsoften all a bit‚Äù

Defaults: reg_alpha = 0, reg_lambda = 1 (sufficient for small datasets)
For large datasets with many features ‚Üí try reg_alpha = 0.1‚Äì1, reg_lambda = 1‚Äì5

SMOTE / Oversampling:

If recall is still critical, it is possible to artificially increase the minority class.

Combining scale_pos_weight with SMOTE is often very effective.

###SMOTE - Synthetic Minority Oversampling Technique

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import pandas as pd
from collections import Counter

# üéØ Define features (X) and target (y)
X = df.drop(columns=['Exited'])
y = df['Exited']

# 1Ô∏è‚É£ Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y  # stratify=y ensures class distribution is preserved
)

# 2Ô∏è‚É£ Increase minority class using SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

print("Original class distribution:", Counter(y_train))
print("Class distribution after SMOTE:", Counter(y_res))

# 3Ô∏è‚É£ Calculate scale_pos_weight
neg, pos = Counter(y_res)[0], Counter(y_res)[1]
scale_pos_weight = neg / pos
print(f"\nscale_pos_weight = {scale_pos_weight:.2f}\n")

# 4Ô∏è‚É£ Create and fit XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=2,
    scale_pos_weight=4,   # weight for imbalanced classes
    random_state=42,
)

xgb_model.fit(X_res, y_res)

# 5Ô∏è‚É£ Predictions
pred_train = xgb_model.predict(X_res)
pred_test = xgb_model.predict(X_test)

# Accuracy
acc_train = accuracy_score(y_res, pred_train)
acc_test = accuracy_score(y_test, pred_test)

print(f"üîπ Train accuracy: {acc_train:.4f}")
print(f"üîπ Test accuracy:  {acc_test:.4f}\n")

# 6Ô∏è‚É£ Confusion matrices: Predicted on top, Actual on left
def confusion_matrix_custom(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    cm_df = pd.DataFrame([[tp, fp],
                          [fn, tn]],
                         index=["Predicted 1", "Predicted 0"],
                         columns=["Actual 1", "Actual 0"])

    print("Confusion Matrix (Predicted on top, Actual on left):\n", cm_df)
    print(f"\nTP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}\n")

    return cm_df

# For Train set
print("=== For Train set ===")
print(classification_report(y_res, pred_train))
confusion_matrix_custom(y_res, pred_train)

# For Test set
print("=== For Test set ===")
print(classification_report(y_test, pred_test))
confusion_matrix_custom(y_test, pred_test)

Original class distribution: Counter({0: 5574, 1: 1426})
Class distribution after SMOTE: Counter({0: 5574, 1: 5574})

scale_pos_weight = 1.00

üîπ Train accuracy: 0.8175
üîπ Test accuracy:  0.6790

=== For Train set ===
              precision    recall  f1-score   support

           0       0.96      0.66      0.78      5574
           1       0.74      0.97      0.84      5574

    accuracy                           0.82     11148
   macro avg       0.85      0.82      0.81     11148
weighted avg       0.85      0.82      0.81     11148

Confusion Matrix (Predicted on top, Actual on left):
              Actual 1  Actual 0
Predicted 1      5422      1882
Predicted 0       152      3692

TP: 5422, TN: 3692, FP: 1882, FN: 152

=== For Test set ===
              precision    recall  f1-score   support

           0       0.95      0.63      0.76      2389
           1       0.38      0.87      0.53       611

    accuracy                           0.68      3000
   macro avg       0.6

Unnamed: 0,Actual 1,Actual 0
Predicted 1,533,885
Predicted 0,78,1504


# Cross Validation

Cross-Validation is a technique where the dataset is split into multiple parts and the model is tested several times on different combinations instead of just one train/test split.
It helps evaluate the model more objectively and accurately.

A single 70/30 or 80/20 split may be random and not reliable.
CV tests the model across multiple splits and reduces this randomness.

The data is divided into K parts (e.g., K=5).
Each time, 1 fold is used for testing and the remaining folds for training.
The model is trained K times and the results are averaged.

This reduces overfitting, evaluates the model more accurately, and helps select the best model.

##How does Cross-Validation reduce overfitting?

###It does not rely on a single train/test split
With a single 70/30 split, the model might accidentally perform well on an easy test subset.

Cross-validation:

splits the data into multiple parts,

evaluates the model on different test folds,

averages the results.

‚û°Ô∏è The model cannot ‚Äúmemorize‚Äù one specific split.
‚û°Ô∏è This reduces overfitting.

###If a model truly generalizes well ‚Üí it performs similarly across folds.
###If it's overfitting ‚Üí validation scores swing up and down.

In [12]:
from IPython.display import Image

# Display an image from the web
url = "https://zitaoshen.rbind.io/project/machine_learning/machine-learning-101-cross-vaildation/featured.png"
display(Image(url=url))


In [15]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import pandas as pd
from collections import Counter

# üéØ Define features (X) and target (y)
X = df.drop(columns=['Exited'])
y = df['Exited']

# 1Ô∏è‚É£ Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 2Ô∏è‚É£ Increase minority class using SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

print("Original class distribution:", Counter(y_train))
print("Class distribution after SMOTE:", Counter(y_res))

# 3Ô∏è‚É£ Calculate scale_pos_weight
neg, pos = Counter(y_res)[0], Counter(y_res)[1]
scale_pos_weight = neg / pos
print(f"\nscale_pos_weight = {scale_pos_weight:.2f}\n")

# 4Ô∏è‚É£ Create and fit XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=2,
    scale_pos_weight=4,
    random_state=42,
)

xgb_model.fit(X_res, y_res)

# 5Ô∏è‚É£ Predictions
pred_train = xgb_model.predict(X_res)
pred_test = xgb_model.predict(X_test)

# Accuracy
acc_train = accuracy_score(y_res, pred_train)
acc_test = accuracy_score(y_test, pred_test)

print(f"üîπ Train accuracy: {acc_train:.4f}")
print(f"üîπ Test accuracy:  {acc_test:.4f}\n")

# 6Ô∏è‚É£ Confusion matrices: Predicted on top, Actual on left
def confusion_matrix_custom(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    cm_df = pd.DataFrame([[tp, fp],
                          [fn, tn]],
                         index=["Predicted 1", "Predicted 0"],
                         columns=["Actual 1", "Actual 0"])

    print("Confusion Matrix (Predicted on top, Actual on left):\n", cm_df)
    print(f"\nTP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}\n")

    return cm_df

# For Train set
print("=== For Train set ===")
print(classification_report(y_res, pred_train))
confusion_matrix_custom(y_res, pred_train)

# For Test set
print("=== For Test set ===")
print(classification_report(y_test, pred_test))
confusion_matrix_custom(y_test, pred_test)

# 7Ô∏è‚É£ ‚≠ê ADDING 5-FOLD CROSS VALIDATION ‚≠ê
cv = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    xgb_model,
    X_train,      # Use only original training data (imbalanced)
    y_train,
    cv=cv,
    scoring='accuracy'
)

print("\n==============================")
print("‚≠ê 5-FOLD CROSS VALIDATION ‚≠ê")
print("==============================")
print("5-Fold CV scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
print("Std CV accuracy:", cv_scores.std())


Original class distribution: Counter({0: 5574, 1: 1426})
Class distribution after SMOTE: Counter({0: 5574, 1: 5574})

scale_pos_weight = 1.00

üîπ Train accuracy: 0.8175
üîπ Test accuracy:  0.6790

=== For Train set ===
              precision    recall  f1-score   support

           0       0.96      0.66      0.78      5574
           1       0.74      0.97      0.84      5574

    accuracy                           0.82     11148
   macro avg       0.85      0.82      0.81     11148
weighted avg       0.85      0.82      0.81     11148

Confusion Matrix (Predicted on top, Actual on left):
              Actual 1  Actual 0
Predicted 1      5422      1882
Predicted 0       152      3692

TP: 5422, TN: 3692, FP: 1882, FN: 152

=== For Test set ===
              precision    recall  f1-score   support

           0       0.95      0.63      0.76      2389
           1       0.38      0.87      0.53       611

    accuracy                           0.68      3000
   macro avg       0.6