In [89]:
import sys, os
# Make the project package importable: go 2 levels up from the current folder
# (data_preprocess → src → project root).
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard the append so re-running this cell does not keep growing sys.path
# with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
from src.data_ingestion.data_loader import load_data

## Data Preprocessing

### Data Cleaning

In [90]:
# Load the raw churn CSV through the project's loader; `data` is kept as the
# untouched original for the rest of the notebook.
data = load_data("Churn_Modelling.csv")

Successfully loaded: c:\Users\krish\Desktop\Churn Analysis\data\raw\Churn_Modelling.csv


In [91]:
# Keep `data` untouched as the backup; all cleaning happens on the copy `df`.
df = data.copy()
for heading, frame in (("Original Dataset size", data), ("Duplicate Dataset size", df)):
    print(heading)
    print("----------------")
    print(frame.shape)

Original Dataset size
----------------
(10002, 14)
Duplicate Dataset size
----------------
(10002, 14)


In [92]:
# Removing useless columns

def drop_useless_cols(df, cols_to_drop=None):
    """Return a new frame without the given non-predictive columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_to_drop : str or iterable of str, optional
        Column name(s) to remove. Defaults to ``['CustomerId', 'Surname']``.
        Names that are not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the matching columns removed.
    """
    # NOTE(review): 'RowNumber' is not in the default list, so it survives and
    # ends up as a model feature; it looks like a plain row index — confirm
    # whether it should be dropped as well.
    if cols_to_drop is None:
        # Build the default per call instead of sharing one mutable list.
        cols_to_drop = ['CustomerId', 'Surname']
    elif isinstance(cols_to_drop, str):
        # A bare string would otherwise be iterated character by character.
        cols_to_drop = [cols_to_drop]
    present = [col for col in cols_to_drop if col in df.columns]
    return df.drop(columns=present)

# Apply the column drop, then show the remaining columns and the new shape.
df = drop_useless_cols(df)
for summary in (df.columns, df.shape):
    print(summary)

Index(['RowNumber', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')
(10002, 12)


In [93]:
# Handling missing values

# Total number of missing cells across the whole frame, before imputation.
missing_before = df.isnull().sum().sum()
print("no.of missing values:", missing_before)
def fill_missing_values(df):
    """Impute missing values in the known churn columns and return a new frame.

    ``Age`` is filled with its mean; ``Geography``, ``HasCrCard`` and
    ``IsActiveMember`` are filled with their most frequent value. Columns that
    are absent from ``df`` are skipped, and a column with no observed values
    is left as-is because it has no mode to fill with.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputations applied.
    """
    # NOTE(review): in this notebook the function runs before the train/test
    # split, so the mean/mode are computed from rows that later land in the
    # test set — confirm this is acceptable.
    df = df.copy()  # never write into the caller's frame

    # Fill Age with mean
    if 'Age' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age'].mean())

    # Columns to fill with mode
    cols_mode = ['Geography', 'HasCrCard', 'IsActiveMember']
    for col in cols_mode:
        if col not in df.columns:
            continue
        modes = df[col].mode()
        if modes.empty:
            # All values missing: indexing the empty mode would raise.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df
# Impute, then re-count missing cells to confirm nothing is left.
df = fill_missing_values(df)
missing_after = df.isnull().sum().sum()
print("after hadling:", missing_after)


no.of missing values: 4
after hadling: 0


In [94]:
# Handling Duplicates

def remove_duplicates(df):
    """Report and drop fully duplicated rows.

    Prints the number of duplicated rows before and after the drop and
    returns the de-duplicated frame (first occurrence of each row is kept,
    original index labels are preserved).
    """
    dup_mask = df.duplicated()
    print("No. of duplicates:", dup_mask.sum())
    deduped = df.drop_duplicates()
    remaining = deduped.duplicated().sum()
    print("After deleting duplicates:", remaining)
    return deduped
# Rebind `df` to the de-duplicated frame; the helper prints the before/after counts.
df = remove_duplicates(df)

No. of duplicates: 2
After deleting duplicates: 0


### Feature Engineering

In [95]:
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(df, columns=None):
    """Integer-encode categorical columns and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str, optional
        Column name(s) to encode. Defaults to ``['Geography', 'Gender']``.
        Names that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each selected column is replaced by the
        integer codes produced by ``LabelEncoder``.
    """
    if columns is None:
        # Build the default per call instead of sharing one mutable list.
        columns = ['Geography', 'Gender']
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    df = df.copy()  # never write into the caller's frame
    for col in columns:
        if col in df.columns:
            # Fresh encoder per column so no fitted state carries over
            # from one column to the next.
            df[col] = LabelEncoder().fit_transform(df[col])
    return df
# Encode the default categorical columns ('Geography', 'Gender') as integer codes.
df = label_encode_columns(df)

In [96]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

def scale_features(df, columns=None, exclude=('Exited',)):
    """Standardise selected columns and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str, optional
        Columns to standardise. When omitted, every ``int64`` column is
        selected except those named in ``exclude``.
    exclude : str or iterable of str, optional
        Columns left out of the automatic selection. Defaults to the target
        ``'Exited'`` so the class label is not turned into z-scores. Ignored
        when ``columns`` is given explicitly.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns replaced by their
        ``StandardScaler`` output. If nothing is selected the copy is
        returned unchanged.
    """
    # NOTE(review): the automatic selection is by dtype only, so float columns
    # are not scaled while integer-coded categoricals and flags are — confirm
    # this is the intended feature set.
    if columns is None:
        if isinstance(exclude, str):
            exclude = (exclude,)
        skipped = set(exclude or ())
        columns = [
            col
            for col in df.select_dtypes(include='int64').columns
            if col not in skipped
        ]
    elif isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    df = df.copy()  # never write into the caller's frame
    if len(columns) == 0:
        # StandardScaler cannot be fitted on zero features.
        return df

    df[columns] = StandardScaler().fit_transform(df[columns])
    return df
# Standardise the columns picked by scale_features' default selection.
df = scale_features(df)

## Model Building

### Data Splitting

In [97]:
from sklearn.model_selection import train_test_split

def split_data(df, target_col='Exited', test_size=0.2, random_state=42):
    """Split ``df`` into stratified train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and ``target_col``.
    target_col : str
        Name of the label column; every other column is treated as a feature.
    test_size : float
        Passed to ``train_test_split`` as the test share.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=[target_col])
    # NOTE(review): astype(int) truncates toward zero, so a non-integer target
    # is silently rounded here — confirm the target is still 0/1 at this point.
    target = df[target_col].astype(int)
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,  # keep the class ratio equal in both splits
    )
    return tuple(parts)
# Stratified split on the churn label; print the four resulting shapes.
splits = split_data(df, target_col='Exited')
X_train, X_test, y_train, y_test = splits
print(*(part.shape for part in splits))


(8000, 11) (2000, 11) (8000,) (2000,)


#### Random Forest

In [98]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# --- Random Forest: hyper-parameter search and hold-out evaluation ----------
# class_weight="balanced" is passed so the forest re-weights the two classes.
rf = RandomForestClassifier(class_weight="balanced", random_state=42)

# NOTE(review): with the 11 feature columns reported by the split cell,
# 'sqrt' and 'log2' both appear to resolve to 3 features per split, so these
# two options likely produce identical fits — confirm and consider trimming.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Shuffled, seeded stratified folds; candidates are ranked by F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the refit best estimator on the held-out test split.
y_pred = best_rf.predict(X_test)
scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
)
metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}
metrics_df = pd.DataFrame([metrics])
print("\nClassification Metrics:\n", metrics_df)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Pred 0', 'Pred 1'],
)
print("\nConfusion Matrix:\n", cm_df)

# Per-candidate CV summary, best mean score first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)
print("\nCross-validation results:\n", cv_results.head())

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

Classification Metrics:
    Accuracy  Precision    Recall  F1 Score
0     0.848   0.639566  0.579853  0.608247

Confusion Matrix:
           Pred 0  Pred 1
Actual 0    1460     133
Actual 1     171     236

Cross-validation results:
                                                params  mean_test_score  \
25  {'max_depth': None, 'max_features': 'sqrt', 'm...         0.612538   
52  {'max_depth': None, 'max_features': 'log2', 'm...         0.612538   
70  {'max_depth': 10, 'max_features': 'sqrt', 'min...         0.612044   
97  {'max_depth': 10, 'max_features': 'log2', 'min...         0.612044   
75  {'max_depth': 10, 'max_features': 'sqrt', 'min...         0.611967   

    std_test_score  
25        0.020558  
52        0.020558  
70        0.020671  
97        0.020671  
75        0.018046  


In [100]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# --- XGBoost: hyper-parameter search and hold-out evaluation ----------------
# Weight the positive class by the negative/positive ratio of the training
# labels to compensate for class imbalance.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# `use_label_encoder` was dropped: the installed XGBoost reports it as an
# unused parameter, so passing it only produces a warning on every fit.
xgb = XGBClassifier(
    objective="binary:logistic",
    # eval_metric only drives evaluation-set reporting / early stopping; no
    # eval_set is passed to fit() below, so candidate selection is governed
    # by the grid search's scoring='f1', not by AUC.
    eval_metric="auc",
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    tree_method="hist"
)

param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],             # min loss reduction (regularization)
    'reg_alpha': [0, 0.5],       # L1 regularization
    'reg_lambda': [1, 5]         # L2 regularization
}

# Shuffled, seeded stratified folds; candidates are ranked by F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Score the refit best estimator on the held-out test split.
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1 Score': f1_score(y_test, y_pred)
}

metrics_df = pd.DataFrame([metrics])
print("\nClassification Metrics:\n", metrics_df)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
print("\nConfusion Matrix:\n", cm_df)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 1.0}

Classification Metrics:
    Accuracy  Precision    Recall  F1 Score
0     0.803   0.511384  0.717445  0.597137

Confusion Matrix:
           Pred 0  Pred 1
Actual 0    1314     279
Actual 1     115     292
