In [1]:
import sys, os
# Make the project root importable: go two levels up from the current working
# directory (data_preprocess → src → project root). This depends on the kernel's
# working directory being the notebook's own folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
from src.data_ingestion.data_loader import load_data

## Data Preprocessing

### Data Cleaning

In [2]:
# Load the raw churn dataset through the project's loader; only the file name is
# passed here, and the output below shows the full path that was resolved.
data = load_data("Churn_Modelling.csv")

Successfully loaded: c:\Users\krish\Desktop\Churn Analysis\data\raw\Churn_Modelling.csv


In [3]:
# Work on a copy so the frame returned by the loader (`data`) stays untouched
# and can serve as a reference; print both shapes to confirm they match.
df = data.copy()
for label, frame in (
    ("Original Dataset size", data),
    ("Duplicate Dataset size", df),
):
    print(label)
    print("----------------")
    print(frame.shape)

Original Dataset size
----------------
(10002, 14)
Duplicate Dataset size
----------------
(10002, 14)


In [4]:
# Removing useless columns

def drop_useless_cols(df, cols_to_drop=['CustomerId', 'Surname']):
    """Return a new DataFrame without the columns named in ``cols_to_drop``.

    Names that are not present in ``df`` are skipped silently, so the call is
    safe to repeat on a frame that has already been trimmed. The input frame
    is left as it was.
    """
    # errors="ignore" makes pandas skip labels that do not exist, which replaces
    # the manual "is this column present?" filter.
    return df.drop(columns=list(cols_to_drop), errors="ignore")

# Apply the default drop list, then show the remaining columns and the shape.
# NOTE(review): the printed index below still contains 'RowNumber', which the
# default list does not cover. Confirm whether it should be dropped too, since
# it is carried into the later scaling and train/test split as a feature.
df = drop_useless_cols(df)
print(df.columns)
print(df.shape)

Index(['RowNumber', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')
(10002, 12)


In [5]:
# Handling missing values

# Total number of missing cells across the whole frame, before imputation.
print("no.of missing values:",df.isnull().sum().sum())
def fill_missing_values(df):
    """Impute missing values and return the result as a new DataFrame.

    * ``Age`` is filled with the column mean.
    * ``Geography``, ``HasCrCard`` and ``IsActiveMember`` are filled with their
      most frequent value (the first entry returned by ``Series.mode()``).

    Columns that are absent from ``df`` are skipped. The input frame is not
    modified. A column with no observed values at all is left unchanged,
    because there is nothing to impute it with.
    """
    # Work on a copy so the caller's frame is never written to.
    filled = df.copy()

    # Fill Age with mean
    if 'Age' in filled.columns:
        filled['Age'] = filled['Age'].fillna(filled['Age'].mean())

    # Columns to fill with mode
    cols_mode = ['Geography', 'HasCrCard', 'IsActiveMember']
    for col in cols_mode:
        if col not in filled.columns:
            continue
        modes = filled[col].mode()
        # mode() is empty when every value is missing; indexing it would raise.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled
# Impute, then re-count the missing cells to confirm that none are left.
df = fill_missing_values(df)
print("after hadling:",df.isnull().sum().sum())


no.of missing values: 4
after hadling: 0


In [6]:
# Handling Duplicates

def remove_duplicates(df):
    """Drop fully duplicated rows and report the duplicate count before and after.

    Returns the de-duplicated frame produced by ``drop_duplicates()``; the
    frame passed in is not altered.
    """
    dup_count = df.duplicated().sum()
    print("No. of duplicates:", dup_count)

    deduped = df.drop_duplicates()

    # Re-check on the result so the printed figure verifies the cleanup.
    remaining = deduped.duplicated().sum()
    print("After deleting duplicates:", remaining)
    return deduped
# Replace the working frame with its de-duplicated version; the counts printed
# by the function appear in the output below.
df = remove_duplicates(df)

No. of duplicates: 2
After deleting duplicates: 0


In [7]:
# convert float dtype columns into int64 

def convert_float_to_int(df):
    """Return a new DataFrame with every float column cast to ``int64``.

    The cast truncates toward zero (``1.9`` becomes ``1``), so fractional parts
    are discarded. Non-float columns are left alone and the input frame is not
    modified.

    ``astype`` raises if a float column still contains NaN or infinite values,
    since those have no integer representation, so impute missing values
    before calling this.
    """
    float_cols = df.select_dtypes(include='float').columns
    # A single astype call with a column -> dtype mapping builds a new frame
    # instead of writing each converted column back into the caller's object.
    return df.astype({col: 'int64' for col in float_cols})
# Cast the float columns to int64 and list the resulting dtypes.
# NOTE(review): the cast drops fractional parts. Confirm that is acceptable for
# any column that held fractional values (Balance and EstimatedSalary look like
# candidates; verify against the raw data).
df = convert_float_to_int(df)
print(df.dtypes)

RowNumber           int64
CreditScore         int64
Geography          object
Gender             object
Age                 int64
Tenure              int64
Balance             int64
NumOfProducts       int64
HasCrCard           int64
IsActiveMember      int64
EstimatedSalary     int64
Exited              int64
dtype: object


### Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(df, columns=['Geography', 'Gender']):
    """Return a new DataFrame with the given columns replaced by integer codes.

    Each listed column that exists in ``df`` is passed through its own
    ``LabelEncoder``; names not present in ``df`` are skipped. The input frame
    is not modified.

    The fitted encoders are not returned, so the code -> label mapping cannot
    be inverted later from this function's output alone.

    NOTE(review): integer codes impose an ordering on the encoded values. For
    an unordered category such as Geography, one-hot encoding may be a better
    fit for some models; confirm against the modelling plan.
    """
    encoded = df.copy()
    for col in columns:
        if col in encoded.columns:
            # A fresh encoder per column keeps each fit independent instead of
            # refitting (and overwriting) one shared instance.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded
# Encode the default columns ('Geography' and 'Gender') as integer codes.
df = label_encode_columns(df)

In [18]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

def scale_features(df, columns=None, target_col='Exited'):
    """Return a new DataFrame with the chosen columns standardised.

    Parameters
    ----------
    df : DataFrame to scale; it is not modified.
    columns : column names to scale. When ``None``, every ``int64`` column
        except ``target_col`` is scaled.
    target_col : name of the label column to keep out of the automatic
        selection. Standardising a 0/1 label turns it into non-integer floats,
        which no longer represent class labels. Pass ``None`` to disable the
        exclusion. An explicit ``columns`` list is always honoured as given.

    Returns
    -------
    A copy of ``df`` whose selected columns hold the ``StandardScaler`` output.
    If no column is selected, the copy is returned unchanged.

    NOTE(review): the scaler is fitted on whatever rows are passed in. Fitting
    it on the full dataset before a train/test split lets test-set statistics
    influence the training features; consider fitting on the training split.
    """
    if columns is None:
        columns = [
            col for col in df.select_dtypes(include='int64').columns
            if col != target_col
        ]
    else:
        columns = list(columns)

    scaled = df.copy()
    # StandardScaler rejects an empty feature set, so there is nothing to do.
    if not columns:
        return scaled

    scaled[columns] = StandardScaler().fit_transform(scaled[columns])
    return scaled
# Standardise the working frame using the function's default column selection.
# NOTE(review): this runs on the full dataset before the train/test split in the
# next section, so the scaler sees the rows that later become the test set.
# Consider fitting it on the training split only.
df = scale_features(df)

## Model Building

### Data Splitting

In [20]:
from sklearn.model_selection import train_test_split

def split_data(df, target_col='Exited', test_size=0.2, random_state=42):
    """Split ``df`` into train/test features and target, stratified on the target.

    Everything except ``target_col`` is treated as a feature. ``test_size`` and
    ``random_state`` are forwarded to ``train_test_split``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    # Stratifying on the target keeps the class proportions alike in both parts.
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )
    return tuple(parts)
# Split with the function's defaults (test_size=0.2, random_state=42), using
# 'Exited' as the target, then print the four shapes as a sanity check.
X_train, X_test, y_train, y_test = split_data(df, target_col='Exited')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(8000, 11) (2000, 11) (8000,) (2000,)
