In [1]:
import os

In [2]:
%pwd

'c:\\Users\\abdus samad\\Desktop\\Churn_ML\\Churn_ML_Project\\research'

In [4]:
os.chdir('..')
%pwd

'c:\\Users\\abdus samad\\Desktop\\Churn_ML\\Churn_ML_Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [7]:
from src.Churn_Predictor.constants import *
from src.Churn_Predictor.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path
        )
        return data_transformation_config

In [18]:
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
from src.Churn_Predictor import logger

In [19]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    

    def initiate_data_transformation(self):
        logger.info("Reading data from csv file")
        df = pd.read_csv(self.config.data_path)


        logger.info("Dropping the 'customerID' column from the dataset")
        df.drop('customerID', axis=1, inplace=True)

        logger.info("Finding Null values in the dataset")
        null_counts = df.isnull().sum()
        logger.info(f"Null value counts:\n{null_counts}")

        logger.info("Converting 'TotalCharges' to numeric, coercing errors to NaN")
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
        logger.info("Finding Null values in 'TotalCharges' after conversion")
        total_charges_null_count = df['TotalCharges'].isnull().sum()
        logger.info(f"Null value count in 'TotalCharges' after conversion: {total_charges_null_count}")

        logger.info("Filling mean_values in 'TotalCharges' Null Rows")
        mean_total_charges = df['TotalCharges'].mean()
        df['TotalCharges'].fillna(mean_total_charges, inplace=True)
        logger.info("Mean value filled in 'TotalCharges' Null Rows")

        logger.info("Finding the Duplicated values in the dataset")
        duplicated_count = df.duplicated().sum()
        logger.info(f"Duplicated value count: {duplicated_count}")

        logger.info("Dropping the Duplicated values in the dataset")
        df.drop_duplicates(keep='first', inplace=True)
        logger.info("Duplicated values dropped")

        return df
    
    def initiate_data_preprocessing(self, df):
        logger.info("Preparing data for preproceesing")
        df = df.replace({
            'No internet service': 'No',
            'No phone service': 'No'
            })
        
        binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 
               'PaperlessBilling', 'MultipleLines', 'OnlineSecurity', 
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies']
        
        multi_cols = ['InternetService', 'Contract', 'PaymentMethod']

        le_target = LabelEncoder()
        df['Churn'] = le_target.fit_transform(df['Churn'])

        for cols in binary_cols:
            le = LabelEncoder()
            df[cols] = le.fit_transform(df[cols])
        
        df = pd.get_dummies(data=df, columns=multi_cols, drop_first=True)
        logger.info("Data preprocessing completed")

        logger.info(f"Final info of the dataframe after preprocessing: {df.shape}")
        logger.info(f"Final columns of the dataframe after preprocessing: {df.columns.to_list()}")
        return df

    def initiate_train_test_split(self, df):
        logger.info("Initiating train test split")
        train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Churn'])
        logger.info("Train test split completed")

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info(f"Train and test data saved in {self.config.root_dir}")
        logger.info(f"Train data shape: {train.shape}")
        logger.info(f"Test data shape: {test.shape}")
        

In [24]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    df = data_transformation.initiate_data_transformation()
    df = data_transformation.initiate_data_preprocessing(df)
    data_transformation.initiate_train_test_split(df)
except Exception as e:
    logger.exception(f"An error occurred during data transformation: {e}")

[2026-02-13 16:26:47,732: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-13 16:26:47,734: INFO: common: yaml file: params.yaml loaded successfully]
[2026-02-13 16:26:47,738: INFO: common: yaml file: schema.yaml loaded successfully]
[2026-02-13 16:26:47,739: INFO: common: created directory at: artifacts]
[2026-02-13 16:26:47,740: INFO: common: created directory at: artifacts/data_transformation]
[2026-02-13 16:26:47,741: INFO: 1367153455: Reading data from csv file]
[2026-02-13 16:26:47,781: INFO: 1367153455: Dropping the 'customerID' column from the dataset]
[2026-02-13 16:26:47,784: INFO: 1367153455: Finding Null values in the dataset]
[2026-02-13 16:26:47,791: INFO: 1367153455: Null value counts:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
S

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(mean_total_charges, inplace=True)
