In [None]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the raw Telco customer-churn CSV into a DataFrame
data_path = "../data/raw/Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the top five rows of the dataset
df.head(n=5)

In [None]:
# Print a summary of the DataFrame: number of entries plus each column's dtype
# and non-null count, which is a quick way to spot columns with missing values
df.info()

In [None]:
# Convert 'TotalCharges' to a numeric dtype; errors='coerce' turns any entry
# that cannot be parsed as a number into NaN instead of raising
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [None]:
# Fill missing 'TotalCharges' with the column median (a no-op when nothing is missing).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['TotalCharges'] selection: that chained
# in-place call raises a FutureWarning on pandas 2.x and, with Copy-on-Write
# (the default from pandas 3.0), operates on a temporary and leaves df unchanged.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [None]:
# Show every column's dtype so the numeric conversion can be verified
print("Data types after conversion:", df.dtypes, sep="\n")

In [None]:
# Integer-encode each categorical column with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders`, keyed by column name.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
]
label_encoders = {name: LabelEncoder() for name in categorical_columns}

for column in categorical_columns:
    # Fit on the column's current values and replace them with integer codes
    df[column] = label_encoders[column].fit_transform(df[column])

In [None]:
# Rescale the numeric features with StandardScaler (zero mean, unit variance)
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Learn the per-column statistics first, then apply them to the same columns
scaler = StandardScaler().fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

In [None]:
# Save the processed data as CSV, without the DataFrame index
processed_data_path = "../data/processed/cleaned_data.csv"
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (harmless when it is already there)
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_data_path, index=False)
print("\nData preprocessing completed and saved to cleaned_data.csv")