In [3]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# --- Step 1: Load Dataset ---
data = pd.read_csv("Telecom_Customer_Churn.csv")

# --- Step 2: Explore Dataset ---
print("Shape:", data.shape)
print("\nColumns:\n", data.columns)
print("\nMissing Values:\n", data.isnull().sum())
print("\nUnique Values per Column:\n", data.nunique())
print("\nSummary Stats:\n", data.describe())

# --- Step 3: Remove Duplicates ---
data = data.drop_duplicates()
print("\nAfter removing duplicates:", data.shape)

# --- Step 4: Handle Missing Values ---
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns

# Fill numeric missing values with median
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Fill categorical missing values with mode
for c in cat_cols:
    data[c] = data[c].fillna(data[c].mode()[0])

# --- Step 5: Fix Inconsistent Data & Data Types ---
for c in cat_cols:
    data[c] = data[c].astype(str).str.strip().str.lower()

if "TotalCharges" in data.columns:
    data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
    data["TotalCharges"].fillna(data["TotalCharges"].median(), inplace=True)


Shape: (7043, 21)

Columns:
 Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Unique Values per Column:
 customerID          7043
gender                 2
SeniorCitizen      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["TotalCharges"].fillna(data["TotalCharges"].median(), inplace=True)


In [7]:
# --- Step 6: Handle Outliers (IQR Method) ---
for c in num_cols:
    Q1 = data[c].quantile(0.25)
    Q3 = data[c].quantile(0.75)
    IQR = Q3 - Q1
    low, high = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    data[c] = data[c].clip(low, high)

# --- Step 7: Feature Engineering ---
if {"tenure", "TotalCharges"}.issubset(data.columns):
    data["avg_monthly_spend"] = (data["TotalCharges"] / (data["tenure"].replace(0, np.nan))).fillna(0)

if "tenure" in data.columns:
    data["tenure_group"] = pd.cut(data["tenure"],
                                  bins=[0, 12, 24, 48, 72],
                                  labels=["0-1yr", "1-2yr", "2-4yr", "4-6yr"])

# --- Step 8: Prepare Target Variable ---
if "Churn" in data.columns:
    data["Churn"] = data["Churn"].str.strip().str.lower().map({"yes": 1, "no": 0})

# --- Step 9: Encoding & Scaling ---
X = pd.get_dummies(data.drop("Churn", axis=1), drop_first=True)
y = data["Churn"]

scaler = StandardScaler()
X[X.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(X.select_dtypes(include=[np.number]))

# --- Step 10: Split into Train and Test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTrain Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

# --- Step 11: Export Cleaned Data ---
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
print("\n✅ Cleaned dataset saved as 'Cleaned_Telecom_Customer_Churn.csv'")



Train Shape: (5634, 7076)
Test Shape: (1409, 7076)

✅ Cleaned dataset saved as 'Cleaned_Telecom_Customer_Churn.csv'
