In [1]:
import pandas as pd
import numpy as np

# Load cleaned dataset from Day 4
df = pd.read_csv("../data/customer_churn_cleaned.csv")

print("\n--- DATA BEFORE OUTLIER HANDLING ---")
print(df.describe())

# Select numeric columns
numeric_columns = df.select_dtypes(include=["int64", "float64"]).columns

# IQR method to cap outliers
for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    df[col] = np.where(df[col] < lower_limit, lower_limit, df[col])
    df[col] = np.where(df[col] > upper_limit, upper_limit, df[col])

print("\n--- DATA AFTER OUTLIER HANDLING ---")
print(df.describe())

# Save outlier-handled dataset
df.to_csv("../data/customer_churn_no_outliers.csv", index=False)

print("\nDAY 5 COMPLETED: Outliers handled successfully")



--- DATA BEFORE OUTLIER HANDLING ---
       SeniorCitizen       tenure  MonthlyCharges  TotalCharges
count    7043.000000  7043.000000     7043.000000   7043.000000
mean        0.162147    32.371149       64.761692   2281.916928
std         0.368612    24.559481       30.090047   2265.270398
min         0.000000     0.000000       18.250000     18.800000
25%         0.000000     9.000000       35.500000    402.225000
50%         0.000000    29.000000       70.350000   1397.475000
75%         0.000000    55.000000       89.850000   3786.600000
max         1.000000    72.000000      118.750000   8684.800000

--- DATA AFTER OUTLIER HANDLING ---
       SeniorCitizen       tenure  MonthlyCharges  TotalCharges
count         7043.0  7043.000000     7043.000000   7043.000000
mean             0.0    32.371149       64.761692   2281.916928
std              0.0    24.559481       30.090047   2265.270398
min              0.0     0.000000       18.250000     18.800000
25%              0.0     9.00