In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Import the dataset
file_path = "C:\\Users\\MUBASHIR KHAN\\Desktop\\jupyter\\DMV\\WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(file_path)

# 2. Explore the dataset
print("Dataset Head:\n", df.head())
# DataFrame.info() writes its report directly to stdout and returns None, so
# passing it to print() would emit the report *before* the header and then
# print a stray "None". Print the header first and call info() on its own.
print("\nDataset Info:")
df.info()
print("\nSummary Statistics:\n", df.describe())

# 3. Handle missing values
# Report how many nulls each column holds before touching anything.
missing_counts = df.isnull().sum()
print("\nMissing Values:\n", missing_counts)

# Impute nulls in the columns pandas currently sees as numeric, using each
# column's own median. Columns stored as text are not touched here.
numeric_columns = df.select_dtypes(include=[np.number]).columns
numeric_medians = df[numeric_columns].median()
df[numeric_columns] = df[numeric_columns].fillna(numeric_medians)

# 4. Remove duplicate records
# Exact duplicate rows are dropped in place; the first occurrence is kept.
df.drop_duplicates(inplace=True)

# 5. Check for inconsistent data and standardize
# 'TotalCharges' may contain blank strings; errors='coerce' converts every
# value that cannot be parsed as a number into NaN so the column becomes numeric.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Rechecking missing values after conversion
print("\nMissing Values after conversion:\n", df.isnull().sum())
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['TotalCharges']: the in-place form operates on an intermediate Series,
# which raises a FutureWarning on recent pandas and leaves df unchanged once
# Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 6. Convert columns to the correct data types
# 'SeniorCitizen' is converted to boolean (any non-zero value becomes True).
df['SeniorCitizen'] = df['SeniorCitizen'].astype(bool)

# 7. Identify and handle outliers
# Rows whose 'tenure' lies outside the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are dropped.
Q1 = df['tenure'].quantile(0.25)
Q3 = df['tenure'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# .copy() makes the filtered frame independent of the original, so later
# column assignments on df do not trigger SettingWithCopyWarning.
df = df[(df['tenure'] >= lower_bound) & (df['tenure'] <= upper_bound)].copy()

# 8. Perform feature engineering
# 'TotalServices' is the number of services a customer subscribes to.
services = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
            'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# 'InternetService' stores the connection type (e.g. 'DSL', 'Fiber optic')
# rather than 'Yes', so comparing it with 'Yes' would never count it.
# NOTE(review): assumes 'No' is the marker for "no internet service" — confirm
# against the column's distinct values.
yes_no_services = [col for col in services if col != 'InternetService']
has_internet = df['InternetService'].notna() & df['InternetService'].ne('No')

# Vectorised count of 'Yes' flags per row (replaces a row-wise apply), plus
# one for customers that have an internet connection.
df['TotalServices'] = df[yes_no_services].eq('Yes').sum(axis=1) + has_internet.astype('int64')

# 9. Normalize or scale the data if necessary
# The listed numeric columns are standardised with StandardScaler and written
# back into df under the same column names.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test-set statistics influence the scaling — consider fitting
# on the training rows only if this feeds a model evaluation.
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'TotalServices']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values

# 10. Split the dataset into training and testing sets
# Target: 'Churn' encoded as 1 for 'Yes' and 0 for anything else.
y = df['Churn'].eq('Yes').astype('int64')
# Features: every remaining column.
X = df.drop(columns=['Churn'])
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 11. Export the cleaned dataset for future analysis or modeling
# The frame is written without its index column, then the destination is echoed.
cleaned_file_path = "C:\\Users\\MUBASHIR KHAN\\Desktop\\jupyter\\DMV\\Cleaned_Telco_Customer_Churn.csv"
df.to_csv(cleaned_file_path, index=False)

print("Cleaned dataset saved to {}".format(cleaned_file_path))

Dataset Head:
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies      