# Task 3 - Customer Churn Prediction (Telco Dataset)

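This notebook cleans and one-hot encodes the Telco customer churn dataset, trains a random forest classifier on a stratified 70/30 train/test split, and reports accuracy, a confusion matrix, a classification report, and feature importances.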

## 1. Setup & Load

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Location of the churn CSV (adjust path as needed)
csv_path = 'Churn_Modelling.csv'

# Read the raw data into `df` and preview the first five rows
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 2. Cleaning & Encoding

In [2]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run without side effects.
df_telco = df.copy()

# Remove non-feature identifier
if 'customerID' in df_telco.columns:
    df_telco = df_telco.drop(columns=['customerID'])

# Ensure TotalCharges is numeric; entries that cannot be parsed become NaN
if 'TotalCharges' in df_telco.columns:
    df_telco['TotalCharges'] = pd.to_numeric(df_telco['TotalCharges'], errors='coerce')

# Drop rows with missing values (including those created by the coercion)
df_telco = df_telco.dropna()

# Map target to binary. Series.map turns any label outside the mapping into
# NaN, and this step runs after dropna(), so an unexpected label would leak a
# NaN target into `y`. Fail loudly here instead of later inside model fitting.
churn_binary = df_telco['Churn'].map({'Yes': 1, 'No': 0})
unmapped = churn_binary.isna()
if unmapped.any():
    bad_labels = df_telco.loc[unmapped, 'Churn'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Churn' (expected 'Yes'/'No'): {bad_labels}")
df_telco['Churn'] = churn_binary

# One-hot encode the remaining object-typed columns; drop_first=True omits the
# first level of each column so the dummies are not perfectly collinear.
cat_cols = df_telco.select_dtypes(include='object').columns.tolist()
df_telco = pd.get_dummies(df_telco, columns=cat_cols, drop_first=True)

# Feature matrix and binary target
X = df_telco.drop(columns=['Churn'])
y = df_telco['Churn']

# Show the feature-matrix shape and the churn rate (mean of the 0/1 target)
X.shape, y.mean()

((7032, 30), np.float64(0.26578498293515357))

## 3. Train/Test Split

In [5]:
# Hold out 30% of the rows for testing; stratifying on y keeps the churn rate
# the same in both splits, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 4. Model Training & Evaluation

In [4]:
# Fit a 400-tree random forest on the training split. `fit` returns the
# estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(random_state=42, n_estimators=400).fit(X_train, y_train)

# Predict on the held-out split and compute the evaluation metrics
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results
print('Accuracy:', round(acc,4))
print('Confusion Matrix:\n', cm)
print('\nClassification Report:\n', report)

Accuracy: 0.7825
Confusion Matrix:
 [[1384  165]
 [ 294  267]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.86      1549
           1       0.62      0.48      0.54       561

    accuracy                           0.78      2110
   macro avg       0.72      0.68      0.70      2110
weighted avg       0.77      0.78      0.77      2110



## 5. Feature Importances

In [None]:
# Pair every feature column with the importance the fitted forest assigned to it
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})

# Rank from most to least important, renumbering the rows 0..n-1
importances = importance_table.sort_values('Importance', ascending=False, ignore_index=True)

# Show the 20 highest-ranked features
importances.head(20)

## 6. Save Model (Optional)

In [None]:
# Persist the fitted random forest to the working directory so it can be
# reloaded later without retraining.
joblib.dump(rf, 'rf_telco_churn.joblib')