# Telco Customer Churn Prediction

This notebook reproduces the EDA, preprocessing, model training, and evaluation used to predict customer churn.

The project folder contains the trained models and the preprocessing pipeline, each saved with `joblib`.


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import joblib
import matplotlib.pyplot as plt

# Source CSV, given as a relative path (resolved against the working directory).
DATA_PATH = 'Telco-Customer-Churn-cleaned.csv'

# Read the whole file into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


In [None]:

# Basic preprocessing in this notebook cell
# Blank TotalCharges strings become NaN, then are filled with
# MonthlyCharges * tenure, falling back to MonthlyCharges if still missing.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', np.nan))
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure']).fillna(df['MonthlyCharges'])

# Drop the customerID column if it is present so it is not used as a feature.
if 'customerID' in df.columns:
    df = df.drop('customerID', axis=1)

# Encode the target as integers; `le` is kept so it can be persisted later.
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])
X = df.drop('Churn', axis=1)
y = df['Churn']

numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])
# No `sparse=`/`sparse_output=` argument on the encoder: the keyword was renamed
# between scikit-learn releases, and `sparse_threshold=0` on the
# ColumnTransformer below already guarantees a dense result.
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, numeric_features),('cat', cat_transformer, categorical_features)], remainder='drop', sparse_threshold=0)

# Split BEFORE fitting the preprocessor so that imputation medians, scaling
# statistics and one-hot categories are learned from the training rows only
# (fitting on the full data leaks test-set information into the features).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix, kept under its original name for any later use.
X_trans = preprocessor.transform(X)

print('Shapes:', X_train.shape, X_test.shape)


In [None]:

# Random forest: fit on the training split, then score the held-out split.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print('RandomForest accuracy:', rf_accuracy)

# Logistic regression, trained and scored on the same splits for comparison.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
lr_accuracy = accuracy_score(y_test, lr.predict(X_test))
print('LogisticRegression accuracy:', lr_accuracy)

# Per-class precision / recall / F1 for the random forest predictions.
rf_report = classification_report(y_test, y_pred)
print('\nRandomForest classification report:')
print(rf_report)


In [None]:

# Feature importance (RandomForest)
# The feature names are rebuilt in the order the transformers are listed in the
# ColumnTransformer: numeric columns first, then the expanded one-hot columns.
# Errors are deliberately not swallowed here: a silent fallback to the numeric
# names alone would only fail later with a length mismatch in pd.Series.
if categorical_features:
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_ohe_names = ohe.get_feature_names_out(categorical_features).tolist()
else:
    # No categorical columns means there are no one-hot columns to name.
    cat_ohe_names = []
feature_names = numeric_features + cat_ohe_names

importances = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)

# Save models and preprocessor
joblib.dump(rf, 'rf_churn_model.joblib')
joblib.dump(lr, 'lr_churn_model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')
joblib.dump(le, 'label_encoder_churn.joblib')
print('Saved models to current directory')

# Kept as the last expression so the notebook actually renders the table
# (a bare expression in the middle of a cell is never displayed).
importances.head(20)


In [None]:

# Example: reload the persisted artifacts and score the first 5 rows
rf2, prep2, le2 = (
    joblib.load(artifact_path)
    for artifact_path in (
        'rf_churn_model.joblib',
        'preprocessor.joblib',
        'label_encoder_churn.joblib',
    )
)

# Take the first five feature rows (target column removed) and transform them
# with the reloaded preprocessor before handing them to the reloaded model.
X_sample = df.drop(columns='Churn').head(5)
Xp = prep2.transform(X_sample)

preds = rf2.predict(Xp)
probs = rf2.predict_proba(Xp)[:, 1]  # second probability column

print('probs:', probs)
print('preds (0=no,1=yes):', preds)
