In [2]:
# Telco Customer Churn Prediction

# Step 1: Import Libraries
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Load Data
# Read the raw Telco churn CSV into a DataFrame and peek at the first rows.
csv_path = '../data/telco_churn.csv'
data = pd.read_csv(csv_path)
data.head()

# Step 3: Data Preprocessing
# Remove customerID before modelling.
data = data.drop(columns='customerID')

# Convert TotalCharges to numeric; values that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Handle missing values with the column median. The result is assigned back
# instead of calling fillna(inplace=True) on the selected column: that chained
# in-place form is deprecated and does not update `data` when pandas
# Copy-on-Write is enabled.
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

# Encode categorical variables as integer codes. Each column gets its own
# encoder, stored in `label_encoders`, so every mapping can be reused on new
# data or inverted later; a single shared encoder only remembers the last
# column it was fit on.
label_encoders = {}
for column in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Step 4 & 5: Train-Test Split, then Feature Scaling
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from the training rows only. Fitting it on the full dataset would
# leak test-set statistics into training and inflate the evaluation scores.
X = data.drop('Churn', axis=1)
y = data['Churn']

# stratify=y keeps the churn / no-churn ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
# Column names come from the feature frame itself rather than assuming that
# 'Churn' is the last column; the original row index is kept so the features
# stay aligned with y_train / y_test.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Step 6: Model Building
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Random Forest
# A fixed seed makes the forest (and therefore the reported metrics and the
# saved model) reproducible between runs, matching the seeded train-test split.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 7: Model Evaluation
# Predict on the held-out test set with both models.
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Print accuracy, confusion matrix and classification report for each model:
# Logistic Regression first, then Random Forest.
reports = (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Accuracy:", y_pred_rf),
)
for heading, predictions in reports:
    print(heading, accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

# Step 8: Save the Best Model
# Random Forest is assumed to be the better model; check the Step 7 metrics
# before relying on that assumption.
model_path = '../models/churn_model.pkl'
scaler_path = '../models/churn_scaler.pkl'

# Create the output directory up front so a missing folder does not fail the
# run after the models have already been trained.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(rf_model, file)

# The model was trained on standardized features, so the fitted scaler is
# saved as well; new data must go through the same transformation before
# prediction.
with open(scaler_path, 'wb') as file:
    pickle.dump(scaler, file)

# Step 9: Conclusion
# Logistic Regression is simpler and easier to interpret; Random Forest is expected to offer higher accuracy and better recall for churned customers (verify against the Step 7 metrics).
# The business can use this model to proactively target at-risk customers with offers and retention strategies.


ModuleNotFoundError: No module named 'pandas'