# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# -------------------- 1. Load & Explore the Dataset --------------------

# Load the raw churn data; the path is resolved relative to the working directory.
df = pd.read_csv("Telco-Customer-Churn.csv")

# Display first few rows
print("\nDataset Preview:\n", df.head())

# Check dataset structure.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:\n")
df.info()

# Count missing values per column
print("\nMissing Values:\n", df.isnull().sum())

# Summary statistics of the numeric columns
print("\nDataset Summary:\n", df.describe())

# -------------------- 2. Exploratory Data Analysis (EDA) --------------------

# Class balance of the target column.
sns.countplot(x='Churn', data=df)
plt.title("Churn Distribution")
plt.show()

# Churn counts broken out per subscription plan.
sns.countplot(x='Subscription Plan', hue='Churn', data=df)
plt.title("Churn Rate by Subscription Plan")
plt.show()

# One box plot per numeric column, grouped by churn label.
for numeric_column in ('Monthly Charges', 'Total Usage Hours'):
    sns.boxplot(x='Churn', y=numeric_column, data=df)
    plt.title(f"{numeric_column} vs Churn")
    plt.show()

# ---------------- 3. Data Cleaning & Preprocessing ----------------

# Drop every row that contains at least one missing value.
df = df.dropna()

# Encode categorical variables as integer codes.
# Each column gets its own encoder so its fitted mapping stays available
# (e.g. for inverse_transform); refitting one shared encoder would discard it.
# LabelEncoder assigns codes in sorted label order, so the plan codes are
# alphabetical rather than ordered by tier -- presumably Female=0/Male=1,
# Basic=0/Premium=1/Standard=2 and No=0/Yes=1; verify against the actual labels.
label_encoders = {}
for column in ('Gender', 'Subscription Plan', 'Churn'):
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep the original name bound to the last-fitted (Churn) encoder.
label_encoder = label_encoders['Churn']

# Define feature matrix (X) and target variable (y).
# X holds the unscaled features; scaling is applied to the splits below.
X = df.drop(['Customer ID', 'Churn'], axis=1)
y = df['Churn']

# Split dataset into train and test sets (80% train, 20% test).
# stratify=y keeps the churn/non-churn ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize numerical features.
# The scaler is fit on the training rows only and then applied to the test
# rows, so no test-set statistics leak into preprocessing.
numeric_columns = ['Monthly Charges', 'Total Usage Hours']
scaler = StandardScaler()
# Work on explicit copies so the assignments below do not write into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# -------------------- 4. Model Training --------------------

# Build the 100-tree random forest and fit it on the training split in one
# expression (fit() returns the fitted estimator).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted churn labels for the held-out test rows.
y_pred = rf_model.predict(X_test)

# -------------------- 5. Model Evaluation --------------------

# Pair each heading with its metric, then print them in order:
# confusion matrix, per-class report, overall accuracy.
evaluation_outputs = (
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
    ("\nAccuracy Score:", accuracy_score(y_test, y_pred)),
)
for heading, result in evaluation_outputs:
    print(heading, result)

# -------------------- 6. Feature Importance --------------------

# Tabulate the forest's per-feature importance, most important first.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importances:\n", feature_importances)

# Horizontal bar chart of the same table (importance on x, feature name on y).
sns.barplot(data=feature_importances, x='Importance', y='Feature')
plt.title("Feature Importance for Churn Prediction")
plt.show()

# -------------------- 7. Insights & Recommendations --------------------

# NOTE(review): these messages are fixed text; nothing here is computed from
# the plots, metrics or feature importances above -- confirm they match the data.
insight_lines = (
    "\n🔹 Insights & Recommendations:",
    "1️⃣ Customers with **low Total Usage Hours** are more likely to churn. Encourage engagement with personalized content.",
    "2️⃣ **Basic plan customers** show higher churn rates. Consider offering discounts or premium trials.",
    "3️⃣ Customers with **higher support tickets** tend to churn. Improve customer service.",
    "4️⃣ **High monthly charges** customers are also churning. Introduce loyalty rewards to retain them.",
)
for insight_line in insight_lines:
    print(insight_line)
