In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Train a Random Forest churn classifier on the Telco customer CSV and persist
# the fitted model, the ordered feature list, the scaler and the feature
# importances as pickle files in the working directory.

# Load dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Convert TotalCharges to numeric (fixing object dtype issue); entries that
# cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill missing values, including the NaNs produced by the coercion above.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.fillna(0)

# Create Customer Lifetime Value (CLV) before splitting data
df["CLV"] = df["MonthlyCharges"] * df["tenure"]

# Pull the target out so it is not one-hot encoded together with the features
churn_column = df["Churn"]
df = df.drop(columns=["Churn"])

# Drop the per-customer identifier before encoding. get_dummies expands every
# object column, so an ID column would contribute one dummy per distinct
# customer -- thousands of features that carry no generalisable signal.
# NOTE(review): assumes the identifier column is named "customerID" as in the
# public Telco churn dataset; errors="ignore" makes this a no-op otherwise.
df = df.drop(columns=["customerID"], errors="ignore")

# One-Hot Encode categorical features efficiently
df = pd.get_dummies(df, drop_first=True)

# Record the encoded feature names. The set is kept only for membership
# checks: the frame must NOT be rebuilt by iterating it, because string-set
# iteration order changes between interpreter sessions (hash randomisation)
# and would make the column order -- and thus the trained forest and the
# saved feature list -- non-reproducible.
expected_features = set(df.columns)

# Add 'Churn' column back to the DataFrame
df["Churn"] = churn_column

# Separate features & target (1 for "Yes", 0 for anything else)
X = df.drop(columns=["Churn"])
y = df["Churn"].eq("Yes").astype("int64")

# Split data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take independent copies before assigning scaled columns so the writes below
# never target a view of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: fit on the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
numerical_features = ["tenure", "MonthlyCharges", "TotalCharges", "CLV"]
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Train the model (Random Forest)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 🔥 Save Feature Importance for Insights
feature_importance = rf_model.feature_importances_
importance_df = pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importance})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Save Model, Features & Scaler
with open("Best_Model_Forest_new.pkl", "wb") as f:
    pickle.dump(rf_model, f)

# The feature list is saved in training column order.
with open("model_features.pkl", "wb") as f:
    pickle.dump(X_train.columns.tolist(), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# ✅ Save Feature Importance Separately
with open("feature_importance.pkl", "wb") as f:
    pickle.dump(importance_df, f)

print("✅ Model, scaler, features, and feature importance saved successfully!")


✅ Model, scaler, features, and feature importance saved successfully!
