# **Description (English):**
This script handles the entire machine learning pipeline: data loading, preprocessing, feature engineering, training the customer churn prediction model, and then saving the trained model and data preprocessor (like ColumnTransformer) into .pkl files.
Important Note: You do NOT run this file on your deployment server! You run it once (or whenever you need to retrain the model) in a development environment like Google Colab. After successful execution, you must manually download the generated final_churn_predictor.pkl and data_preprocessor.pkl files and then upload them to your GitHub repository along with other project files.

# **الوصف (العربية):**
يتولى هذا السكريبت خط أنابيب تعلم الآلة بالكامل: تحميل البيانات، المعالجة المسبقة، هندسة الميزات، تدريب نموذج التنبؤ بمغادرة العملاء، ثم حفظ النموذج المُدرب ومعالج البيانات المسبق (مثل ColumnTransformer) في ملفات بصيغة .pkl.
ملاحظة هامة: أنت لا تقوم بتشغيل هذا الملف على خادم النشر الخاص بك! تقوم بتشغيله مرة واحدة فقط (أو كلما احتجت لإعادة تدريب النموذج) في بيئة تطوير مثل Google Colab. بعد التنفيذ الناجح، يجب عليك أن تقوم يدويًا بتنزيل ملفي final_churn_predictor.pkl و data_preprocessor.pkl اللذين تم إنشاؤهما، ثم رفعهما إلى مستودع GitHub الخاص بك جنبًا إلى جنب مع ملفات المشروع الأخرى.

In [None]:
# -*- coding: utf-8 -*-
# train_and_save_model.py

# --- 0. Dependency Installation (Google Colab Specific) ---
# These commands should be run in your Colab environment before running the rest of the code.
# You typically don't include these in the final script uploaded to GitHub for deployment.
# !pip install catboost
# !pip install imbalanced-learn
# !pip install shap
# !pip install kagglehub[pandas-datasets]

# --- 1. Import Essential Libraries ---
# This section imports all the necessary Python libraries for data manipulation,
# visualization (though plots are usually omitted in deployment scripts),
# machine learning model building, and saving/loading assets.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Used for visualization in Colab, can be removed for pure script
import seaborn as sns          # Used for visualization in Colab, can be removed for pure script
import joblib # Essential for saving and loading models
import kagglehub
from kagglehub import KaggleDatasetAdapter

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import warnings

# Suppress every library warning so the training log stays readable.
warnings.filterwarnings("ignore")

# Pandas console settings for development environments:
# show all columns, use a wide output line, and cap printed rows at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries loaded successfully for training!")

# --- 2. Data Loading and Initial Preprocessing ---
# The dataset is taken from the first source that yields a DataFrame, tried in
# this order: KaggleHub, a GitHub raw URL, then a CSV in the working directory.
# A fallback is attempted only while `df` is still None, so the sources are
# checked one after another without nesting try/except blocks.
df = None
file_path_telco = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
# NOTE(review): confirm this fallback URL still resolves.
github_data_url = 'https://raw.githubusercontent.com/IBM/telco-customer-churn-extra-data/master/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Source 1: KaggleHub. Any failure is reported and the next source is tried.
try:
    df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, "blastchar/telco-customer-churn", file_path_telco)
except Exception as e:
    print(f"Error loading Telco dataset with KaggleHub: {e}")
else:
    print("Dataset loaded successfully using KaggleHub (Telco Customer Churn)!")

# Source 2: GitHub raw URL.
if df is None:
    try:
        df = pd.read_csv(github_data_url)
    except Exception as e:
        print(f"Error loading dataset from GitHub URL: {e}")
    else:
        print("Dataset loaded successfully from GitHub URL!")

# Source 3: local CSV with the same file name as the Kaggle asset.
# Only a missing file is handled here; any other read error propagates.
if df is None:
    try:
        df = pd.read_csv(file_path_telco)
    except FileNotFoundError:
        print(f"CRITICAL ERROR: '{file_path_telco}' not found.")
    else:
        print("Dataset loaded successfully from local file!")

# Nothing worked: stop the script instead of failing later on a None DataFrame.
if df is None:
    raise SystemExit("Dataset not loaded, stopping execution.")

# Print a quick overview of the raw data: first rows, dtypes and null counts.
print("\n--- Initial Data Snapshot ---")
print(df.head())
print("\n--- Data Info ---")
df.info()
print("\n--- Missing Values Check ---")
print(df.isnull().sum())

# Convert 'TotalCharges' to numeric; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Impute the NaNs with the column median (computed over the full dataset, i.e.
# before the train/test split). The result is assigned back explicitly:
# calling fillna(inplace=True) on the df['TotalCharges'] selection operates on
# an intermediate object and does not update `df` under pandas Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
df.drop('customerID', axis=1, inplace=True)  # Drop unique ID column
# Convert the target to binary; any value other than 'Yes'/'No' maps to NaN.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print("\n'Churn' column converted to numeric.")

# --- 3. Exploratory Data Analysis (EDA) - Optional for Script ---
# This section contains plotting code for EDA. It's useful in a Jupyter/Colab notebook
# but can be omitted from a script that's only meant to train and save a model.
# Visualizations help in understanding the data but are not needed at the model's runtime.

# --- 4. Feature Engineering ---
# New features are derived from existing columns to enhance model performance.
# The same derivations must also be applied in the API to incoming prediction
# requests, so any change here has to be mirrored there.
# All features are computed with vectorized column operations rather than
# per-row `apply` calls; the resulting values and int64 dtypes are the same.
print("\n--- Feature Engineering ---")

# Number of service columns whose value is exactly 'Yes' for each customer.
# NOTE(review): 'InternetService' is treated below as "anything other than 'No'",
# which suggests it holds service types rather than 'Yes' and so likely never
# contributes to this count - confirm before changing, since the API must match.
service_columns = ['PhoneService', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies']
df['TotalServices'] = (df[service_columns] == 'Yes').sum(axis=1)

# The 1e-6 epsilon avoids division by zero; rows with tenure 0 therefore get a
# very large ratio (MonthlyCharges * 1e6).
df['MonthlyChargePerTenure'] = df['MonthlyCharges'] / (df['tenure'] + 1e-6)

# Binary flags (1/0) derived from single columns.
df['HasInternetService'] = (df['InternetService'] != 'No').astype('int64')
df['HasMultipleLines'] = (df['MultipleLines'] == 'Yes').astype('int64')

# 1 only when the customer is flagged as senior (SeniorCitizen == 1) and has a partner.
df['IsSeniorCitizen_Married'] = (
    (df['SeniorCitizen'] == 1) & (df['Partner'] == 'Yes')
).astype('int64')

print("\n--- Features after Engineering ---")
print(df[['tenure', 'MonthlyCharges', 'TotalCharges', 'TotalServices',
          'MonthlyChargePerTenure', 'HasInternetService', 'HasMultipleLines',
          'SeniorCitizen', 'Partner', 'IsSeniorCitizen_Married', 'Churn']].head())


# --- 5. Data Splitting and Imbalance Handling ---
# The features/target are separated and a ColumnTransformer is defined that
# scales numeric columns and one-hot encodes object (string) columns.
# The data is later split into train/test sets and SMOTE is applied to the
# training portion to balance the 'Churn' classes.
X = df.drop('Churn', axis=1)
y = df['Churn']

# Column groups are derived from dtypes: numeric columns are scaled,
# object columns are one-hot encoded.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore': categories unseen during fit encode as all zeros
# instead of raising at prediction time.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold the transformer
    # switches to a SciPy sparse matrix when the combined output is sparse
    # enough, which cannot be passed to pd.DataFrame(..., columns=...) as the
    # rest of the script does.
    sparse_threshold=0
)

# Stratified 80/20 split so both partitions keep the original churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nOriginal X_train shape: {X_train.shape}")

# The preprocessor learns its statistics from the training rows only;
# the test rows are transformed with those fitted statistics.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild the output column names in the transformer's order:
# numeric columns, one-hot encoded columns, then any passthrough columns.
fitted_encoder = preprocessor.named_transformers_['cat']['onehot']
ohe_feature_names = fitted_encoder.get_feature_names_out(categorical_features)
handled_features = numeric_features + categorical_features
passthrough_features = [col for col in X.columns if col not in handled_features]  # Expected to be empty
all_feature_names = [*numeric_features, *ohe_feature_names, *passthrough_features]

# Wrap the transformed arrays in DataFrames that keep the original row index.
X_train_processed_df = pd.DataFrame(
    data=X_train_processed,
    index=X_train.index,
    columns=all_feature_names,
)
X_test_processed_df = pd.DataFrame(
    data=X_test_processed,
    index=X_test.index,
    columns=all_feature_names,
)

# Oversample with SMOTE on the training data only; the test set is untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_processed_df, y_train)
X_train_resampled, y_train_resampled = resampled_pair
print(f"\nResampled X_train shape: {X_train_resampled.shape}")

# --- 6. Build and Train the Model ---
# A LightGBM classifier is trained on the SMOTE-resampled training data. This can
# be extended to train multiple models (XGBoost, CatBoost) and select the best
# one based on evaluation metrics.
print("\n--- Training LightGBM Model for Saving ---")
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
)
lgb_model.fit(X_train_resampled, y_train_resampled)

# Report performance on the held-out test split before the model is saved, so a
# bad training run is visible in the log. The test rows were neither resampled
# nor used for fitting.
y_test_pred = lgb_model.predict(X_test_processed_df)
y_test_proba = lgb_model.predict_proba(X_test_processed_df)[:, 1]  # probability of class 1 (churn)
print("\n--- Held-out Test Set Evaluation ---")
print(f"Accuracy : {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall   : {recall_score(y_test, y_test_pred):.4f}")
print(f"F1 score : {f1_score(y_test, y_test_pred):.4f}")
print(f"ROC AUC  : {roc_auc_score(y_test, y_test_proba):.4f}")
print("Confusion matrix (rows = actual, columns = predicted):")
print(confusion_matrix(y_test, y_test_pred))

# --- 7. Save the Model and Preprocessor ---
# Persist the two artifacts needed for deployment as .pkl files: the trained
# classifier and the fitted preprocessor that turns raw input rows into the
# feature matrix the classifier expects. The FastAPI application loads both.
print("\n--- Saving Model and Preprocessor ---")
for artifact, artifact_path in (
    (lgb_model, 'final_churn_predictor.pkl'),
    (preprocessor, 'data_preprocessor.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("Model and preprocessor saved successfully! Please download these .pkl files from Colab.")

print("\n'train_and_save_model.py' script finished. Remember to download .pkl files.")