## Training model

In [23]:
import os
import joblib
import pandas as pd

# download data
df = pd.read_csv('C:/Users/hp/projects/Churn_Prediction_Project/data/telecome_churn.csv')

# clean 'TotalCharges' column
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna()

# preparation
X = df.drop("Churn", axis=1)
y = df["Churn"].map({'Yes': 1, 'No': 0})
X = pd.get_dummies(X, drop_first=True)

# حفظ الأعمدة
os.makedirs("models", exist_ok=True)
joblib.dump(X.columns, "models/model_columns.pkl")

['models/model_columns.pkl']

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "models/scaler.pkl")

['models/scaler.pkl']

In [None]:
# Import necessary libraries
import os
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from scipy.stats import uniform, randint


# Step 1: Load and preprocess the data
# Load the dataset
df = pd.read_csv('C:/Users/hp/projects/Churn_Prediction_Project/data/telecome_churn.csv')

# Handle missing values and data types
# Convert 'TotalCharges' to numeric, replacing empty strings with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)


df = df.drop(columns=['customerID'])
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Separate features (X) and target (y)
X = df.drop(columns=['Churn'])
y = df['Churn']

# Identify categorical and numerical columns for preprocessing
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Create a preprocessing pipeline for categorical and numerical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Apply preprocessing
X = preprocessor.fit_transform(X)
os.makedirs('models', exist_ok=True)
joblib.dump(preprocessor, 'models/preprocessor.pkl')
# Convert back to DataFrame for consistency
X = pd.DataFrame(X, columns=numerical_cols.tolist() + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)))

# UpSampling with SMOTEENN, ensuring random_state is set
sm = SMOTEENN(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

#  Split the data with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Define models and their hyperparameter grids 
models = [
    ('RandomForest', RandomForestClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__max_depth': [None, 10, 20]}),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__learning_rate': [0.05, 0.1, 0.5]}),
    ('SVM', SVC(random_state=42, class_weight='balanced', probability=True),
        {'model__C': [0.1, 1, 10],
         'model__gamma': ['scale', 'auto']}),
    ('LogisticRegression', LogisticRegression(random_state=42, class_weight='balanced'),
        {'model__C': [0.1, 1, 10],
         'model__penalty': ['l1', 'l2']}),
    ('DecisionTree', DecisionTreeClassifier(random_state=42),
        {'model__max_depth': [None, 10, 20],
         'model__min_samples_split': [2, 5, 10]}),
    ('AdaBoost', AdaBoostClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__learning_rate': [0.05, 0.1, 0.5]}),
    ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        {'model__max_depth': randint(3, 6),
         'model__learning_rate': uniform(0.01, 0.2),
         'model__n_estimators': randint(100, 300),
         'model__subsample': uniform(0.8, 0.2)}),
    ('NaiveBayes', GaussianNB(), {})  # No hyperparameters for Naive Bayes
]

# Set up MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Churn Prediction")

# Define the training and logging function
best_model = None
best_accuracy = 0.0
best_model_name = None
model_scores = []  

def train_and_log_model(name, model, param_grid, X_train, X_test, y_train, y_test):
    global best_model, best_accuracy, best_model_name
    
    with mlflow.start_run(run_name=name) as run:
        # Create a pipeline with scaler and model
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', model)
        ])

        # Hyperparameter tuning
        if name == 'XGBoost':
            random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid,
                                               n_iter=100, cv=3, verbose=0, random_state=42, n_jobs=-1)
            random_search.fit(X_train, y_train)
            best_pipeline = random_search.best_estimator_
        elif param_grid:
            grid_search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0)
            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
        else:
            best_pipeline = pipeline
            best_pipeline.fit(X_train, y_train)

        # Make predictions
        preds = best_pipeline.predict(X_test)
        probs = best_pipeline.predict_proba(X_test)[:, 1] if hasattr(best_pipeline, "predict_proba") else None

        # Calculate metrics
        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        roc_auc = roc_auc_score(y_test, probs) if probs is not None else None

        # Log parameters and metrics to MLflow
        mlflow.log_param("model_name", name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.log_metric("recall", rec)
        mlflow.log_metric("f1_score", f1)
        if roc_auc is not None:
            mlflow.log_metric("roc_auc", roc_auc)

        # Log the model
        mlflow.sklearn.log_model(best_pipeline, artifact_path="model")
        model_uri = f"runs:/{run.info.run_id}/model"
        mlflow.register_model(model_uri=model_uri, name=name)

        # Save the model locally
        joblib.dump(best_pipeline, f"models/{name}_model.pkl")

        # Store the scores
        model_scores.append({'Model': name, 'Accuracy': acc})

        # Print performance metrics in a format similar to the first code
        print(f"Model: {name}")
        print(f"Test Accuracy: {acc*100:.3f}%")
        print()

        # Check if the current model has the best accuracy
        if acc > best_accuracy:
            best_accuracy = acc
            best_model = best_pipeline
            best_model_name = name

# Train and log all models
for name, model, param_grid in models:
    train_and_log_model(name, model, param_grid, X_train, X_test, y_train, y_test)

# Display the best model 
print("Best Model:")
print(f"Model Name: {best_model_name}")
print(f"Test Accuracy: {best_accuracy*100:.3f}%")
print(f"Model Pipeline: {best_model}")
print(f"Accuracy: {best_accuracy:.2%}")


Registered model 'RandomForest' already exists. Creating a new version of this model...
2025/05/09 12:28:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 4
Created version '4' of model 'RandomForest'.


Model: RandomForest
Test Accuracy: 95.458%

🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/2851142957f74a8cab3de24cff396fae
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'GradientBoosting' already exists. Creating a new version of this model...
2025/05/09 12:29:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoosting, version 4
Created version '4' of model 'GradientBoosting'.


Model: GradientBoosting
Test Accuracy: 95.458%

🏃 View run GradientBoosting at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/1fd89f95247c47b5b2309c0965636b97
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'SVM' already exists. Creating a new version of this model...
2025/05/09 12:29:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVM, version 4
Created version '4' of model 'SVM'.


Model: SVM
Test Accuracy: 93.573%

🏃 View run SVM at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/abd48bbd7fe3407487a5eaf166c8117d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2025/05/09 12:29:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 4
Created version '4' of model 'LogisticRegression'.


Model: LogisticRegression
Test Accuracy: 91.774%

🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/3a44e5af488748c9a06b557c8681f8a2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'DecisionTree' already exists. Creating a new version of this model...
2025/05/09 12:30:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTree, version 4
Created version '4' of model 'DecisionTree'.


Model: DecisionTree
Test Accuracy: 93.059%

🏃 View run DecisionTree at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/e76a67a92e0a41208711d52e80cebd14
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'AdaBoost' already exists. Creating a new version of this model...
2025/05/09 12:30:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: AdaBoost, version 4
Created version '4' of model 'AdaBoost'.


Model: AdaBoost
Test Accuracy: 94.602%

🏃 View run AdaBoost at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/65064db4851f4ae988f7e701ba3a7d42
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'XGBoost' already exists. Creating a new version of this model...
2025/05/09 12:32:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost, version 7
Created version '7' of model 'XGBoost'.


Model: XGBoost
Test Accuracy: 95.887%

🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/d961dcc7992845a2b64371bb38dcae16
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243


Registered model 'NaiveBayes' already exists. Creating a new version of this model...
2025/05/09 12:32:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NaiveBayes, version 4
Created version '4' of model 'NaiveBayes'.


Model: NaiveBayes
Test Accuracy: 88.260%

🏃 View run NaiveBayes at: http://127.0.0.1:5000/#/experiments/206707251724781243/runs/d14ca02cd95c4220a64b781ff606169e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/206707251724781243
Best Model:
Model Name: XGBoost
Test Accuracy: 95.887%
Model Pipeline: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('model',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='logloss',
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None,
           

In [None]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

model_name = best_model_name 
latest_version = max([model.version for model in client.search_model_versions(f"name='{model_name}'")])

# "Production"
client.transition_model_version_stage(
    name=model_name,
    version=latest_version,
    stage="Production"
)

print(f"Model {model_name} version {latest_version} promoted to Production")

Model XGBoost version 7 promoted to Production


In [53]:
import mlflow
from mlflow.tracking import MlflowClient

# Get the best run from the experiment sorted by F1 score
client = MlflowClient()
experiment = mlflow.get_experiment_by_name("Churn Prediction")
runs = client.search_runs(experiment.experiment_id, order_by=["metrics.f1_score DESC"])

# Pick the best run
best_run = runs[0]
best_model_uri = f"runs:/{best_run.info.run_id}/model"
best_model_name = best_run.data.params["model_name"]

# Register the best model
result = mlflow.register_model(model_uri=best_model_uri, name=best_model_name)

# Transition the model version to 'Production'
client.transition_model_version_stage(
    name=best_model_name,
    version=result.version,
    stage="Production"
)

print(f"✅ Best model '{best_model_name}' registered as version {result.version} and moved to Production.")

Registered model 'XGBoost' already exists. Creating a new version of this model...
2025/05/09 12:35:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost, version 8


✅ Best model 'XGBoost' registered as version 8 and moved to Production.


Created version '8' of model 'XGBoost'.


In [None]:
import pandas as pd
import joblib
from mlflow.sklearn import load_model

# 1. Load new data
df_new = pd.read_csv("C:/Users/hp/projects/Churn_Prediction_Project/data/new_customers.csv")

# 2. Drop 'customerID' if it exists
if 'customerID' in df_new.columns:
    df_new = df_new.drop(columns=['customerID']) 

# 3. Clean 'TotalCharges'
if 'TotalCharges' in df_new.columns:
    df_new['TotalCharges'] = pd.to_numeric(df_new['TotalCharges'], errors='coerce')
df_new = df_new.dropna()

# 4. Load the preprocessor (used during training)
preprocessor = joblib.load("C:/Users/hp/projects/Churn_Prediction_Project/src/models/preprocessor.pkl")

# 5. Transform the new data using the same preprocessor
X_new = preprocessor.transform(df_new)

# 6. Load the registered model
model = load_model("models:/XGBoost/Production")

# 7. Predict
predictions = model.predict(X_new)
predictions = ['Yes' if p==1 else 'No' for p in predictions]
print("Predictions:", predictions)

df_new['Predicted_Churn'] = predictions

print(df_new[['gender','tenure','MonthlyCharges','TotalCharges','Predicted_Churn']])

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 12.00it/s] 


Predictions: ['Yes', 'No', 'Yes', 'No', 'No']
   gender  tenure  MonthlyCharges  TotalCharges Predicted_Churn
0  Female       1           29.85         29.85             Yes
1    Male      34           56.95       1889.50              No
2    Male       2           53.85        108.15             Yes
3  Female      45           80.00       3650.00              No
4    Male       5           25.70        128.35              No


## Monitoring and Alerts

In [59]:
# Import necessary libraries for monitoring and alerts
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import os
import pandas as pd
import joblib
import mlflow

# Email settings for alerts
SMTP_SERVER = "smtp.gmail.com"  # SMTP server for Gmail
SMTP_PORT = 587  # Port for TLS
EMAIL_ADDRESS = "your_email@gmail.com"  # Replace with your email
EMAIL_PASSWORD = "your_app_password"  # Replace with your App Password
ALERT_RECIPIENT = "recipient_email@example.com"  # Replace with recipient email

# Function to send email alerts
def send_alert(subject, body):
    # Create a multipart email message
    msg = MIMEMultipart()
    msg['From'] = EMAIL_ADDRESS
    msg['To'] = ALERT_RECIPIENT
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    # Send the email using SMTP
    with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as server:
        server.starttls()
        server.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
        server.sendmail(EMAIL_ADDRESS, ALERT_RECIPIENT, msg.as_string())
        print(f"Alert sent to {ALERT_RECIPIENT}")

# Load reference and current data for monitoring
def load_data():
    # Load reference data (training dataset)
    reference_data = pd.read_csv('C:/Users/hp/projects/Churn_Prediction_Project/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    reference_data['TotalCharges'] = pd.to_numeric(reference_data['TotalCharges'], errors='coerce').fillna(0)
    reference_data = reference_data.drop(columns=['customerID'])
    reference_data['Churn'] = reference_data['Churn'].map({'Yes': 1, 'No': 0})

    # Load current data (new data for prediction)
    current_data = pd.read_csv('C:/Users/hp/projects/Churn_Prediction_Project/data/new_customers.csv')
    if 'customerID' in current_data.columns:
        current_data = current_data.drop(columns=['customerID'])
    if 'TotalCharges' in current_data.columns:
        current_data['TotalCharges'] = pd.to_numeric(current_data['TotalCharges'], errors='coerce')
    current_data = current_data.dropna()

    return reference_data, current_data

# Load model and preprocessor
def load_model_and_preprocessor(model_name="XGBoost"):
    # Load the preprocessor
    preprocessor = joblib.load('C:/Users/hp/projects/Churn_Prediction_Project/src/models/preprocessor.pkl')
    # Load the model based on the model name
    model = joblib.load(f'C:/Users/hp/projects/Churn_Prediction_Project/src/models/{model_name}_model.pkl')
    return model, preprocessor

# Monitor model performance and detect drift
def monitor_model_performance(model_name="XGBoost"):
    # Load data for monitoring
    reference_data, current_data = load_data()

    # Separate features and target
    X_ref = reference_data.drop(columns=['Churn'])
    y_ref = reference_data['Churn']
    X_cur = current_data

    # Load model and preprocessor
    model, preprocessor = load_model_and_preprocessor(model_name)

    # Transform data using the preprocessor
    X_ref_transformed = preprocessor.transform(X_ref)
    X_cur_transformed = preprocessor.transform(X_cur)

    # Make predictions on reference data
    y_ref_pred = model.predict(X_ref_transformed)
    y_ref_proba = model.predict_proba(X_ref_transformed)[:, 1]

    # Add predictions to datasets
    reference_data['prediction'] = y_ref_pred
    current_data['prediction'] = model.predict(X_cur_transformed)

    # Set up column mapping for Evidently
    column_mapping = ColumnMapping()
    column_mapping.target = 'Churn'
    column_mapping.prediction = 'prediction'
    column_mapping.numerical_features = X_ref.select_dtypes(include=['int64', 'float64']).columns.tolist()
    column_mapping.categorical_features = X_ref.select_dtypes(include=['object']).columns.tolist()

    # Generate drift report using Evidently
    drift_report = Report(metrics=[
        DataDriftPreset(),
        TargetDriftPreset()
    ])
    drift_report.run(reference_data=reference_data, current_data=current_data, column_mapping=column_mapping)

    # Save the drift report as HTML
    os.makedirs('reports', exist_ok=True)
    drift_report.save_html('reports/drift_report.html')

    # Extract drift detection results
    drift_detected = drift_report.as_dict()['metrics'][0]['result']['dataset_drift']  # Data Drift
    target_drift_detected = drift_report.as_dict()['metrics'][1]['result']['drift_detected']  # Target Drift

    # Log metrics to MLflow
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("Churn Prediction Monitoring")
    with mlflow.start_run(run_name=f"Monitoring_{model_name}"):
        mlflow.log_artifact('reports/drift_report.html')

    # Send alert if drift is detected
    if drift_detected or target_drift_detected:
        subject = "Warning: Model Drift Detected"
        body = f"""
        Drift detected in model {model_name} performance.
        - Data Drift: {drift_detected}
        - Target Drift: {target_drift_detected}
        Check the drift report at reports/drift_report.html
        """
        send_alert(subject, body)

    print(f"Drift report generated at reports/drift_report.html")
    print(f"Data Drift Detected: {drift_detected}")
    print(f"Target Drift Detected: {target_drift_detected}")

if __name__ == "__main__":
    # Run the monitoring for the best model
    monitor_model_performance(model_name="XGBoost")

ModuleNotFoundError: No module named 'evidently'