In [56]:
!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost psycopg2-binary scipy sqlalchemy "evidently<0.4.0" mlflow joblib



In [57]:
pip install shap

Note: you may need to restart the kernel to use updated packages.


In [58]:
# ==============================
# 📦 Data Handling
# ==============================
import os
import json
import time
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.stats import zscore

# ==============================
# 🗄️ Database Connection
# ==============================
from sqlalchemy import create_engine
import psycopg2

# ==============================
# 🔧 Preprocessing & Pipeline
# ==============================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer

# ==============================
# 🤖 ML Models
# ==============================
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ==============================
# 📊 Metrics
# ==============================
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# ==============================
# 📈 Model Selection
# ==============================
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)

# ==============================
# 🎨 Visualization
# ==============================
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 🔍 Explainability
# ==============================
import shap

# ==============================
# 📈 Monitoring with Evidently
# ==============================
from evidently.report import Report
from evidently.metric_preset import (
    DataDriftPreset,
    TargetDriftPreset,
    DataQualityPreset
)

# ==============================
# 🚀 MLflow Tracking
# ==============================
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# ==============================
# 💾 Model Persistence
# ==============================
import joblib

# ==============================
# 🌐 API Deployment
# ==============================
from flask import Flask, request, jsonify

# ==============================
# (Optional) AWS SageMaker SDK
# ==============================
import sagemaker
import boto3
import time

In [59]:

# --- Configuration ---
region = 'ap-south-1'  # change to your region
workgroup_name = 'redshift'
database_name = 'dev'
secret_arn = 'arn:aws:secretsmanager:ap-south-1:888517279277:secret:Redshiftsecret-oLjktj'
sql = 'SELECT * FROM leads LIMIT 100'
 
# --- Create boto3 Redshift Data API client ---
client = boto3.client('redshift-data', region_name=region)
 
# --- Execute query ---
response = client.execute_statement(
    WorkgroupName='redshift',
    Database='dev',
    SecretArn='arn:aws:secretsmanager:ap-south-1:888517279277:secret:Redshiftsecret-oLjktj',
    Sql=sql
)
 
statement_id = response['Id']
 
# --- Wait for completion ---
desc = client.describe_statement(Id=statement_id)
while desc['Status'] not in ['FINISHED', 'FAILED', 'ABORTED']:
    time.sleep(1)
    desc = client.describe_statement(Id=statement_id)
 
if desc['Status'] != 'FINISHED':
    raise Exception(f"Query failed: {desc}")
 
# --- Retrieve results ---
result = client.get_statement_result(Id=statement_id)
 
# --- Convert to pandas DataFrame ---
columns = [col['name'] for col in result['ColumnMetadata']]
rows = result['Records']
 
data = []
for row in rows:
    data.append([list(col.values())[0] if col else None for col in row])
 
df = pd.DataFrame(data, columns=columns)
print(df.head())

     city  converted do_not_call  total_time_spent_on_website magazine  \
0  Select          0          No                            0       No   
1  Select          0          No                          674       No   
2  Mumbai          1          No                         1532       No   
3  Mumbai          0          No                          305       No   
4  Mumbai          1          No                         1428       No   

  x_education_forums i_agree_to_pay_the_amount_through_cheque  \
0                 No                                       No   
1                 No                                       No   
2                 No                                       No   
3                 No                                       No   
4                 No                                       No   

  through_recommendations what_matters_most_to_you_in_choosing_a_course  \
0                      No                       Better Career Prospects   
1             

In [60]:
df.head()

Unnamed: 0,city,converted,do_not_call,total_time_spent_on_website,magazine,x_education_forums,i_agree_to_pay_the_amount_through_cheque,through_recommendations,what_matters_most_to_you_in_choosing_a_course,a_free_copy_of_mastering_the_interview,...,lead_number,asymmetrique_profile_index,lead_profile,newspaper_article,lead_origin,last_notable_activity,page_views_per_visit,newspaper,asymmetrique_activity_index,specialization
0,Select,0,No,0,No,No,No,No,Better Career Prospects,No,...,660737,02.Medium,Select,No,API,Modified,0.0,No,02.Medium,Select
1,Select,0,No,674,No,No,No,No,Better Career Prospects,No,...,660728,02.Medium,Select,No,API,Email Opened,2.5,No,02.Medium,Select
2,Mumbai,1,No,1532,No,No,No,No,Better Career Prospects,Yes,...,660727,01.High,Potential Lead,No,Landing Page Submission,Email Opened,2.0,No,02.Medium,Business Administration
3,Mumbai,0,No,305,No,No,No,No,Better Career Prospects,No,...,660719,01.High,Select,No,Landing Page Submission,Modified,1.0,No,02.Medium,Media and Advertising
4,Mumbai,1,No,1428,No,No,No,No,Better Career Prospects,No,...,660681,01.High,Select,No,Landing Page Submission,Modified,1.0,No,02.Medium,Select


In [61]:
df.describe()

Unnamed: 0,converted,total_time_spent_on_website,asymmetrique_activity_score,asymmetrique_profile_score,lead_number
count,100.0,100.0,100.0,100.0,100.0
mean,0.38,535.44,14.07,16.27,660109.91
std,0.487832,526.985621,1.622163,1.911449,371.216757
min,0.0,0.0,9.0,13.0,659507.0
25%,0.0,94.0,13.0,15.0,659781.25
50%,0.0,322.5,14.0,16.0,660107.5
75%,1.0,1008.25,15.0,18.0,660449.75
max,1.0,1710.0,17.0,20.0,660737.0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   city                                           100 non-null    object 
 1   converted                                      100 non-null    int64  
 2   do_not_call                                    100 non-null    object 
 3   total_time_spent_on_website                    100 non-null    int64  
 4   magazine                                       100 non-null    object 
 5   x_education_forums                             100 non-null    object 
 6   i_agree_to_pay_the_amount_through_cheque       100 non-null    object 
 7   through_recommendations                        100 non-null    object 
 8   what_matters_most_to_you_in_choosing_a_course  100 non-null    object 
 9   a_free_copy_of_mastering_the_interview         100 non-

In [63]:
print(df.columns.tolist())

['city', 'converted', 'do_not_call', 'total_time_spent_on_website', 'magazine', 'x_education_forums', 'i_agree_to_pay_the_amount_through_cheque', 'through_recommendations', 'what_matters_most_to_you_in_choosing_a_course', 'a_free_copy_of_mastering_the_interview ', 'search', 'asymmetrique_activity_score', 'receive_more_updates_about_our_courses', 'lead_source', 'how_did_you_hear_about_x_education', 'prospect_id', 'get_updates_on_dm_content', 'update_me_on_supply_chain_content', 'tags', 'totalvisits', 'do_not_email', 'digital_advertisement', 'lead_quality', 'country', 'last_activity', 'what_is_your_current_occupation', 'asymmetrique_profile_score', 'lead_number', 'asymmetrique_profile_index', 'lead_profile', 'newspaper_article', 'lead_origin', 'last_notable_activity', 'page_views_per_visit', 'newspaper', 'asymmetrique_activity_index', 'specialization']


In [64]:
#Dropping Unwanted Columns
columns_to_drop = [
    'prospect_id',
    'lead_number',
    'get_updates_on_dm_content',
    'receive_more_updates_about_our_courses',
    'i_agree_to_pay_the_amount_through_cheque',
    'magazine',
    'update_me_on_supply_chain_content'
]

df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print(df.head())
print(df.shape)


     city  converted do_not_call  total_time_spent_on_website  \
0  Select          0          No                            0   
1  Select          0          No                          674   
2  Mumbai          1          No                         1532   
3  Mumbai          0          No                          305   
4  Mumbai          1          No                         1428   

  x_education_forums through_recommendations  \
0                 No                      No   
1                 No                      No   
2                 No                      No   
3                 No                      No   
4                 No                      No   

  what_matters_most_to_you_in_choosing_a_course  \
0                       Better Career Prospects   
1                       Better Career Prospects   
2                       Better Career Prospects   
3                       Better Career Prospects   
4                       Better Career Prospects   

  a_free_copy

In [65]:
def map_categorical_columns(df):
    df = df.copy()
    city_map = {
        'Mumbai': 'Metro India', 'Thane & Outskirts': 'Metro India', 'Other Metro Cities': 'Metro India',
        'Other Cities of Maharashtra': 'Tier II India', 'Tier II Cities': 'Tier II India', 'Other Cities': 'Other India',
        'Select': 'Unknown', 'nan': 'Unknown', pd.NA: 'Unknown', None: 'Unknown'
    }
    country_map = {
        'India': 'India', 'United States': 'North America', 'Canada': 'North America',
        'United Arab Emirates': 'Middle East', 'Saudi Arabia': 'Middle East', 'Qatar': 'Middle East', 'Kuwait': 'Middle East',
        'Oman': 'Middle East', 'Bahrain': 'Middle East',
        'Germany': 'Europe', 'France': 'Europe', 'United Kingdom': 'Europe', 'Sweden': 'Europe',
        'Belgium': 'Europe', 'Netherlands': 'Europe', 'Switzerland': 'Europe',
        'China': 'Asia', 'Singapore': 'Asia', 'Hong Kong': 'Asia', 'Philippines': 'Asia', 'Vietnam': 'Asia',
        'South Africa': 'Africa', 'Nigeria': 'Africa',
        'nan': 'Unknown', 'unknown': 'Other'
    }
    specialization_map = {
        'Marketing Management': 'Marketing', 'Operations Management': 'Operations', 'Finance Management': 'Finance',
        'Human Resource Management': 'HR', 'International Business': 'Business', 'Business Administration': 'Business', 'MBA': 'Business',
        'IT Projects Management': 'IT', 'E-Business': 'E-Commerce', 'E-Commerce': 'E-Commerce', 'E Commerce': 'E-Commerce',
        'Supply Chain Management': 'Operations', 'Retail Management': 'Operations', 'Banking, Investment And Insurance': 'Finance',
        'Healthcare Management': 'Healthcare', 'Hospitality Management': 'Healthcare',
        'Rural and Agribusiness': 'Other', 'Travel and Tourism': 'Other', 'Media and Advertising': 'Other', 'Services Excellence': 'Other',
        'Not Specified': 'Unknown', 'Select': 'Unknown', 'nan': 'Unknown', None: 'Unknown'
    }
    education_source_map = {
        'Online Search': 'Digital', 'Advertisements': 'Digital', 'Email': 'Digital', 'SMS': 'Digital', 'Social Media': 'Digital',
        'Word Of Mouth': 'Referral', 'Student of SomeSchool': 'Referral',
        'Multiple Sources': 'Multi-Channel', 'Other': 'Other', 'Select': 'Unknown', 'nan': 'Unknown', None: 'Unknown'
    }
    occupation_map = {
        'Student': 'Student', 'Working Professional': 'Working', 'Businessman': 'Working',
        'Housewife': 'Non-Working', 'Unemployed': 'Non-Working', 'Other': 'Unknown', 'Select': 'Unknown', 'nan': 'Unknown', None: 'Unknown'
    }
    tags_map = {
        'Will revert after reading the email': 'Pending Response', 'Still Thinking': 'Pending Response', 'Interested in full time MBA': 'Pending Response',
        'Ringing': 'Trying to Contact', 'Busy': 'Trying to Contact', 'switched off': 'Trying to Contact', 'opp hangup': 'Trying to Contact',
        'Interested in other courses': 'Not Interested', 'Already a student': 'Not Interested', 'Lost to EINS': 'Not Interested', 'Lost to Others': 'Not Interested', 'Not doing further education': 'Not Interested',
        'invalid number': 'Invalid Contact', 'wrong number given': 'Invalid Contact', 'number not provided': 'Invalid Contact',
        'Diploma holder (Not Eligible)': 'Not Eligible', 'Graduation in progress': 'Not Eligible',
        'Closed by Horizzon': 'Converted', 'Want to take admission but has financial problems': 'Financial Issue', 'in touch with EINS': 'Transferred'
    }
    lead_quality_map = {
        'High in Relevance': 'High', 'Might be': 'Medium', 'Not Sure': 'Medium', 'Low in Relevance': 'Low', 'Worst': 'Low', 'nan': 'Unknown', None: 'Unknown'
    }
    lead_profile_map = {
        'Potential Lead': 'Prospective', 'Other Leads': 'Prospective', 'Student of SomeSchool': 'Converted',
        'Lateral Student': 'Converted', 'Dual Specialization Student': 'Converted', 'Select': 'Unknown', 'nan': 'Unknown', None: 'Unknown'
    }

    mappings = {
        'city': city_map,
        'country': country_map,
        'specialization': specialization_map,
        'how_did_you_hear_about_x_education': education_source_map,
        'what_is_your_current_occupation': occupation_map,
        'tags': tags_map,
        'lead_quality': lead_quality_map,
        'lead_profile': lead_profile_map
    }

    for col, mapping in mappings.items():
        if col in df.columns:
            df[col] = df[col].apply(lambda x: mapping.get(x, 'Unknown') if pd.notna(x) else 'Unknown')
            df[col] = df[col].fillna('Unknown')
    return df

mapping_transformer = FunctionTransformer(map_categorical_columns, validate=False)


In [None]:
def replace_unknowns_with_nan(df):
    df = df.copy()
    for col in df.select_dtypes(include=['object', 'category']).columns:
        df[col] = df[col].replace(r'(?i)unknown', np.nan, regex=True)
    return df

def handle_skewness_yeojohnson(df, threshold=0.5):
    df = df.copy()
    num_cols = df.select_dtypes(include='number').columns
    # Initialize transformer for Yeo-Johnson
    pt = PowerTransformer(method='yeo-johnson')
    for col in num_cols:
        if abs(df[col].skew()) > threshold:
            # Reshape and transform the column
            reshaped = df[col].values.reshape(-1, 1)
            try:
                df[col] = pt.fit_transform(reshaped)
            except Exception as e:
                print(f"Skipping column {col}: {e}")
    return df

replace_unknowns_transformer = FunctionTransformer(replace_unknowns_with_nan, validate=False)
skewness_transformer = FunctionTransformer(handle_skewness_yeojohnson, validate=False)


In [67]:
def build_full_pipeline(df):
    target_col = "converted"

    label_cols = ["lead_quality","asymmetrique_activity_index","asymmetrique_profile_index"]

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if target_col in numeric_cols:
        numeric_cols.remove(target_col)
    if target_col in categorical_cols:
        categorical_cols.remove(target_col)
    if target_col in label_cols:
        label_cols.remove(target_col)

    cat_ohe_cols = [col for col in categorical_cols if col not in label_cols]

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    cat_ohe_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
    ])

    cat_label_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_pipeline, numeric_cols),
        ('cat_ohe', cat_ohe_pipeline, cat_ohe_cols),
        ('cat_label', cat_label_pipeline, label_cols)
    ], remainder='passthrough')

    full_pipeline = Pipeline([
        ('mapping', mapping_transformer),
        ('replace_unknowns', replace_unknowns_transformer),
        ('skewness', skewness_transformer),
        ('preprocessor', preprocessor)
    ])

    full_pipeline.set_output(transform='default')
    return full_pipeline


In [68]:
def evaluate_classification_metrics(y_true, y_pred, y_proba=None, average_type='binary'):
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average=average_type, zero_division=0),
        "recall": recall_score(y_true, y_pred, average=average_type, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, average=average_type, zero_division=0)
    }
    if y_proba is not None:
        try:
            metrics["roc_auc"] = roc_auc_score(y_true, y_proba)
        except ValueError:
            metrics["roc_auc"] = None
    else:
        metrics["roc_auc"] = None
    return metrics

def print_classification_report(y_true, y_pred):
    print("\n✅ Classification Report:")
    print(classification_report(y_true, y_pred))
    print("\n✅ Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


In [69]:
def train_log_and_shap_classification(
    X_train, y_train, X_val, y_val, preprocessor,
    save_dir="saved_models", shap_dir="shap_outputs"
):
    models = {
        'LogisticRegression': {
            'model': LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42),
            'params': {'C': [0.1, 1.0, 10.0]}
        },
        'DecisionTree': {
            'model': DecisionTreeClassifier(class_weight='balanced', random_state=42),
            'params': {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
        },
        'RandomForest': {
            'model': RandomForestClassifier(class_weight='balanced', random_state=42),
            'params': {'n_estimators': [100, 200], 'max_depth': [None, 10]}
        },
        'XGBoost': {
            'model': XGBClassifier(scale_pos_weight=1, use_label_encoder=False, eval_metric='logloss', random_state=42),
            'params': {'n_estimators': [100, 200], 'max_depth': [3, 6]}
        },
        'LightGBM': {
            'model': LGBMClassifier(class_weight='balanced', random_state=42, verbose=-1),
            'params': {'n_estimators': [100, 200], 'max_depth': [3, 6]}
        }
    }

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(shap_dir, exist_ok=True)

    mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:888517279277:mlflow-tracking-server/capstone")
    mlflow.set_experiment("LeadScoring_Simplified")

    results = []
    best_models = {}

    for name, model_info in models.items():
        print(f"\n🔧 Training: {name}")

        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('model', model_info['model'])
        ])

        param_grid = {f"model__{k}": v for k, v in model_info['params'].items()}
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_train, y_train)

        y_val_pred = search.predict(X_val)
        y_val_proba = search.predict_proba(X_val)[:, 1] if hasattr(search.best_estimator_.named_steps['model'], "predict_proba") else None

        metrics = evaluate_classification_metrics(y_val, y_val_pred, y_val_proba)
        results.append({"model": name, "best_params": search.best_params_, **metrics})
        best_models[name] = search.best_estimator_

        model_path = os.path.join(save_dir, f"{name}_best_model.pkl")
        joblib.dump(search.best_estimator_, model_path)

        with mlflow.start_run(run_name=name):
            mlflow.log_params(search.best_params_)
            mlflow.log_metrics(metrics)
            mlflow.sklearn.log_model(search.best_estimator_, "model")

            try:
                print(f"🔎 Generating SHAP values for {name}...")
                fitted_preprocessor = search.best_estimator_.named_steps['preprocess']
                X_val_proc = fitted_preprocessor.transform(X_val)
                shap_matrix = X_val_proc.toarray() if hasattr(X_val_proc, "toarray") else X_val_proc
                model_only = search.best_estimator_.named_steps['model']
                if name in ("RandomForest", "XGBoost", "LightGBM", "DecisionTree"):
                    explainer = shap.TreeExplainer(model_only)
                else:
                    explainer = shap.Explainer(model_only, shap_matrix)
                shap_values = explainer(shap_matrix)
                shap_path = os.path.join(shap_dir, f"{name}_shap_summary.png")
                plt.figure()
                shap.summary_plot(shap_values, shap_matrix, show=False)
                plt.savefig(shap_path, bbox_inches='tight')
                plt.close()
                mlflow.log_artifact(shap_path, artifact_path="shap_plots")
                print(f"✅ SHAP saved & logged: {shap_path}")
            except Exception as e:
                print(f"⚠️ SHAP failed for {name}: {e}")

    results_df = pd.DataFrame(results)
    print("\n📊 All Model Validation Metrics:")
    print(results_df[["model", "accuracy", "precision", "recall", "f1_score", "roc_auc"]].to_string(index=False))

    return results_df, best_models


In [70]:
def save_and_register_best_model_pipeline(
    results_df, best_models, X_train_val, y_train_val, preprocessor,
    save_dir="saved_models", experiment_name="LeadScoring_Simplified"
):
    os.makedirs(save_dir, exist_ok=True)

    # 1. Select best model
    best_row = results_df.sort_values(by="f1_score", ascending=False).iloc[0]
    best_model_name = best_row["model"]
    best_model = best_models[best_model_name]
    print(f"\n🏆 Best model selected: {best_model_name} (F1 = {best_row['f1_score']:.4f})")

    # 2. Build final pipeline
    final_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", best_model.named_steps['model'] if hasattr(best_model, 'named_steps') else best_model)
    ])
    final_pipeline.fit(X_train_val, y_train_val)

    # 3. Save final model pipeline locally
    model_path = os.path.join(save_dir, f"final_{best_model_name}_pipeline.pkl")
    joblib.dump(final_pipeline, model_path)
    print(f"✅ Final pipeline saved at: {model_path}")

    # 4. Save preprocessor pipeline locally
    preprocessor_path = os.path.join(save_dir, "final_preprocessor.pkl")
    joblib.dump(preprocessor, preprocessor_path)
    print(f"✅ Preprocessing pipeline saved at: {preprocessor_path}")

    # 5. Register to MLflow
    mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:888517279277:mlflow-tracking-server/capstone")
    mlflow.set_experiment(experiment_name)
    client = MlflowClient()

    with mlflow.start_run(run_name=f"Final_{best_model_name}") as run:
        run_id = run.info.run_id

        # ✅ Log the pipeline to MLflow
        mlflow.sklearn.log_model(final_pipeline, artifact_path="model")
        print(f"🔁 Registering model to MLflow Model Registry: {best_model_name}")

        # Register the model
        model_uri = f"runs:/{run_id}/model"
        registered_model = mlflow.register_model(model_uri=model_uri, name=best_model_name)

        # Wait a bit for the registration to complete
        time.sleep(10)

        # Transition model version to Production
        client.transition_model_version_stage(
            name=best_model_name,
            version=registered_model.version,
            stage="Production",
            archive_existing_versions=True
        )
        print(f"✅ Model '{best_model_name}' version {registered_model.version} moved to 'Production'.")

        # Optionally set alias
        try:
            client.set_model_version_alias(
                name=best_model_name,
                version=registered_model.version,
                alias="champion"
            )
            print(f"🏷️ Alias 'champion' assigned to version {registered_model.version}.")
        except Exception as e:
            print(f"⚠️ Unable to set alias 'champion': {e}")

        # Print MLflow run link
        print(f"🏃 View run: http://localhost:5000/#/experiments/{run.info.experiment_id}/runs/{run_id}")

    # ✅ Return final pipeline, best model name, and path
    return final_pipeline, best_model_name, model_path


In [71]:
def generate_and_log_drift_reports(
    X_train, X_val, X_test,
    feature_names=None,
    output_dir="drift_reports",
    mlflow_uri="arn:aws:sagemaker:ap-south-1:888517279277:mlflow-tracking-server/capstone",
    experiment_name="Drift"
):
    """
    Generates Evidently Data Drift reports comparing train/val/test,
    saves them as HTML, and logs both artifacts and metrics into MLflow.

    Returns
    -------
    dict
        A dictionary with drift metrics for each comparison.
    """

    # Helper to ensure DataFrame
    def ensure_df(data, feature_names):
        if isinstance(data, pd.DataFrame):
            return data
        cols = feature_names if feature_names is not None else [f"feature_{i}" for i in range(data.shape[1])]
        return pd.DataFrame(data, columns=cols)

    X_train = ensure_df(X_train, feature_names)
    X_val = ensure_df(X_val, feature_names)
    X_test = ensure_df(X_test, feature_names)

    os.makedirs(output_dir, exist_ok=True)

    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)

    comparisons = [
        ("train_vs_val", X_train, X_val),
        ("train_vs_test", X_train, X_test),
        ("val_vs_test", X_val, X_test)
    ]

    results_summary = {}

    with mlflow.start_run(run_name="multi_split_drift") as run:
        for name, ref, curr in comparisons:
            print(f"🚀 Running drift check: {name}")
            report = Report(metrics=[DataDriftPreset()])
            report.run(reference_data=ref, current_data=curr)

            # Save HTML artifact
            html_path = os.path.join(output_dir, f"{name}.html")
            report.save_html(html_path)
            mlflow.log_artifact(html_path, artifact_path="evidently_html_reports")

            json_dict = report.as_dict()
            drift_result = next((m["result"] for m in json_dict["metrics"] if m.get("metric") == "DataDriftTable"), None)

            if drift_result:
                drift_ratio = drift_result.get("share_of_drifted_columns", 0)
                mlflow.log_metric(f"{name}_drift_ratio", round(drift_ratio, 4))

                column_metrics = {}
                for feature, vals in drift_result.get("drift_by_columns", {}).items():
                    score = vals.get("drift_score")
                    if score is not None:
                        clean_name = feature.replace(" ", "_").replace("(", "").replace(")", "")
                        mlflow.log_metric(f"{name}_{clean_name}", round(score, 4))
                        column_metrics[feature] = round(score, 4)

                results_summary[name] = {
                    "drift_ratio": round(drift_ratio, 4),
                    "column_scores": column_metrics
                }

            print(f"✅ Drift metrics for {name} logged to MLflow.\n")

        print(f"🎯 Drift reports & metrics logged under run ID: {run.info.run_id}")

    return results_summary


In [72]:
def get_latest_production_model_name(stage="Production", alias=None):
    """
    Finds the latest-registered model name in a given MLflow stage or alias.
    Args:
        stage (str): MLflow stage ("Production", "Staging", etc).
        alias (str): MLflow alias (e.g. "champion", optional).
    Returns:
        str: model_name
    """
    client = MlflowClient()
    registered = client.search_registered_models()
    if not registered:
        raise RuntimeError("No models registered in MLflow!")

    # Candidates = list of (model_name, version, timestamp)
    candidates = []
    for m in registered:
        for lv in m.latest_versions:
            # Choose model by alias (if provided and MLflow>=2.3) or by stage
            if alias:
                aliases = getattr(lv, 'aliases', [])
                if alias in aliases:
                    candidates.append((m.name, lv.version, lv.creation_timestamp))
            else:
                if lv.current_stage == stage:
                    candidates.append((m.name, lv.version, lv.creation_timestamp))

    if not candidates:
        raise ValueError(f"No model found in MLflow registry for stage='{stage}' alias='{alias}'")

    # Sort by creation time descending (latest first)
    candidates.sort(key=lambda t: t[2], reverse=True)
    chosen_model = candidates[0][0]
    print(f"✅ Will load {chosen_model} version {candidates[0][1]} (stage/alias: '{alias or stage}')")
    return chosen_model


def load_and_predict_from_registry_auto(X_test, stage="Production", alias=None):
    """
    Loads the latest pipeline from MLflow given a stage/alias, predicts on X_test.
    Args:
        X_test : raw test DataFrame
        stage  : MLflow stage (default "Production"), ignored if alias given
        alias  : MLflow alias (e.g. "champion") if using version aliasing
    Returns:
        np.array: Model predictions
    """
    model_name = get_latest_production_model_name(stage=stage, alias=alias)
    model_uri = f"models:/{model_name}/{alias or stage}"
    print(f"📦 Loading from {model_uri}")
    loaded_pipeline = mlflow.sklearn.load_model(model_uri)
    predictions = loaded_pipeline.predict(X_test)
    print(f"✅ Predictions complete. Example: {predictions[:5]}")
    return predictions


# 👉 Example usage
# If your best model is in Staging instead of Production:

# Or, if you have set an alias:
# y_pred = load_and_predict_from_registry_auto(X_test_raw, alias="champion")


In [73]:
def run_lead_prediction_pipeline(
    df,
    experiment_name="LeadScoring_Simplified",
    save_dir="saved_models",
    shap_dir="shap_outputs",
    drift_dir="drift_reports"
):
    # ✅ Normalize column names
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # ✅ Drop unwanted columns
    drop_cols = [
        'prospect_id',
        'lead_number',
        'get_updates_on_dm_content',
        'receive_more_updates_about_our_courses',
        'i_agree_to_pay_the_amount_through_cheque',
        'magazine',
        'update_me_on_supply_chain_content'
    ]
    df = df.drop(columns=drop_cols, errors='ignore')

    # ✅ Build preprocessor
    preprocessor = build_full_pipeline(df)

    # ✅ Split
    from sklearn.model_selection import train_test_split
    target_col = "converted"  # already lowercase
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
    )

    # ✅ Data drift
    feature_names = X_train.columns
    generate_and_log_drift_reports(
        X_train, X_val, X_test,
        feature_names=feature_names,
        output_dir=drift_dir,
        mlflow_uri="arn:aws:sagemaker:ap-south-1:888517279277:mlflow-tracking-server/capstone",
        experiment_name="Drift"
    )

    # ✅ Train + SHAP
    results_df, best_models = train_log_and_shap_classification(
        X_train, y_train, X_val, y_val, preprocessor,
        save_dir=save_dir, shap_dir=shap_dir
    )

    # ✅ Save & register
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])
    save_and_register_best_model_pipeline(
        results_df, best_models, X_train_val, y_train_val, preprocessor,
        save_dir=save_dir, experiment_name=experiment_name
    )
    y_pred = load_and_predict_from_registry_auto(X_test, stage="Production")
    print(y_pred)
run_lead_prediction_pipeline(df)


🚀 Running drift check: train_vs_val
✅ Drift metrics for train_vs_val logged to MLflow.

🚀 Running drift check: train_vs_test
✅ Drift metrics for train_vs_test logged to MLflow.

🚀 Running drift check: val_vs_test
✅ Drift metrics for val_vs_test logged to MLflow.

🎯 Drift reports & metrics logged under run ID: 6f961759df85422e99f9e65944dfce43
🏃 View run multi_split_drift at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/6f961759df85422e99f9e65944dfce43
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1

🔧 Training: LogisticRegression
Fitting 5 folds for each of 3 candidates, totalling 15 fits




🔎 Generating SHAP values for LogisticRegression...
✅ SHAP saved & logged: shap_outputs/LogisticRegression_shap_summary.png
🏃 View run LogisticRegression at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/1a03a64197cd4f17a6d9d92d043c8bff
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2

🔧 Training: DecisionTree
Fitting 5 folds for each of 6 candidates, totalling 30 fits




🔎 Generating SHAP values for DecisionTree...
✅ SHAP saved & logged: shap_outputs/DecisionTree_shap_summary.png
🏃 View run DecisionTree at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/1f27da0572af4eb8b306ed72c248249e
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2

🔧 Training: RandomForest
Fitting 5 folds for each of 4 candidates, totalling 20 fits




🔎 Generating SHAP values for RandomForest...
✅ SHAP saved & logged: shap_outputs/RandomForest_shap_summary.png
🏃 View run RandomForest at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/2579aafb56744a36b352948c2c56507f
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2

🔧 Training: XGBoost
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

🔎 Generating SHAP values for XGBoost...
✅ SHAP saved & logged: shap_outputs/XGBoost_shap_summary.png
🏃 View run XGBoost at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/f57cddfc342d4cf9b254b23c0960ef16
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2

🔧 Training: LightGBM
Fitting 5 folds for each of 4 candidates, totalling 20 fits




🔎 Generating SHAP values for LightGBM...
✅ SHAP saved & logged: shap_outputs/LightGBM_shap_summary.png
🏃 View run LightGBM at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/6b61e77c73dc42a58812332866ebcbcf
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2

📊 All Model Validation Metrics:
             model  accuracy  precision   recall  f1_score  roc_auc
LogisticRegression      0.35   0.350000 1.000000  0.518519 1.000000
      DecisionTree      0.65   0.500000 0.285714  0.363636 0.510989
      RandomForest      0.90   1.000000 0.714286  0.833333 0.961538
           XGBoost      0.85   0.833333 0.714286  0.769231 0.923077
          LightGBM      0.80   0.714286 0.714286  0.714286 0.846154

🏆 Best model selected: RandomForest (F1 = 0.8333)
✅ Final pipeline saved at: saved_models/final_RandomForest_pipeline.pkl
✅ Preprocessing pipeline saved at: saved_models/final_preprocessor.pkl




🔁 Registering model to MLflow Model Registry: RandomForest


Registered model 'RandomForest' already exists. Creating a new version of this model...
2025/07/20 04:52:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 5
Created version '5' of model 'RandomForest'.


✅ Model 'RandomForest' version 5 moved to 'Production'.
⚠️ Unable to set alias 'champion': 'MlflowClient' object has no attribute 'set_model_version_alias'
🏃 View run: http://localhost:5000/#/experiments/2/runs/36f5741add7045519a726b571b10a43d
🏃 View run Final_RandomForest at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2/runs/36f5741add7045519a726b571b10a43d
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/2
✅ Will load RandomForest version 5 (stage/alias: 'Production')
📦 Loading from models:/RandomForest/Production


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Predictions complete. Example: [1 1 0 1 0]
[1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1]


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [39]:
# Session & Bucket
session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
prefix = "sagemaker/leadprediction"

In [47]:
model_path = session.upload_data(
    path="saved_models/final_RandomForest_pipeline.pkl",
    bucket=bucket,
    key_prefix=prefix
)
print("Pipeline uploaded to:", model_path)

Pipeline uploaded to: s3://sagemaker-ap-south-1-888517279277/sagemaker/leadprediction/final_RandomForest_pipeline.pkl


In [48]:
import tarfile

with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("saved_models/final_RandomForest_pipeline.pkl", arcname="final_RandomForest_pipeline.pkl")


In [49]:
model_archive_path = session.upload_data("model.tar.gz", bucket=bucket, key_prefix=prefix)
print("Model archive uploaded to:", model_archive_path)


Model archive uploaded to: s3://sagemaker-ap-south-1-888517279277/sagemaker/leadprediction/model.tar.gz


In [50]:
role = sagemaker.get_execution_role()

In [54]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=model_archive_path,  # S3 path to model.tar.gz
    role=role,                      # IAM role for SageMaker
    entry_point="inference.py",     # Your custom inference script
    framework_version="0.23-1",     # or update to latest if needed
    py_version="py3"
)


In [55]:
predictor = model.deploy(initial_instance_count=1, instance_type="ml.m5.large")

---------------------------------------------*

ERROR:sagemaker:Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-endpoint
