In [6]:
# ==============================
# 📦 Data Handling
# ==============================
import os
import json
import warnings
import pandas as pd
import numpy as np
from scipy.stats import zscore

# ==============================
# 🗄️ Database Connection
# ==============================
import psycopg2
from sqlalchemy import create_engine

# ==============================
# 🔧 Preprocessing & ML
# ==============================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ==============================
# 📊 Metrics
# ==============================
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
# NOTE: this repeats the sklearn.metrics import above and additionally brings in log_loss.
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    log_loss
)

# ==============================
# 🎨 Visualization
# ==============================
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 🔍 Explainability
# ==============================
import shap

# ==============================
# 📈 Monitoring with Evidently
# ==============================
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# ==============================
# 🚀 MLflow Tracking
# ==============================
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# ==============================
# 💾 Model Persistence
# ==============================
import joblib

# ==============================
# ⚠️ Suppress warnings
# ==============================
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module='_distutils_hack')


  from .autonotebook import tqdm as notebook_tqdm
  from google.protobuf import service as _service


In [None]:

def data_ingestion(csv_file_path, table_name='lead', db_url=None):
    """
    This function uploads data from a CSV file to a PostgreSQL table
    and retrieves the data back into a Pandas DataFrame.

    Parameters:
        csv_file_path (str): Full path to the input CSV file.
        table_name (str): Name of the target PostgreSQL table (default is 'lead').
                          An existing table with this name is dropped and recreated.
        db_url (str, optional): SQLAlchemy connection URL in the form
                          "postgresql+psycopg2://<username>:<password>@<host>:<port>/<database>".
                          When omitted, the URL is read from the LEAD_DB_URL
                          environment variable so credentials are never stored
                          in the notebook.

    Returns:
        pd.DataFrame: DataFrame containing data retrieved from the PostgreSQL table,
                      or None if an error occurs (including a missing connection URL).
    """
    # Resolve the connection URL without hard-coding any secret in the source.
    db_url = db_url or os.environ.get("LEAD_DB_URL")
    if not db_url:
        print("❌ Error during data ingestion or retrieval:")
        print("No database URL configured: pass db_url or set the LEAD_DB_URL environment variable.")
        return None

    engine = None
    try:
        # Step 1: Create SQLAlchemy engine to connect to PostgreSQL
        engine = create_engine(db_url)

        # Step 2: Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file_path)
        print(f"✅ Loaded data from CSV file: {csv_file_path}")

        # Step 3: Upload the DataFrame to PostgreSQL
        # - index=False: do not write the DataFrame index as a column
        # - if_exists='replace': drop existing table if it exists and recreate
        df.to_sql(table_name, con=engine, index=False, if_exists='replace')
        print(f"✅ Data uploaded to PostgreSQL table: '{table_name}'")

        # Step 4: Read the table back into a new DataFrame to verify upload.
        # read_sql_table takes the table name as an identifier, so no SQL text is
        # assembled from table_name and the name is quoted the same way to_sql wrote it.
        df_from_db = pd.read_sql_table(table_name, con=engine)
        print(f"✅ Retrieved data from table: '{table_name}'")
        print(df_from_db.head())  # Show preview of ingested data

        return df_from_db

    except Exception as e:
        # Best-effort contract: report the failure and return None instead of raising
        print("❌ Error during data ingestion or retrieval:")
        print(e)
        return None

    finally:
        # Release pooled connections even when a step above failed
        if engine is not None:
            engine.dispose()

# Usage Example
# Ensure PostgreSQL server is running and the credentials are correct.
# data_ingestion returns None when any step fails, so check `df` before
# relying on it in the cells below.
df = data_ingestion("data/Lead Scoring.csv")


✅ Data uploaded to PostgreSQL table 'lead' successfully.
✅ Data retrieved from PostgreSQL table 'lead':
                            Prospect ID  Lead Number              Lead Origin  \
0  7927b2df-8bba-4d29-b9a2-b6e0beafe620       660737                      API   
1  2a272436-5132-4136-86fa-dcc88c88f482       660728                      API   
2  8cc8c611-a219-4f35-ad23-fdfd2656bd8a       660727  Landing Page Submission   
3  0cc2df48-7cf4-4e39-9de9-19797f9b38cc       660719  Landing Page Submission   
4  3256f628-e534-4826-9d63-4a8b88782852       660681  Landing Page Submission   

      Lead Source Do Not Email Do Not Call  Converted  TotalVisits  \
0      Olark Chat           No          No          0          0.0   
1  Organic Search           No          No          0          5.0   
2  Direct Traffic           No          No          1          2.0   
3  Direct Traffic           No          No          0          1.0   
4          Google           No          No          1      

In [8]:
# Share of each class of the target column `Converted`, expressed as a percentage
df['Converted'].value_counts(normalize=True) * 100


Converted
0    61.461039
1    38.538961
Name: proportion, dtype: float64

In [34]:
df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [35]:
df.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [None]:
# Step: Drop Unwanted or Irrelevant Columns from the Dataset

# Define the list of columns that are not useful for analysis or model training
columns_to_drop = [
    'Prospect ID',                               # Unique identifier, not useful for prediction
    'Lead Number',                               # Same as above
    'Get updates on DM Content',                 # Very low variance or sparse
    'Receive More Updates About Our Courses',    # Redundant/low signal
    'I agree to pay the amount through cheque',  # Low variance or always same
    'Magazine',                                  # Irrelevant feature
    'Update me on Supply Chain Content'          # Sparse or not correlated
]

# Drop only the columns that are still present and rebind `df` to the result.
# An in-place drop raises KeyError when this cell is re-run after the columns
# are already gone; this form is safe to execute repeatedly.
missing_columns = [col for col in columns_to_drop if col not in df.columns]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
if missing_columns:
    print("ℹ️ Columns already absent (skipped):", missing_columns)

# Log the shape and preview of the updated DataFrame
print("✅ Dropped unwanted columns.")
print("Updated DataFrame shape:", df.shape)
print(df.head())


               Lead Origin     Lead Source Do Not Email Do Not Call  \
0                      API      Olark Chat           No          No   
1                      API  Organic Search           No          No   
2  Landing Page Submission  Direct Traffic           No          No   
3  Landing Page Submission  Direct Traffic           No          No   
4  Landing Page Submission          Google           No          No   

   Converted  TotalVisits  Total Time Spent on Website  Page Views Per Visit  \
0          0          0.0                            0                   0.0   
1          0          5.0                          674                   2.5   
2          1          2.0                         1532                   2.0   
3          0          1.0                          305                   1.0   
4          1          2.0                         1428                   1.0   

             Last Activity Country  ...                                 Tags  \
0  Page Visi

### We use FunctionTransformer to integrate our custom category mappings into a Scikit-learn pipeline. It helps apply reusable logic, maintain clean code, and ensure consistency across training and prediction.
### This preprocessing step reduces noise, improves model generalization, and keeps the pipeline clean and interpretable.

In [38]:
def map_categorical_columns(df):
    """
    Collapse raw category labels into coarser groups, column by column.

    For every column listed below that exists in `df`, each value is looked up
    in that column's table. Missing values and labels that have no entry in
    the table both become 'Unknown'. Columns not listed are left untouched.

    Parameters:
        df (pd.DataFrame): Input frame; it is copied, never modified.

    Returns:
        pd.DataFrame: Copy of `df` with the listed columns regrouped.
    """
    fallback = 'Unknown'

    # One lookup table per column: raw label -> grouped label
    column_groupings = {
        'City': {
            'Mumbai': 'Metro India',
            'Thane & Outskirts': 'Metro India',
            'Other Metro Cities': 'Metro India',
            'Other Cities of Maharashtra': 'Tier II India',
            'Tier II Cities': 'Tier II India',
            'Other Cities': 'Other India',
            'Select': 'Unknown',
            'nan': 'Unknown',
            pd.NA: 'Unknown',
            None: 'Unknown',
        },
        'Country': {
            'India': 'India',
            'United States': 'North America',
            'Canada': 'North America',
            'United Arab Emirates': 'Middle East',
            'Saudi Arabia': 'Middle East',
            'Qatar': 'Middle East',
            'Kuwait': 'Middle East',
            'Oman': 'Middle East',
            'Bahrain': 'Middle East',
            'Germany': 'Europe',
            'France': 'Europe',
            'United Kingdom': 'Europe',
            'Sweden': 'Europe',
            'Belgium': 'Europe',
            'Netherlands': 'Europe',
            'Switzerland': 'Europe',
            'China': 'Asia',
            'Singapore': 'Asia',
            'Hong Kong': 'Asia',
            'Philippines': 'Asia',
            'Vietnam': 'Asia',
            'South Africa': 'Africa',
            'Nigeria': 'Africa',
            'nan': 'Unknown',
            'unknown': 'Other',
        },
        'Specialization': {
            'Marketing Management': 'Marketing',
            'Operations Management': 'Operations',
            'Finance Management': 'Finance',
            'Human Resource Management': 'HR',
            'International Business': 'Business',
            'Business Administration': 'Business',
            'MBA': 'Business',
            'IT Projects Management': 'IT',
            'E-Business': 'E-Commerce',
            'E-Commerce': 'E-Commerce',
            'E Commerce': 'E-Commerce',
            'Supply Chain Management': 'Operations',
            'Retail Management': 'Operations',
            'Banking, Investment And Insurance': 'Finance',
            'Healthcare Management': 'Healthcare',
            'Hospitality Management': 'Healthcare',
            'Rural and Agribusiness': 'Other',
            'Travel and Tourism': 'Other',
            'Media and Advertising': 'Other',
            'Services Excellence': 'Other',
            'Not Specified': 'Unknown',
            'Select': 'Unknown',
            'nan': 'Unknown',
            None: 'Unknown',
        },
        'How did you hear about X Education': {
            'Online Search': 'Digital',
            'Advertisements': 'Digital',
            'Email': 'Digital',
            'SMS': 'Digital',
            'Social Media': 'Digital',
            'Word Of Mouth': 'Referral',
            'Student of SomeSchool': 'Referral',
            'Multiple Sources': 'Multi-Channel',
            'Other': 'Other',
            'Select': 'Unknown',
            'nan': 'Unknown',
            None: 'Unknown',
        },
        'What is your current occupation': {
            'Student': 'Student',
            'Working Professional': 'Working',
            'Businessman': 'Working',
            'Housewife': 'Non-Working',
            'Unemployed': 'Non-Working',
            'Other': 'Unknown',
            'Select': 'Unknown',
            'nan': 'Unknown',
            None: 'Unknown',
        },
        'Tags': {
            'Will revert after reading the email': 'Pending Response',
            'Still Thinking': 'Pending Response',
            'Interested in full time MBA': 'Pending Response',
            'Ringing': 'Trying to Contact',
            'Busy': 'Trying to Contact',
            'switched off': 'Trying to Contact',
            'opp hangup': 'Trying to Contact',
            'Interested in other courses': 'Not Interested',
            'Already a student': 'Not Interested',
            'Lost to EINS': 'Not Interested',
            'Lost to Others': 'Not Interested',
            'Not doing further education': 'Not Interested',
            'invalid number': 'Invalid Contact',
            'wrong number given': 'Invalid Contact',
            'number not provided': 'Invalid Contact',
            'Diploma holder (Not Eligible)': 'Not Eligible',
            'Graduation in progress': 'Not Eligible',
            'Closed by Horizzon': 'Converted',
            'Want to take admission but has financial problems': 'Financial Issue',
            'in touch with EINS': 'Transferred',
        },
        'Lead Quality': {
            'High in Relevance': 'High',
            'Might be': 'Medium',
            'Not Sure': 'Medium',
            'Low in Relevance': 'Low',
            'Worst': 'Low',
            'nan': 'Unknown',
            None: 'Unknown',
        },
        'Lead Profile': {
            'Potential Lead': 'Prospective',
            'Other Leads': 'Prospective',
            'Student of SomeSchool': 'Converted',
            'Lateral Student': 'Converted',
            'Dual Specialization Student': 'Converted',
            'Select': 'Unknown',
            'nan': 'Unknown',
            None: 'Unknown',
        },
    }

    def regroup(value, table):
        # Missing values never reach the table lookup
        if pd.isna(value):
            return fallback
        return table.get(value, fallback)

    result = df.copy()
    for column, table in column_groupings.items():
        if column not in result.columns:
            continue
        regrouped = result[column].apply(lambda value, table=table: regroup(value, table))
        result[column] = regrouped.fillna(fallback)
    return result

# Wrap map_categorical_columns so it can be used as a scikit-learn pipeline step;
# validate=False turns off FunctionTransformer's own input validation.
mapping_transformer = FunctionTransformer(map_categorical_columns, validate=False)


In [7]:
# Select numerical columns (int64/float64 only). Note that `numeric_cols` is a
# DataFrame holding those columns, not a list of column names.
numeric_cols = df.select_dtypes(include=['int64', 'float64'])

# Calculate the skewness of each selected column
skewness_values = numeric_cols.skew()

# Display skewness
print(skewness_values)

Lead Number                     0.140451
Converted                       0.471058
TotalVisits                    19.911657
Total Time Spent on Website     0.956450
Page Views Per Visit            2.871793
Asymmetrique Activity Score    -0.383380
Asymmetrique Profile Score      0.221739
dtype: float64


### replace_unknowns_with_nan: Replaces any value that contains the text "unknown" (case-insensitive) in object/category columns with NaN so it can be treated as missing data.

### handle_skewness: Applies a log1p transformation to the non-negative values of numerical columns whose absolute skewness exceeds a threshold (default = 0.5), in order to reduce the skew of their distribution.

### Both functions are wrapped using FunctionTransformer to integrate smoothly into an ML pipeline for consistent preprocessing.

In [39]:
def replace_unknowns_with_nan(df):
    """
    Turn "unknown"-style labels in text columns into missing values.

    Every object/category column is scanned with a case-insensitive regex for
    "unknown"; matching entries are replaced with NaN. The input frame is
    copied and left unchanged.

    Parameters:
        df (pd.DataFrame): Input frame.

    Returns:
        pd.DataFrame: Copy of `df` with the replacements applied.
    """
    cleaned = df.copy()
    text_columns = cleaned.select_dtypes(include=['object', 'category']).columns
    for name in text_columns:
        cleaned[name] = cleaned[name].replace(
            to_replace=r'(?i)unknown', value=np.nan, regex=True
        )
    return cleaned

def handle_skewness(df, threshold=0.5, columns=None):
    """
    Apply log1p to the non-negative values of skewed numeric columns.

    Parameters:
        df (pd.DataFrame): Input frame; it is copied, never modified.
        threshold (float): A numeric column is transformed when the absolute
            value of its skewness is greater than this (default 0.5). Only used
            when `columns` is None.
        columns (list of str, optional): Explicit numeric columns to transform.
            When given, exactly these columns (those present in `df`) are
            transformed and no skewness is computed. Use this to apply the same
            transformation to every batch: with the default (None) the decision
            is recomputed from the skewness of whatever frame is passed in, so
            a small batch (e.g. a single row, whose skewness is NaN) is left
            untransformed even if the training data was transformed.

    Returns:
        pd.DataFrame: Copy of `df` with log1p applied to the selected columns.
        Negative and missing values in those columns are left as they are.
    """
    df = df.copy()

    if columns is None:
        candidates = df.select_dtypes(include=np.number).columns
        # NaN skewness (too few observations) compares False and is skipped
        selected = [col for col in candidates if abs(df[col].skew()) > threshold]
    else:
        selected = [col for col in columns if col in df.columns]

    for col in selected:
        values = df[col]
        non_negative = values >= 0  # False for NaN as well as for negatives
        # Vectorised log1p on the non-negative entries; everything else keeps its original value
        df[col] = np.log1p(values.where(non_negative)).where(non_negative, values)
    return df

# Pipeline-friendly wrappers around the two helpers above
# (validate=False turns off FunctionTransformer's own input validation).
# NOTE(review): handle_skewness measures skewness on whatever frame it receives, so the
# set of log-transformed columns can differ between the training data and a later
# prediction batch — confirm this is acceptable for how the pipeline is used.
replace_unknowns_transformer = FunctionTransformer(replace_unknowns_with_nan, validate=False)
skewness_transformer = FunctionTransformer(handle_skewness, validate=False)


This function constructs a complete Scikit-learn preprocessing pipeline tailored for tabular data that includes both numerical and categorical columns. It prepares the data for machine learning models by handling missing values, encoding, scaling, and applying custom transformations.

Key Steps:
Target Column Exclusion: Removes the target variable "Converted" from feature processing lists.

Label Columns: Certain categorical columns (Lead Quality, Asymmetrique Activity Index, Asymmetrique Profile Index) are encoded using OrdinalEncoder instead of OneHotEncoder.

Pipeline Components:

🔢 num_pipeline: Handles numerical columns with:

Missing value imputation using median.

Feature scaling using MinMaxScaler.

🔤 cat_ohe_pipeline: Handles categorical columns (excluding label-encoded ones) with:

Most frequent value imputation.

One-hot encoding (with unknown handling).

🔢 cat_label_pipeline: Handles specific label columns with:

Most frequent value imputation.

Ordinal encoding (with unknown values handled as -1).

Custom Transformers:

🧹 mapping_transformer: Maps raw text fields to cleaner values.

❓ replace_unknowns_transformer: Replaces "unknown" strings with NaN.

📉 skewness_transformer: Applies log transformation on skewed numerical features.

Output:
Returns a unified Scikit-learn Pipeline object that can be used for both fit and transform operations on the dataset.

In [40]:

def build_full_pipeline(df):
    """
    Build the preprocessing pipeline for the lead-scoring data.

    The column lists are derived from the dtypes of `df`; the target column
    "Converted" is excluded from all of them. The returned pipeline applies, in order:
    mapping_transformer, replace_unknowns_transformer, skewness_transformer and
    a ColumnTransformer that
      - imputes numeric columns with the median and scales them with MinMaxScaler,
      - imputes the remaining categorical columns with the most frequent value
        and one-hot encodes them (unknown categories are ignored),
      - imputes the ordinal columns with the most frequent value and encodes
        them with OrdinalEncoder (unknown categories become -1),
      - passes every other column through unchanged.

    Parameters:
        df (pd.DataFrame): Frame whose columns/dtypes define the column lists.
            It is only inspected, not transformed.

    Returns:
        sklearn.pipeline.Pipeline: Unfitted pipeline.
    """
    target_col = "Converted"

    # Ordinal-encoded columns. Keep only those actually present in df: a name
    # that is missing from the data makes ColumnTransformer fail at fit time.
    label_cols = [
        col
        for col in ("Lead Quality", "Asymmetrique Activity Index", "Asymmetrique Profile Index")
        if col in df.columns and col != target_col
    ]

    # np.number (rather than only int64/float64) matches the dtype selection
    # used by handle_skewness, so every column it may transform is also scaled.
    numeric_cols = [
        col for col in df.select_dtypes(include=np.number).columns if col != target_col
    ]
    categorical_cols = [
        col for col in df.select_dtypes(include=['object', 'category']).columns if col != target_col
    ]

    cat_ohe_cols = [col for col in categorical_cols if col not in label_cols]

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    cat_ohe_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
    ])

    # NOTE(review): OrdinalEncoder is left on its default category ordering, which is
    # presumably sorted label order (e.g. 'High' < 'Low' < 'Medium' for Lead Quality),
    # not the semantic order — confirm whether explicit `categories` are wanted.
    cat_label_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_pipeline, numeric_cols),
        ('cat_ohe', cat_ohe_pipeline, cat_ohe_cols),
        ('cat_label', cat_label_pipeline, label_cols)
    ], remainder='passthrough')

    full_pipeline = Pipeline([
        ('mapping', mapping_transformer),
        ('replace_unknowns', replace_unknowns_transformer),
        ('skewness', skewness_transformer),
        ('preprocessor', preprocessor)
    ])

    full_pipeline.set_output(transform='default')
    return full_pipeline


''' # ------------------------------------------------------------------------------------
# 🧪 Classification Evaluation Utilities
# These helper functions assist in evaluating the performance of classification models
# by computing key metrics and presenting results clearly.
# ------------------------------------------------------------------------------------

# ✅ evaluate_classification_metrics(y_true, y_pred, y_proba=None, average_type='binary')
# -----------------------------------------------------------------------------
# This function calculates and returns a dictionary of classification metrics:
# - accuracy   : Overall correctness of predictions.
# - precision  : Proportion of predicted positives that are actual positives.
# - recall     : Proportion of actual positives correctly predicted.
# - f1_score   : Harmonic mean of precision and recall.
# - roc_auc    : Area under the ROC curve (computed if y_proba is provided).
#
# Parameters:
# - y_true       : Actual target labels.
# - y_pred       : Predicted labels.
# - y_proba      : Predicted probabilities (optional, for ROC AUC).
# - average_type : Metric averaging method (useful for multi-class settings).
# -----------------------------------------------------------------------------

# ✅ print_classification_report(y_true, y_pred)
# -----------------------------------------------------------------------------
# This function prints a full classification report and a confusion matrix:
# - classification_report : Includes precision, recall, f1-score for each class.
# - confusion_matrix      : Matrix comparing true vs. predicted class labels.
# Useful for quick and informative evaluation output.
# ----------------------------------------------------------------------------- '''


In [None]:


def evaluate_classification_metrics(y_true, y_pred, y_proba=None, average_type='binary'):
    """
    Compute the standard classification scores for one set of predictions.

    Parameters:
        y_true: Actual target labels.
        y_pred: Predicted labels.
        y_proba: Predicted scores/probabilities passed to roc_auc_score (optional).
        average_type: Value forwarded as `average` to precision, recall and F1.

    Returns:
        dict: Keys "accuracy", "precision", "recall", "f1_score" and "roc_auc".
        "roc_auc" is None when `y_proba` is not given or when roc_auc_score
        raises ValueError.
    """
    averaging = {"average": average_type, "zero_division": 0}
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, **averaging),
        "recall": recall_score(y_true, y_pred, **averaging),
        "f1_score": f1_score(y_true, y_pred, **averaging),
    }

    # ROC AUC is optional: default to None and fill it in only when it can be computed
    scores["roc_auc"] = None
    if y_proba is not None:
        try:
            scores["roc_auc"] = roc_auc_score(y_true, y_proba)
        except ValueError:
            pass
    return scores

def print_classification_report(y_true, y_pred):
    """Print the classification report followed by the confusion matrix for the given labels."""
    sections = (
        ("\n✅ Classification Report:", classification_report),
        ("\n✅ Confusion Matrix:", confusion_matrix),
    )
    for heading, build in sections:
        print(heading)
        print(build(y_true, y_pred))


# -----------------------------------------------------------------------------------------------
# 🚂 train_log_and_shap_classification(X_train, y_train, X_val, y_val, preprocessor, ...)
# ------------------------------------------------------------------------------------------------
# This function:
# ✅ Trains multiple classification models using GridSearchCV with preprocessing pipeline
# ✅ Evaluates models on validation set using accuracy, precision, recall, f1, and ROC AUC
# ✅ Saves best models locally using joblib
# ✅ Logs model parameters and metrics to MLflow
# ✅ Generates and logs SHAP summary plots for model explainability
#
# Models included:
# - Logistic Regression
# - Decision Tree
# - Random Forest
# - XGBoost
#
# Parameters:
# - X_train, y_train       : Training feature set and labels
# - X_val, y_val           : Validation feature set and labels
# - preprocessor           : Preprocessing pipeline (e.g., ColumnTransformer)
# - save_dir (str)         : Directory to save best models
# - shap_dir (str)         : Directory to save SHAP plots
#
# Returns:
# - results_df (DataFrame) : Metrics summary for all models
# - best_models (dict)     : Dictionary of best trained models per algorithm
# ------------------------------------------------------------------------------------------------


In [None]:


def _log_shap_summary(name, fitted_pipeline, X_val, shap_dir):
    """Best-effort: save a SHAP summary plot for one fitted pipeline and log it to the active MLflow run."""
    try:
        print(f"🔎 Generating SHAP values for {name}...")
        fitted_preprocessor = fitted_pipeline.named_steps['preprocess']
        X_val_proc = fitted_preprocessor.transform(X_val)
        # SHAP is given a dense matrix; the preprocessor may return a sparse one
        shap_matrix = X_val_proc.toarray() if hasattr(X_val_proc, "toarray") else X_val_proc
        model_only = fitted_pipeline.named_steps['model']
        if name in ("RandomForest", "XGBoost", "LightGBM", "DecisionTree"):
            explainer = shap.TreeExplainer(model_only)
        else:
            explainer = shap.Explainer(model_only, shap_matrix)
        shap_values = explainer(shap_matrix)
        shap_path = os.path.join(shap_dir, f"{name}_shap_summary.png")
        plt.figure()
        try:
            shap.summary_plot(shap_values, shap_matrix, show=False)
            plt.savefig(shap_path, bbox_inches='tight')
        finally:
            # Close the figure even when plotting fails, so figures do not pile up across models
            plt.close()
        mlflow.log_artifact(shap_path, artifact_path="shap_plots")
        print(f"✅ SHAP saved & logged: {shap_path}")
    except Exception as e:
        # Explainability is optional: report the problem and keep the training loop going
        print(f"⚠️ SHAP failed for {name}: {e}")


def train_log_and_shap_classification(
    X_train, y_train, X_val, y_val, preprocessor,
    save_dir="saved_models", shap_dir="shap_outputs",
    mlflow_uri="http://localhost:5000", experiment_name="LeadScoring_Simplified"
):
    """
    Tune, evaluate, save and log several classifiers, each behind `preprocessor`.

    For every model a Pipeline(preprocess -> model) is tuned with GridSearchCV
    (5-fold stratified CV, scoring='f1'), evaluated on the validation set,
    dumped with joblib into `save_dir`, and logged to MLflow together with its
    best parameters, metrics and (best-effort) a SHAP summary plot.

    Parameters:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        preprocessor: Preprocessing step placed in front of every model.
        save_dir (str): Directory for the joblib dumps of the best pipelines.
        shap_dir (str): Directory for the SHAP summary plots.
        mlflow_uri (str): MLflow tracking URI.
        experiment_name (str): MLflow experiment to log into.

    Returns:
        tuple: (results_df, best_models) where results_df has one row per model
        with its best parameters and validation metrics, and best_models maps
        the model name to the fitted best pipeline.
    """
    models = {
        'LogisticRegression': {
            'model': LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42),
            'params': {'C': [0.1, 1.0, 10.0]}
        },
        'DecisionTree': {
            'model': DecisionTreeClassifier(class_weight='balanced', random_state=42),
            'params': {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
        },
        'RandomForest': {
            'model': RandomForestClassifier(class_weight='balanced', random_state=42),
            'params': {'n_estimators': [100, 200], 'max_depth': [None, 10]}
        },
        # NOTE(review): scale_pos_weight=1 applies no class re-weighting, unlike the
        # class_weight='balanced' models above — confirm this difference is intended.
        'XGBoost': {
            'model': XGBClassifier(scale_pos_weight=1, use_label_encoder=False, eval_metric='logloss', random_state=42),
            'params': {'n_estimators': [100, 200], 'max_depth': [3, 6]}
        }
    }

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(shap_dir, exist_ok=True)

    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)

    results = []
    best_models = {}

    for name, model_info in models.items():
        print(f"\n🔧 Training: {name}")

        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('model', model_info['model'])
        ])

        # Prefix the grid keys so they address the 'model' step of the pipeline
        param_grid = {f"model__{k}": v for k, v in model_info['params'].items()}
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_train, y_train)

        y_val_pred = search.predict(X_val)
        y_val_proba = search.predict_proba(X_val)[:, 1] if hasattr(search.best_estimator_.named_steps['model'], "predict_proba") else None

        metrics = evaluate_classification_metrics(y_val, y_val_pred, y_val_proba)
        results.append({"model": name, "best_params": search.best_params_, **metrics})
        best_models[name] = search.best_estimator_

        model_path = os.path.join(save_dir, f"{name}_best_model.pkl")
        joblib.dump(search.best_estimator_, model_path)

        with mlflow.start_run(run_name=name):
            mlflow.log_params(search.best_params_)
            # MLflow metrics must be numeric: drop entries such as roc_auc=None
            # (set when ROC AUC could not be computed) instead of failing the run.
            mlflow.log_metrics({key: value for key, value in metrics.items() if value is not None})
            mlflow.sklearn.log_model(search.best_estimator_, "model")

            _log_shap_summary(name, search.best_estimator_, X_val, shap_dir)

    results_df = pd.DataFrame(results)
    print("\n📊 All Model Validation Metrics:")
    print(results_df[["model", "accuracy", "precision", "recall", "f1_score", "roc_auc"]].to_string(index=False))

    return results_df, best_models


# ------------------------------------------------------------------------------------------------------
# 💾 save_and_register_best_model_pipeline(...)
# ------------------------------------------------------------------------------------------------------
# This function performs the final model selection, saving, and MLflow model registration steps:
#
# ✅ Selects the best model based on highest F1 score from results_df
# ✅ Combines the selected model and preprocessor into a full pipeline
# ✅ Fits the full pipeline on the combined training + validation set
# ✅ Saves both the full pipeline and preprocessing pipeline using joblib
# ✅ Logs the model and artifacts to MLflow and registers it under the best model's name
# ✅ Transitions the newly registered model version to the "Staging" stage in MLflow Model Registry
#
# Parameters:
# - results_df (DataFrame): Evaluation results of all models (must include 'model' and 'f1_score' columns)
# - best_models (dict)    : Dictionary of trained models keyed by model name
# - X_train_val           : Combined training and validation features
# - y_train_val           : Combined training and validation labels
# - preprocessor          : Preprocessing pipeline used before model training
# - save_dir (str)        : Directory where models and pipelines are saved locally
# - experiment_name (str) : MLflow experiment name for logging and tracking
#
# MLflow Requirements:
# - MLflow tracking server must be running locally on port 5000
# - MLflow model registry is used to version and stage the final model
# ------------------------------------------------------------------------------------------------------


In [None]:


def save_and_register_best_model_pipeline(
    results_df, best_models, X_train_val, y_train_val, preprocessor,
    save_dir="saved_models", experiment_name="LeadScoring_Simplified",
    mlflow_uri="http://localhost:5000"
):
    """
    Refit the best model on train+validation data, save it and register it in MLflow.

    Parameters:
        results_df (pd.DataFrame): One row per model; must contain the columns
            'model' and 'f1_score'. The row with the highest 'f1_score' wins.
        best_models (dict): Trained models (or pipelines with a 'model' step)
            keyed by model name.
        X_train_val, y_train_val: Combined training and validation data used
            for the final fit.
        preprocessor: Preprocessing step placed in front of the model. It is
            fitted as part of the final pipeline and then saved on its own.
        save_dir (str): Directory for the joblib dumps.
        experiment_name (str): MLflow experiment to log into.
        mlflow_uri (str): MLflow tracking URI.

    Returns:
        sklearn.pipeline.Pipeline: The fitted final pipeline.

    Raises:
        ValueError: If `results_df` is empty.
    """
    # Local import: `clone` is needed only here and is not part of the notebook's import cell
    from sklearn.base import clone

    if results_df is None or len(results_df) == 0:
        raise ValueError("results_df is empty: there is no trained model to select.")

    os.makedirs(save_dir, exist_ok=True)

    # 1. Select best model
    best_row = results_df.sort_values(by="f1_score", ascending=False).iloc[0]
    best_model_name = best_row["model"]
    best_model = best_models[best_model_name]
    print(f"\n🏆 Best model selected: {best_model_name} (F1 = {best_row['f1_score']:.4f})")

    # 2. Final pipeline
    # Refit a clone (same hyperparameters, unfitted) rather than the estimator object
    # stored in best_models: refitting that object in place would leave the stored
    # validation pipeline with a model that no longer matches its fitted preprocessor.
    tuned_estimator = best_model.named_steps['model'] if hasattr(best_model, 'named_steps') else best_model
    final_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("model", clone(tuned_estimator))
    ])
    final_pipeline.fit(X_train_val, y_train_val)

    # 3. Save final model pipeline locally
    model_path = os.path.join(save_dir, f"final_{best_model_name}_pipeline.pkl")
    joblib.dump(final_pipeline, model_path)
    print(f"✅ Final pipeline saved at: {model_path}")

    # 4. Save preprocessor pipeline locally
    # (same object as the pipeline's first step, so it is saved in its fitted state)
    preprocessor_path = os.path.join(save_dir, "final_preprocessor.pkl")
    joblib.dump(preprocessor, preprocessor_path)
    print(f"✅ Preprocessing pipeline saved at: {preprocessor_path}")

    # 5. Register to MLflow
    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)
    client = MlflowClient()

    with mlflow.start_run(run_name=f"Final_{best_model_name}") as run:
        run_id = run.info.run_id

        # (a) Save local artifacts to MLflow
        mlflow.log_artifact(model_path, artifact_path="model_artifacts")
        mlflow.log_artifact(preprocessor_path, artifact_path="preprocessing_artifacts")

        # (b) Log full pipeline as MLflow model (for loading later)
        mlflow.sklearn.log_model(
            sk_model=final_pipeline,
            artifact_path="sklearn_model",
            registered_model_name=best_model_name
        )

        print(f"🔁 Registering model to Model Registry: {best_model_name}")
        # Staging is best-effort: a failure here is reported but does not undo the
        # run, because the model itself has already been logged and registered.
        try:
            # Latest version still in the "None" stage, i.e. the one just logged
            versions = client.get_latest_versions(name=best_model_name, stages=["None"])
            if versions:
                version = versions[0].version
                client.transition_model_version_stage(
                    name=best_model_name,
                    version=version,
                    stage="Staging",
                    archive_existing_versions=True
                )
                print(f"✅ Model '{best_model_name}' v{version} moved to 'Staging'.")
            else:
                print(f"⚠️ Could not find model version to move to 'Staging'.")
        except Exception as e:
            print(f"⚠️ Transition to 'Staging' failed: {e}")

        print(f"🏃 View run: {mlflow_uri.rstrip('/')}/#/experiments/{run.info.experiment_id}/runs/{run_id}")

    return final_pipeline


In [None]:

def generate_and_log_drift_reports(
    X_train, X_val, X_test,
    feature_names=None,
    output_dir="drift_reports",
    mlflow_uri="http://127.0.0.1:5000",
    experiment_name="Drift"
):
    """
    Generates Evidently Data Drift reports comparing train/val/test,
    saves them as HTML, and logs both artifacts and metrics into MLflow.

    Three comparisons are run inside a single MLflow run named
    "multi_split_drift" (reference vs current): train vs val, train vs test,
    and val vs test. For each one the HTML report is written to `output_dir`
    and logged as an artifact; if the report contains a "DataDriftTable"
    result, the share of drifted columns and each column's drift score are
    logged as metrics.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame or array-like
        Feature data for each split. Non-DataFrame inputs are wrapped in a
        DataFrame using `feature_names` (or generated "feature_<i>" names).
    feature_names : sequence of str, optional
        Column names applied to non-DataFrame inputs. Ignored for inputs
        that are already DataFrames.
    output_dir : str
        Directory where the HTML reports are written (created if missing).
    mlflow_uri : str
        MLflow tracking URI.
    experiment_name : str
        MLflow experiment that receives the run.

    Returns
    -------
    dict
        A dictionary with drift metrics for each comparison:
        {comparison: {"drift_ratio": float, "column_scores": {feature: float}}}.
        A comparison whose report has no "DataDriftTable" result is omitted.
    """
    import re  # stdlib; imported locally so the cell stays self-contained

    # Helper to ensure DataFrame
    def ensure_df(data, feature_names):
        if isinstance(data, pd.DataFrame):
            return data
        # Build the frame first so inputs without a `.shape` attribute
        # (e.g. nested lists) are accepted as well as NumPy arrays.
        frame = pd.DataFrame(data)
        if feature_names is not None:
            frame.columns = list(feature_names)
        else:
            frame.columns = [f"feature_{i}" for i in range(frame.shape[1])]
        return frame

    # Helper to build a metric key MLflow will accept
    def metric_key(prefix, feature):
        # Same cleaning as before (spaces -> "_", parentheses dropped) ...
        base = str(feature).replace(" ", "_").replace("(", "").replace(")", "")
        # ... then neutralise anything else that is not a word character,
        # dash, period or slash, so one odd column name cannot abort the run.
        return f"{prefix}_" + re.sub(r"[^\w\-./]", "_", base)

    X_train = ensure_df(X_train, feature_names)
    X_val = ensure_df(X_val, feature_names)
    X_test = ensure_df(X_test, feature_names)

    os.makedirs(output_dir, exist_ok=True)

    mlflow.set_tracking_uri(mlflow_uri)
    mlflow.set_experiment(experiment_name)

    # (comparison name, reference data, current data)
    comparisons = [
        ("train_vs_val", X_train, X_val),
        ("train_vs_test", X_train, X_test),
        ("val_vs_test", X_val, X_test)
    ]

    results_summary = {}

    with mlflow.start_run(run_name="multi_split_drift") as run:
        for name, ref, curr in comparisons:
            print(f"🚀 Running drift check: {name}")
            report = Report(metrics=[DataDriftPreset()])
            report.run(reference_data=ref, current_data=curr)

            # Save HTML artifact
            html_path = os.path.join(output_dir, f"{name}.html")
            report.save_html(html_path)
            mlflow.log_artifact(html_path, artifact_path="evidently_html_reports")

            # Pull the per-column drift table out of the report's dict form
            json_dict = report.as_dict()
            drift_result = next(
                (m["result"] for m in json_dict["metrics"] if m.get("metric") == "DataDriftTable"),
                None
            )

            if not drift_result:
                # Only the HTML artifact exists for this comparison; say so
                # instead of reporting metrics that were never logged.
                print(f"⚠️ No DataDriftTable result found for {name}; only the HTML report was logged.\n")
                continue

            drift_ratio = drift_result.get("share_of_drifted_columns", 0)
            mlflow.log_metric(f"{name}_drift_ratio", round(drift_ratio, 4))

            column_metrics = {}
            for feature, vals in drift_result.get("drift_by_columns", {}).items():
                score = vals.get("drift_score")
                if score is None:
                    continue
                rounded = round(score, 4)
                mlflow.log_metric(metric_key(name, feature), rounded)
                # The returned summary keeps the original feature name as key
                column_metrics[feature] = rounded

            results_summary[name] = {
                "drift_ratio": round(drift_ratio, 4),
                "column_scores": column_metrics
            }

            print(f"✅ Drift metrics for {name} logged to MLflow.\n")

        print(f"🎯 Drift reports & metrics logged under run ID: {run.info.run_id}")

    return results_summary


# ------------------------------------------------------------------------------------------------------
# 📊 generate_and_log_drift_reports(...)
# ------------------------------------------------------------------------------------------------------
# This function automates data drift detection using the Evidently library for multiple dataset splits:
#
# ✅ Compares:
#    - Train vs Validation
#    - Train vs Test
#    - Validation vs Test
# ✅ Generates HTML drift reports using Evidently's DataDriftPreset
# ✅ Saves the reports locally in a specified output directory
# ✅ Logs both the HTML artifacts and drift metrics to MLflow
#
# Parameters:
# - X_train (array-like or DataFrame): Training feature data
# - X_val (array-like or DataFrame): Validation feature data
# - X_test (array-like or DataFrame): Test feature data
# - feature_names (list, optional): List of feature names (used if inputs are arrays)
# - output_dir (str): Directory path to save the HTML drift reports (default: "drift_reports")
# - mlflow_uri (str): MLflow tracking URI (default: "http://127.0.0.1:5000")
# - experiment_name (str): MLflow experiment name to log drift results (default: "Drift")
#
# Returns:
# - dict: Summary of drift results for each pair, including drift ratios and per-feature scores
#
# MLflow Requirements:
# - Must be connected to a local or remote MLflow tracking server
# - Logs HTML artifacts and numerical drift scores as metrics
#
# Example Output:
# {
#     "train_vs_val": {
#         "drift_ratio": 0.25,
#         "column_scores": {"Feature_A": 0.67, "Feature_B": 0.55}
#     },
#     ...
# }
# ------------------------------------------------------------------------------------------------------


In [None]:

def run_lead_prediction_pipeline(
    csv_file_path=None,
    table_name=None,
    experiment_name="LeadScoring_Simplified",
    save_dir="saved_models",
    shap_dir="shap_outputs",
    drift_dir="drift_reports"
):
    """
    Run the lead-scoring workflow end to end: ingest data, drop unused
    columns, build the preprocessor, split into train/val/test, log drift
    reports, train the candidate models, then save and register the best one.

    Parameters
    ----------
    csv_file_path : str, optional
        Passed through to `data_ingestion` as `csv_file_path`.
    table_name : str, optional
        Table name passed to `data_ingestion`. Defaults to "lead" when
        omitted, which matches the previously hard-coded value.
    experiment_name : str
        MLflow experiment name forwarded to
        `save_and_register_best_model_pipeline`.
    save_dir : str
        Directory forwarded to the training and save/register steps.
    shap_dir : str
        Directory forwarded to the training step for SHAP outputs.
    drift_dir : str
        Output directory for the drift HTML reports.

    Raises
    ------
    ValueError
        If `data_ingestion` does not return a pandas DataFrame.
    """
    # 1. Ingest (honour the caller's table name; fall back to "lead")
    resolved_table = table_name if table_name is not None else "lead"
    df = data_ingestion(csv_file_path=csv_file_path, table_name=resolved_table)
    # isinstance() is already False for None, so one check covers both cases
    if not isinstance(df, pd.DataFrame):
        raise ValueError("❌ Data ingestion failed: No DataFrame returned.")

    # 2. Drop columns (customize as needed)
    drop_cols = [
        'Prospect ID', 'Lead Number', 'Get updates on DM Content',
        'Receive More Updates About Our Courses', 'I agree to pay the amount through cheque',
        'Magazine', 'Update me on Supply Chain Content'
    ]
    # errors='ignore': columns missing from the frame are skipped
    df = df.drop(columns=drop_cols, errors='ignore')

    # 3. Build preprocessor
    # NOTE(review): the frame still contains the target column here —
    # confirm build_full_pipeline excludes it from the feature transformers.
    preprocessor = build_full_pipeline(df)

    # 4. Train/Val/Test split
    # 20% test, then 25% of the remaining 80% as validation -> 60/20/20,
    # stratified on the target both times.
    target_col = "Converted"
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
    )
    # In this function X_test is only used for the drift comparison below;
    # y_test is not used.

    # 5. Data drift reports
    feature_names = X_train.columns if hasattr(X_train, "columns") else None
    generate_and_log_drift_reports(
        X_train, X_val, X_test,
        feature_names=feature_names,
        output_dir=drift_dir,
        mlflow_uri="http://127.0.0.1:5000",
        experiment_name="Drift"
    )

    # 6. Model training & SHAP
    results_df, best_models = train_log_and_shap_classification(
        X_train, y_train, X_val, y_val, preprocessor,
        save_dir=save_dir, shap_dir=shap_dir
    )

    # 7. Save & Register best model and preprocessor
    # Train and validation rows are combined and handed to the save/register step.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])
    save_and_register_best_model_pipeline(
        results_df, best_models, X_train_val, y_train_val, preprocessor,
        save_dir=save_dir, experiment_name=experiment_name
    )

# Entry point: run the whole pipeline on "Lead Scoring.csv" (a relative path)
# when this code executes as __main__.
if __name__ == "__main__":
    run_lead_prediction_pipeline(csv_file_path="Lead Scoring.csv")


2025/07/18 21:05:57 INFO mlflow.tracking.fluent: Experiment with name 'Drift' does not exist. Creating a new experiment.


✅ Data uploaded to PostgreSQL table 'lead' successfully.
✅ Data retrieved from PostgreSQL table 'lead':
                            Prospect ID  Lead Number              Lead Origin  \
0  7927b2df-8bba-4d29-b9a2-b6e0beafe620       660737                      API   
1  2a272436-5132-4136-86fa-dcc88c88f482       660728                      API   
2  8cc8c611-a219-4f35-ad23-fdfd2656bd8a       660727  Landing Page Submission   
3  0cc2df48-7cf4-4e39-9de9-19797f9b38cc       660719  Landing Page Submission   
4  3256f628-e534-4826-9d63-4a8b88782852       660681  Landing Page Submission   

      Lead Source Do Not Email Do Not Call  Converted  TotalVisits  \
0      Olark Chat           No          No          0          0.0   
1  Organic Search           No          No          0          5.0   
2  Direct Traffic           No          No          1          2.0   
3  Direct Traffic           No          No          0          1.0   
4          Google           No          No          1      

2025/07/18 21:06:09 INFO mlflow.tracking.fluent: Experiment with name 'LeadScoring_Simplified' does not exist. Creating a new experiment.



🔧 Training: LogisticRegression
Fitting 5 folds for each of 3 candidates, totalling 15 fits
🔎 Generating SHAP values for LogisticRegression...
✅ SHAP saved & logged: shap_outputs\LogisticRegression_shap_summary.png

🔧 Training: DecisionTree
Fitting 5 folds for each of 6 candidates, totalling 30 fits
🔎 Generating SHAP values for DecisionTree...
✅ SHAP saved & logged: shap_outputs\DecisionTree_shap_summary.png

🔧 Training: RandomForest
Fitting 5 folds for each of 4 candidates, totalling 20 fits
🔎 Generating SHAP values for RandomForest...
✅ SHAP saved & logged: shap_outputs\RandomForest_shap_summary.png

🔧 Training: XGBoost
Fitting 5 folds for each of 4 candidates, totalling 20 fits
🔎 Generating SHAP values for XGBoost...
✅ SHAP saved & logged: shap_outputs\XGBoost_shap_summary.png

🔧 Training: LightGBM
Fitting 5 folds for each of 4 candidates, totalling 20 fits
🔎 Generating SHAP values for LightGBM...
✅ SHAP saved & logged: shap_outputs\LightGBM_shap_summary.png

📊 All Model Validation 

Successfully registered model 'RandomForest'.
2025/07/18 21:10:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 1


🔁 Registering model to Model Registry: RandomForest
✅ Model 'RandomForest' v1 moved to 'Staging'.
🏃 View run: http://localhost:5000/#/experiments/958017348571353122/runs/0e6f690eebf04a12bedb0f46038c54ea


Created version '1' of model 'RandomForest'.


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>