In [2]:
import pandas as pd
from preprocessing import preprocess_data
from mlflow_runner import mlflow_run_with_grid_search  
from model_training import evaluate_model

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
import os
import warnings
import logging

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("lightgbm").setLevel(logging.ERROR)
logging.getLogger("mlflow").setLevel(logging.ERROR)

DB_HOST = os.getenv("DB_HOST", "localhost")
db_config = {
    "dbname": "sales_conversion",
    "user": "kanikeashritha",
    "password": "ash",
    "host": DB_HOST,
    "port": "5432"
}

def main():
    # from src.shap import explain_model_with_shap

    print("üóÑÔ∏è Loading data from PostgreSQL...")
    df = pd.read_csv("lc.csv")

    def clean_column_names(columns):
        return columns.str.replace(r'[{}[\]<>"\',: ]', '_', regex=True)

    # ‚úÖ Now perform full preprocessing (encoding, imputation etc.)
    print("‚öôÔ∏è Running full preprocessing...")
    X_processed_df, y ,preprocessor,n,c= preprocess_data(df, training=True)
    
# After preprocessing
    df_cleaned = pd.concat([X_processed_df, y], axis=1)

    print("‚úÇÔ∏è Splitting data...")
    X = df_cleaned.drop(columns=["Converted"])
    y = df_cleaned["Converted"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # ‚úÖ Model registry for classification with param grids
    model_registry = {
        "LogisticRegression": (
            LogisticRegression(class_weight='balanced'),
            {"model__C": [0.1, 1.0, 10.0]}
        ),
        "RandomForest": (
            RandomForestClassifier(class_weight='balanced'),
            {"model__n_estimators": [100, 200], "model__max_depth": [5, 10]}
        ),
        "XGBoost": (
            XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            {"model__n_estimators": [100, 150], "model__learning_rate": [0.05, 0.1]}
        ),
        "LightGBM": (
            LGBMClassifier(verbose=-1),
            {"model__n_estimators": [100, 200], "model__num_leaves": [31, 50]}
        ),
        "DecisionTree": (
            DecisionTreeClassifier(),
            {"model__max_depth": [5, 10, None]}
        )
    }

    for model_name, (model, _) in model_registry.items():
        evaluate_model(X_train, y_train, X_test, y_test, model_name, model)


    mlflow_run_with_grid_search(X_train, X_test, y_train, y_test, model_registry)

    # Optional SHAP explainability:
    # sample_X = df_cleaned.sample(100, random_state=42).drop(columns=["Converted"])
    # explain_model_with_shap(sample_X)

if __name__ == "__main__":
    main()


üóÑÔ∏è Loading data from PostgreSQL...
‚öôÔ∏è Running full preprocessing...
‚úÇÔ∏è Splitting data...

üß† Training model: LogisticRegression
‚úÖ Accuracy:  0.5000
‚úÖ Precision: 0.0000
‚úÖ Recall:    0.0000
‚úÖ F1-score:  0.0000
‚úÖ AUC-ROC:   1.0000

üß† Training model: RandomForest
‚úÖ Accuracy:  0.5000
‚úÖ Precision: 0.0000
‚úÖ Recall:    0.0000
‚úÖ F1-score:  0.0000
‚úÖ AUC-ROC:   1.0000

üß† Training model: XGBoost
‚úÖ Accuracy:  0.5000
‚úÖ Precision: 0.0000
‚úÖ Recall:    0.0000
‚úÖ F1-score:  0.0000
‚úÖ AUC-ROC:   0.5000

üß† Training model: LightGBM
‚úÖ Accuracy:  0.5000
‚úÖ Precision: 0.0000
‚úÖ Recall:    0.0000
‚úÖ F1-score:  0.0000
‚úÖ AUC-ROC:   0.5000

üß† Training model: DecisionTree
‚úÖ Accuracy:  1.0000
‚úÖ Precision: 1.0000
‚úÖ Recall:    1.0000
‚úÖ F1-score:  1.0000
‚úÖ AUC-ROC:   1.0000
üöÄ Training 5 models...

üìä Training LogisticRegression...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
‚úÖ LogisticRegression ‚Äî Acc: 0.5000, Prec: 0.2500, 

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



‚úÖ XGBoost ‚Äî Acc: 0.5000, Prec: 0.2500, Recall: 0.5000, F1: 0.3333
üèÉ View run XGBoost_Classifier at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/2148f231bf5c468c8e6d74d87d47769c
üß™ View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1

üìä Training LightGBM...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
‚úÖ LightGBM ‚Äî Acc: 0.5000, Prec: 0.2500, Recall: 0.5000, F1: 0.3333
üèÉ View run LightGBM_Classifier at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/fafc4a4779d74dc48bfd152e91bc5ecd
üß™ View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1

üìä Training DecisionTree...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
‚úÖ DecisionTree ‚Äî Acc: 1.0000, Prec: 1.0000, Recall: 1.0000, F1: 1.0000
üèÉ View run DecisionTree_Classifier at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/2ec477aac0814d9b8c260dfd378b34d1
üß™ View experiment 