# Fraud Detection: MLOps-Integrated Experimentation Path (Aligned with Original)

This notebook follows the experimentation path for fraud detection models, integrating MLOps principles using Azure ML, MLflow, and DVC.
**Crucially, this version is meticulously aligned with the exact functions, parameters, dates, and hyperparameter grids from the original `model_building_fraud.ipynb` provided.**
It includes:
1. Setup: Connecting to Azure ML, initializing MLflow, defining parameters and DVC paths.
2. Loading Versioned Data (Transformed Features)
3. Defining Training and Test Sets (Temporal Split with Delay - using original dates and functions)
4. Model Selection Process (Prequential Validation with MLflow Tracking - using original grids and functions)
5. Training the Final Model (Logging parameters and artifacts - using original dates and functions)
6. Evaluating and Logging the Final Model Performance (using original dates and functions)
7. Registering the Model in Azure ML

Visualizations and performance comparisons are included, with results logged via MLflow, aiming to replicate the original notebook's results.

## 0. Setup: MLOps Integration & Imports

In [None]:
# --- Core Libraries (from original + MLOps) ---
import os
import pandas as pd
import numpy as np
import math
import sys
import time
import pickle
import json
import datetime
import random
import joblib # For saving/loading model locally

# --- MLOps & Tracking Libraries ---
import mlflow
import mlflow.sklearn # Required for autologging sklearn pipelines
import azureml.core
from azureml.core import Workspace, Experiment, Model, Dataset 
import dvc.api # For potentially reading params/metadata from DVC
import git # For logging git commit

# --- ML Libraries (from original) ---
import sklearn
from sklearn import *
# from sklearn import metrics, preprocessing, model_selection, pipeline, tree, ensemble, linear_model # Be explicit if needed
import imblearn # Used in shared functions, though maybe not in main path
import xgboost

# --- Visualization (from original) ---
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})
import graphviz # For plotting decision trees
import warnings

# --- Settings (from original + MLOps) ---
# Notebook-only call: get_ipython() is provided by the IPython kernel, so this
# cell fails with NameError when run as a plain Python script.
get_ipython().run_line_magic('matplotlib', 'inline')
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')
import pkg_resources # For checking azureml-mlflow version

# Print the versions of the main libraries so that a run can be traced back to
# the environment that produced it.
print(f"mlflow version: {mlflow.__version__}")
print(f"azureml-core version: {azureml.core.VERSION}")
try:
    # azureml-mlflow is looked up by distribution name; a missing distribution
    # is reported instead of aborting the cell.
    print(f"azureml-mlflow version: {pkg_resources.get_distribution('azureml-mlflow').version}")
except pkg_resources.DistributionNotFound:
    print("azureml-mlflow is NOT installed.")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"xgboost version: {xgboost.__version__}")

# --- Azure ML Workspace Connection ---
# Best-effort connection: any failure is reported and leaves ws = None so the
# rest of the cell can fall back to local MLflow tracking.
try:
    # Assumes config.json is in the specified path or notebook directory
    ws = Workspace.from_config(path='./config.json') # Adjust path if needed
    print(f"Connected to Azure ML Workspace: {ws.name} in {ws.location}")
except Exception as e:
    print(f"Could not load workspace from config.json: {e}")
    print("Ensure config.json is present or Azure ML environment is configured.")
    ws = None

# --- MLflow Integration with Azure ML ---
# Point MLflow at the workspace tracking server when a workspace is available;
# otherwise MLflow keeps its default tracking URI.
if ws:
    mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
    print(f"MLflow tracking URI set to Azure ML: {mlflow.get_tracking_uri()}")
else:
    print("WARNING: Azure ML Workspace not connected. MLflow will track locally.")

# --- Define MLflow Experiment ---
# The commented-out name below is a previously used experiment name.
# experiment_name = 'Credit_Risk_Fraud_Detection_MLOps_Final'
experiment_name = 'New_Final_Credit_Risk_Fraud_Detection_MLOps'
mlflow.set_experiment(experiment_name)
# The Azure ML Experiment handle is only created when a workspace is connected.
azure_experiment = Experiment(workspace=ws, name=experiment_name) if ws else None
print(f"MLflow experiment set to: {experiment_name}")

# --- Input dataset (registered Azure ML Dataset) ---
INPUT_DATASET_NAME = "transformed_fraud_data"
input_dataset = None # Stays None when no workspace is connected

print(f"\nAttempting to retrieve Azure ML Dataset '{INPUT_DATASET_NAME}'...")

if ws:
    # Get the LATEST version of the dataset by name.
    # A lookup failure (e.g. unknown dataset name) is left to propagate: the
    # notebook cannot continue without its input data.
    input_dataset = Dataset.get_by_name(workspace=ws, name=INPUT_DATASET_NAME)
    print("Successfully retrieved dataset:")
    print(f"  Name: {input_dataset.name}")
    print(f"  Version: {input_dataset.version} (Latest)")
    print(f"  ID: {input_dataset.id}")
else:
    # The workspace connection above is best-effort and may have left ws = None.
    # Calling Dataset.get_by_name without a workspace would fail with an
    # unrelated-looking error, so report the real cause instead.
    print(f"WARNING: Azure ML Workspace not connected. Dataset '{INPUT_DATASET_NAME}' "
          "was not retrieved; input_dataset is None.")

# --- Define Output Model Information ---
MODEL_OUTPUT_DIR = "outputs/model"
MODEL_NAME_AML = "credit_risk_fraud_model_mlops_Final"
# Create the local output folder up front so later cells can write to it.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)



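## 0.1 Shared Functions Definition (Adapted from Original)

These functions are taken from the original `model_building_fraud.ipynb` notebook (cell `ab97f2268d8f4ddea5577c96fa1571f6`) so that the core logic stays aligned with it. They are not byte-for-byte copies: input validation, empty-data handling, and warning messages have been added, `pd.concat` is used instead of `append`, and `threshold_based_metrics` reports 0 (the original used 1) for precision, FDR, NPV, and FOR when their denominator is zero.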

In [None]:
# ## Loading and saving data

# ### read_from_files
#
# First use in [Chapter 3, Baseline Feature Transformation](Baseline_Feature_Transformation).

# In[ ]:


# Load a set of pickle files, put them together in a single DataFrame, and order them by time
# It takes as input the folder DIR_INPUT where the files are stored, and the BEGIN_DATE and END_DATE
def read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE):
    """Load the pickle files of a date range into one time-ordered DataFrame.

    Files are selected by name: every ``*.pkl`` file in DIR_INPUT whose name
    lies between ``BEGIN_DATE + '.pkl'`` and ``END_DATE + '.pkl'`` (string
    comparison, both ends included) is loaded.

    Args:
        DIR_INPUT: Folder that contains the pickle files.
        BEGIN_DATE: First file name (without extension) to load.
        END_DATE: Last file name (without extension) to load.

    Returns:
        The concatenated transactions sorted by TRANSACTION_ID with a fresh
        0..n-1 index, every -1 replaced by 0, and TX_DATETIME converted to
        datetime when possible. An empty DataFrame is returned when the folder
        is missing, no file matches, or no file could be loaded.
    """
    # NOTE(review): pd.read_pickle executes arbitrary code on load; only point
    # this function at trusted files.
    if not os.path.isdir(DIR_INPUT):
        print(f"ERROR: Input directory not found: {DIR_INPUT}")
        return pd.DataFrame()

    try:
        # Sorting by name orders the files chronologically.
        files = sorted(
            os.path.join(DIR_INPUT, name)
            for name in os.listdir(DIR_INPUT)
            if name.endswith('.pkl') and BEGIN_DATE+'.pkl' <= name <= END_DATE+'.pkl'
        )
    except FileNotFoundError:
        print(f"ERROR: Input directory not found during listdir: {DIR_INPUT}")
        return pd.DataFrame()

    if not files:
        print(f"WARNING: No '.pkl' files found in {DIR_INPUT} for date range {BEGIN_DATE} to {END_DATE}")
        return pd.DataFrame()

    print(f"Found {len(files)} files to load.")
    loaded = []
    for f in files:
        # Best effort: an unreadable or malformed file is reported and skipped.
        try:
            daily = pd.read_pickle(f)
            if 'TRANSACTION_ID' in daily.columns:
                loaded.append(daily)
            else:
                print(f"Warning: TRANSACTION_ID missing in {f}. Skipping file.")
        except Exception as e:
            print(f"Error reading file {f}: {e}")

    if not loaded:
        print("No dataframes were successfully loaded.")
        return pd.DataFrame()

    merged = pd.concat(loaded).sort_values('TRANSACTION_ID').reset_index(drop=True)
    # Note: -1 are missing values for real world data
    merged = merged.replace([-1], 0)

    # Make sure TX_DATETIME is a datetime column when it is present.
    has_raw_datetime = ('TX_DATETIME' in merged.columns
                        and not pd.api.types.is_datetime64_any_dtype(merged['TX_DATETIME']))
    if has_raw_datetime:
        try:
            merged['TX_DATETIME'] = pd.to_datetime(merged['TX_DATETIME'])
        except Exception as e:
            print(f"Warning: Could not convert TX_DATETIME to datetime: {e}")

    return merged


# ### save_object
#

# In[ ]:


# Save object as pickle file
def save_object(obj, filename):
    """Pickle obj into filename using the highest available pickle protocol.

    The file is opened in binary write mode, so an existing file is overwritten.
    """
    with open(filename, 'wb') as output:
        pickle.Pickler(output, pickle.HIGHEST_PROTOCOL).dump(obj)


# ## Data preprocessing

# ### scaleData
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# NOTE: This function is NOT used when using sklearn.pipeline.Pipeline, as the pipeline handles scaling internally during fit/transform.
# It's kept here only for strict adherence to the original notebook's function definitions.

# In[ ]:


def scaleData(train,test,features):
    """Standardize the given feature columns of a train and a test set.

    A StandardScaler is fitted on ``train[features]`` only and then applied to
    both sets, so the test set is scaled with the training statistics.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        features: Column names to scale.

    Returns:
        Tuple ``(train_scaled, test_scaled)`` of copies of the inputs with the
        feature columns replaced by their scaled values. The input DataFrames
        are left untouched. The fitted scaler is not returned.
    """
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(train[features])

    # Work on copies so the callers' DataFrames are not modified.
    train_scaled, test_scaled = train.copy(), test.copy()
    for scaled, raw in ((train_scaled, train), (test_scaled, test)):
        scaled[features] = scaler.transform(raw[features])

    return (train_scaled, test_scaled)


# ## Train/Test splitting strategies

# ### get_train_test_set
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# Sampling ratio added in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


def get_train_test_set(transactions_df,
                       start_date_training,
                       delta_train=7,delta_delay=7,delta_test=7,
                       sampling_ratio=1.0,
                       random_state=0):
    """Split transactions into a training set and a delayed test set.

    The training set covers ``delta_train`` days starting at
    ``start_date_training`` (selected on TX_DATETIME). The test set covers the
    ``delta_test`` days that follow the training and delay periods (selected on
    TX_TIME_DAYS). Transactions of customers already known to be defrauded are
    removed from each test day.

    Args:
        transactions_df: DataFrame with columns TX_DATETIME, TX_TIME_DAYS,
            CUSTOMER_ID, TX_FRAUD and TRANSACTION_ID.
        start_date_training: datetime at which the training period starts.
        delta_train: Length of the training period, in days.
        delta_delay: Length of the delay period, in days.
        delta_test: Length of the test period, in days.
        sampling_ratio: When below 1, fraction of the fraudulent and of the
            genuine training transactions that is kept (sampled separately).
        random_state: Seed used for the subsampling.

    Returns:
        Tuple ``(train_df, test_df)``, both sorted by TRANSACTION_ID. When the
        training period holds no transaction, an empty training set and an
        empty test set (same columns as the input) are returned.

    Raises:
        ValueError: If a required column is missing or start_date_training is
            not a datetime object.
    """
    required_columns = ('TX_DATETIME', 'TX_TIME_DAYS', 'CUSTOMER_ID', 'TX_FRAUD', 'TRANSACTION_ID')
    if any(column not in transactions_df.columns for column in required_columns):
        raise ValueError("Missing required columns in transactions_df for get_train_test_set")
    if not isinstance(start_date_training, datetime.datetime):
        raise ValueError("start_date_training must be a datetime object")

    # Training set: [start_date_training, start_date_training + delta_train)
    end_date_training = start_date_training + datetime.timedelta(days=delta_train)
    in_training_period = ((transactions_df.TX_DATETIME >= start_date_training) &
                          (transactions_df.TX_DATETIME < end_date_training))
    train_df = transactions_df[in_training_period]

    if train_df.empty:
        print(f"Warning: Training period starting {start_date_training.strftime('%Y-%m-%d')} is empty.")
        # Return empty DataFrames matching expected columns
        return (train_df.copy(), pd.DataFrame(columns=transactions_df.columns))

    # Relative day numbers are easier than TX_DATETIME for walking the test period.
    first_delay_day = train_df.TX_TIME_DAYS.min() + delta_train
    first_test_day = first_delay_day + delta_delay

    # Customers with a fraud in the training set are known from the start.
    known_defrauded_customers = set(train_df.loc[train_df.TX_FRAUD == 1, 'CUSTOMER_ID'])

    daily_test_sets = []
    for day in range(delta_test):
        # Frauds of day (first_delay_day + day - 1) are considered known on this
        # test day; their customers join the pool of known defrauded customers.
        revealed_df = transactions_df[transactions_df.TX_TIME_DAYS == first_delay_day + day - 1]
        known_defrauded_customers |= set(revealed_df.loc[revealed_df.TX_FRAUD == 1, 'CUSTOMER_ID'])

        # Keep only the test-day transactions of customers not yet known as defrauded.
        candidates_df = transactions_df[transactions_df.TX_TIME_DAYS == first_test_day + day]
        daily_test_sets.append(candidates_df[~candidates_df.CUSTOMER_ID.isin(known_defrauded_customers)])

    if daily_test_sets:
        test_df = pd.concat(daily_test_sets)
    else:
        print(f"Warning: Test period for training start {start_date_training.strftime('%Y-%m-%d')} resulted in an empty set after filtering.")
        test_df = pd.DataFrame(columns=transactions_df.columns)

    # Optional subsampling of the training set, frauds first, then genuine.
    if sampling_ratio < 1:
        sampled_parts = [
            train_df[train_df.TX_FRAUD == label].sample(frac=sampling_ratio, random_state=random_state)
            for label in (1, 0)
        ]
        train_df = pd.concat(sampled_parts)

    # Both sets are returned in ascending order of transaction ID.
    return (train_df.sort_values('TRANSACTION_ID'), test_df.sort_values('TRANSACTION_ID'))



# In[ ]:

# This function was present in the original shared functions but NOT explicitly called in the main flow of model_building_fraud.ipynb.
# Keeping it here for strict adherence to the shared function definitions.
def get_train_delay_test_set(transactions_df,
                             start_date_training,
                             delta_train=7,delta_delay=7,delta_test=7,
                             sampling_ratio=1.0,
                             random_state=0):
    """Split transactions into training, delay and test sets.

    Same splitting logic as get_train_test_set, with the transactions of the
    delay period returned as well. No input validation is performed here.

    Args:
        transactions_df: DataFrame with columns TX_DATETIME, TX_TIME_DAYS,
            CUSTOMER_ID, TX_FRAUD and TRANSACTION_ID.
        start_date_training: datetime at which the training period starts.
        delta_train: Length of the training period, in days.
        delta_delay: Length of the delay period, in days.
        delta_test: Length of the test period, in days.
        sampling_ratio: When below 1, fraction of the fraudulent and of the
            genuine training transactions that is kept (sampled separately).
        random_state: Seed used for the subsampling.

    Returns:
        Tuple ``(train_df, delay_df, test_df)``. The training and test sets are
        sorted by TRANSACTION_ID; the delay set keeps the input order. When the
        training period holds no transaction, the test set is empty.
    """
    tx_datetime = transactions_df.TX_DATETIME
    end_date_training = start_date_training + datetime.timedelta(days=delta_train)
    end_date_delay = end_date_training + datetime.timedelta(days=delta_delay)

    # Training set: [start, start + delta_train)
    train_df = transactions_df[(tx_datetime >= start_date_training) & (tx_datetime < end_date_training)]
    # Delay set: the delta_delay days right after the training period
    delay_df = transactions_df[(tx_datetime >= end_date_training) & (tx_datetime < end_date_delay)]

    if train_df.empty:
        print(f"Warning (get_train_delay_test_set): Training period starting {start_date_training.strftime('%Y-%m-%d')} is empty.")
        return (train_df.copy(), delay_df.copy(), pd.DataFrame(columns=transactions_df.columns))

    first_delay_day = train_df.TX_TIME_DAYS.min() + delta_train
    first_test_day = first_delay_day + delta_delay

    # Customers with a fraud in the training set are known from the start.
    known_defrauded_customers = set(train_df.loc[train_df.TX_FRAUD == 1, 'CUSTOMER_ID'])

    daily_test_sets = []
    for day in range(delta_test):
        # Frauds of day (first_delay_day + day - 1) are considered known on this test day.
        revealed_df = transactions_df[transactions_df.TX_TIME_DAYS == first_delay_day + day - 1]
        known_defrauded_customers |= set(revealed_df.loc[revealed_df.TX_FRAUD == 1, 'CUSTOMER_ID'])

        candidates_df = transactions_df[transactions_df.TX_TIME_DAYS == first_test_day + day]
        daily_test_sets.append(candidates_df[~candidates_df.CUSTOMER_ID.isin(known_defrauded_customers)])

    if daily_test_sets:
        test_df = pd.concat(daily_test_sets)
    else:
        test_df = pd.DataFrame(columns=transactions_df.columns)

    # Optional subsampling of the training set, frauds first, then genuine.
    if sampling_ratio < 1:
        sampled_parts = [
            train_df[train_df.TX_FRAUD == label].sample(frac=sampling_ratio, random_state=random_state)
            for label in (1, 0)
        ]
        train_df = pd.concat(sampled_parts)

    # Training and test sets are returned in ascending order of transaction ID.
    return (train_df.sort_values('TRANSACTION_ID'), delay_df, test_df.sort_values('TRANSACTION_ID'))


# ### prequentialSplit
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


def prequentialSplit(transactions_df,
                     start_date_training,
                     n_folds=4,
                     delta_train=7,
                     delta_delay=7,
                     delta_assessment=7):
    """Build the (train, assessment) index pairs of a prequential validation.

    Fold ``i`` starts its training period ``i * delta_assessment`` days before
    ``start_date_training``; each fold is split with get_train_test_set.

    Args:
        transactions_df: Transactions, as expected by get_train_test_set.
        start_date_training: Training start date of the most recent fold.
        n_folds: Number of folds to generate.
        delta_train: Length of the training period, in days.
        delta_delay: Length of the delay period, in days.
        delta_assessment: Length of the assessment (test) period, in days.

    Returns:
        List of ``(train_indices, test_indices)`` tuples holding the index
        labels of each fold. Folds with an empty training or assessment set are
        skipped, so the list may hold fewer than ``n_folds`` entries.
    """
    prequential_split_indices = []

    for fold in range(n_folds):
        # Each fold is shifted back in time by one assessment period.
        start_date_training_fold = start_date_training - datetime.timedelta(days=fold*delta_assessment)

        train_df, test_df = get_train_test_set(transactions_df,
                                               start_date_training=start_date_training_fold,
                                               delta_train=delta_train,
                                               delta_delay=delta_delay,
                                               delta_test=delta_assessment)

        # A fold is only usable when both of its sets hold transactions.
        if train_df.empty or test_df.empty:
            print(f"Warning (prequentialSplit): Fold {fold} generated empty train ({train_df.shape}) or test ({test_df.shape}) set for start date {start_date_training_fold.strftime('%Y-%m-%d')}. Skipping fold.")
            continue

        prequential_split_indices.append((list(train_df.index), list(test_df.index)))

    if not prequential_split_indices:
        print(f"Warning (prequentialSplit): No valid folds generated for start date {start_date_training.strftime('%Y-%m-%d')} and {n_folds} folds.")

    return prequential_split_indices


# ## Predictions functions

# ### fit_model_and_get_predictions
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# NOTE: Less used with Pipelines/GridSearchCV, kept for original function adherence.

# In[ ]:


def fit_model_and_get_predictions(classifier, train_df, test_df,
                                  input_features, output_feature="TX_FRAUD",scale=True):
    """Fit a classifier and collect its fraud probabilities and timings.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict_proba``.
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        input_features: Names of the input columns.
        output_feature: Name of the label column.
        scale: When True, the input features are standardized with scaleData
            before fitting.

    Returns:
        Dictionary with the fitted ``classifier``, ``predictions_test`` and
        ``predictions_train`` (second column of ``predict_proba``), plus
        ``training_execution_time`` and ``prediction_execution_time`` in
        seconds. The prediction time covers the test set only.
    """
    # Copies keep the callers' DataFrames untouched.
    train_data, test_data = train_df.copy(), test_df.copy()
    if scale:
        train_data, test_data = scaleData(train_data, test_data, input_features)

    # Training, timed
    fit_started = time.time()
    classifier.fit(train_data[input_features], train_data[output_feature])
    training_execution_time = time.time() - fit_started

    # Prediction on the test set, timed
    predict_started = time.time()
    predictions_test = classifier.predict_proba(test_data[input_features])[:, 1]
    prediction_execution_time = time.time() - predict_started

    # Prediction on the training set (not included in the timing)
    predictions_train = classifier.predict_proba(train_data[input_features])[:, 1]

    return {
        'classifier': classifier,
        'predictions_test': predictions_test,
        'predictions_train': predictions_train,
        'training_execution_time': training_execution_time,
        'prediction_execution_time': prediction_execution_time,
    }


# In[ ]:





# ## Performance assessment

# ### card_precision_top_k_day
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# Detailed in [Chapter 4, Precision_top_K_Metrics](Precision_Top_K_Metrics).

# In[ ]:


def card_precision_top_k_day(df_day,top_k):
    """Compute the card precision top-k for the transactions of one day.

    Each customer is represented by the maximum of every column over their
    transactions of the day; customers are then ranked by decreasing
    prediction and the ``top_k`` first ones are inspected.

    Args:
        df_day: DataFrame with columns CUSTOMER_ID, predictions and TX_FRAUD.
        top_k: Number of most suspicious customers to inspect.

    Returns:
        Tuple ``(detected_cards, precision)``: the CUSTOMER_IDs among the top k
        that are fraudulent, and their count divided by ``top_k``. Returns
        ``([], 0.0)`` when a required column is missing or ``df_day`` is empty;
        the precision is 0.0 when ``top_k`` is not positive.
    """
    needed_columns = ('CUSTOMER_ID', 'predictions', 'TX_FRAUD')
    if any(column not in df_day.columns for column in needed_columns):
        print("Warning (card_precision_top_k_day): Missing required columns. Returning empty list and 0.")
        return [], 0.0

    if df_day.empty:
        return [], 0.0

    # One row per customer: max prediction AND max TX_FRAUD label of the day.
    per_customer = df_day.groupby('CUSTOMER_ID').max()
    ranked = per_customer.sort_values(by="predictions", ascending=False).reset_index(drop=False)

    # The k most suspicious cards, and those among them that are compromised.
    most_suspicious = ranked.head(top_k)
    detected_cards = list(most_suspicious.loc[most_suspicious.TX_FRAUD == 1, 'CUSTOMER_ID'])

    precision = len(detected_cards) / top_k if top_k > 0 else 0.0

    return detected_cards, precision


# ### card_precision_top_k
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# Detailed in [Chapter 4, Precision_top_K_Metrics](Precision_Top_K_Metrics).

# In[ ]:


def card_precision_top_k(predictions_df, top_k, remove_detected_compromised_cards=True):
    """Compute the daily and mean card precision top-k over several days.

    Days are processed in increasing TX_TIME_DAYS order. When
    ``remove_detected_compromised_cards`` is True, cards detected on one day
    are excluded from the following days.

    Args:
        predictions_df: DataFrame with columns TX_TIME_DAYS, CUSTOMER_ID,
            predictions and TX_FRAUD.
        top_k: Number of most suspicious cards inspected each day.
        remove_detected_compromised_cards: Whether detected cards are removed
            from the subsequent days.

    Returns:
        Tuple ``(nb_compromised_cards_per_day, card_precision_top_k_per_day_list,
        mean_card_precision_top_k)``. A day left without transactions after the
        removal counts as 0 compromised cards and a precision of 0.0. The mean
        is 0.0 when there is no day at all.

    Raises:
        ValueError: If a required column is missing.
    """
    needed_columns = ('TX_TIME_DAYS', 'CUSTOMER_ID', 'predictions', 'TX_FRAUD')
    if any(column not in predictions_df.columns for column in needed_columns):
        raise ValueError("Missing required columns in predictions_df for card_precision_top_k")

    # Cards detected so far; empty at the start.
    already_detected = []
    nb_compromised_cards_per_day = []
    card_precision_top_k_per_day_list = []

    for day in sorted(predictions_df['TX_TIME_DAYS'].unique()):
        # Transactions of the day, restricted to the columns the daily metric needs.
        day_rows = predictions_df.loc[predictions_df['TX_TIME_DAYS'] == day,
                                      ['predictions', 'CUSTOMER_ID', 'TX_FRAUD']]

        # Drop the cards that were already detected on previous days.
        day_rows = day_rows[~day_rows.CUSTOMER_ID.isin(already_detected)]

        if day_rows.empty:
            # Nothing left to rank for this day.
            nb_compromised_cards_per_day.append(0)
            card_precision_top_k_per_day_list.append(0.0)
            continue

        compromised_today = day_rows[day_rows.TX_FRAUD == 1].CUSTOMER_ID.unique()
        nb_compromised_cards_per_day.append(len(compromised_today))

        detected_today, precision_today = card_precision_top_k_day(day_rows, top_k)
        card_precision_top_k_per_day_list.append(precision_today)

        if remove_detected_compromised_cards:
            already_detected.extend(detected_today)

    if card_precision_top_k_per_day_list:
        mean_card_precision_top_k = np.array(card_precision_top_k_per_day_list).mean()
    else:
        mean_card_precision_top_k = 0.0

    # Returns precision top k per day as a list, and resulting mean
    return nb_compromised_cards_per_day, card_precision_top_k_per_day_list, mean_card_precision_top_k


# ### card_precision_top_k_custom
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).
# This is the scorer function for GridSearchCV.

# In[ ]:


def card_precision_top_k_custom(y_true, y_pred, top_k, transactions_df):
    """Scorer: mean card precision top-k for the transactions of one fold.

    The fold is identified by the index of ``y_true``; the matching rows of
    ``transactions_df`` supply the columns that card_precision_top_k needs.

    Args:
        y_true: pandas Series of labels whose index identifies the fold rows.
        y_pred: Predicted scores, in the same order as ``y_true``.
        top_k: Number of most suspicious cards inspected each day.
        transactions_df: DataFrame holding all transactions.

    Returns:
        The mean card precision top-k of the fold. Returns 0.0 (with a printed
        warning) when the inputs have the wrong type, the lengths differ,
        ``transactions_df`` is empty, or no fold index is found in it.
    """
    inputs_have_expected_types = (isinstance(y_true, pd.Series)
                                  and isinstance(transactions_df, pd.DataFrame))
    if not inputs_have_expected_types:
        print("Warning (CP@k scorer): y_true must be a pandas Series and transactions_df a DataFrame.")
        return 0.0
    if len(y_pred) != len(y_true):
        print("Warning (CP@k scorer): y_pred and y_true have different lengths.")
        return 0.0
    if transactions_df.empty:
        print("Warning (CP@k scorer): transactions_df is empty.")
        return 0.0

    # Only fold indices that exist in the main transactions table can be scored.
    current_fold_indices = y_true.index
    valid_indices = current_fold_indices.intersection(transactions_df.index)
    if valid_indices.empty:
        print(f"Warning (CP@k scorer): No matching indices found in transactions_df for the current fold ({len(current_fold_indices)} indices).")
        return 0.0

    # Attach the predictions to the fold's transactions, aligned on the index.
    fold_predictions = pd.Series(y_pred, index=current_fold_indices)
    predictions_df = transactions_df.loc[valid_indices].copy()
    predictions_df['predictions'] = fold_predictions.loc[valid_indices]

    # Only the mean of the per-day precisions is used as the score.
    _, _, mean_card_precision_top_k = card_precision_top_k(predictions_df, top_k)
    return mean_card_precision_top_k


# ### performance_assessment
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).

# In[ ]:


def performance_assessment(predictions_df, output_feature='TX_FRAUD',
                           prediction_feature='predictions', top_k_list=[100],
                           rounded=True):
    """Compute AUC ROC, average precision and card precision top-k.

    Args:
        predictions_df: DataFrame holding the labels and the predictions. The
            card precision also needs TX_TIME_DAYS and CUSTOMER_ID.
        output_feature: Name of the label column.
        prediction_feature: Name of the prediction column.
        top_k_list: Values of k for which 'Card Precision@k' is computed. The
            default list is never modified.
        rounded: When True, the results are rounded to 3 decimals.

    Returns:
        One-row DataFrame with columns 'AUC ROC', 'Average precision' and one
        'Card Precision@k' column per k. AUC ROC and average precision are NaN
        when the labels hold a single class or the computation fails; a card
        precision is NaN when its columns are missing or its computation fails.

    Raises:
        ValueError: If the label or the prediction column is missing.
    """
    available_columns = predictions_df.columns
    if not (output_feature in available_columns and prediction_feature in available_columns):
        raise ValueError(f"Missing required columns ('{output_feature}', '{prediction_feature}') in predictions_df for performance_assessment")

    y_true = predictions_df[output_feature]
    y_pred_proba = predictions_df[prediction_feature]

    # Both metrics are undefined with a single class, so they default to NaN.
    AUC_ROC, AP = np.nan, np.nan
    if len(y_true.unique()) <= 1:
        print("Warning (performance_assessment): Only one class present in y_true. AUC ROC and Average Precision are not defined.")
    else:
        try:
            AUC_ROC = metrics.roc_auc_score(y_true, y_pred_proba)
            AP = metrics.average_precision_score(y_true, y_pred_proba)
        except ValueError as e:
            print(f"Warning (performance_assessment): ValueError calculating AUC/AP: {e}")

    performances = pd.DataFrame({'AUC ROC': [AUC_ROC], 'Average precision': [AP]})

    # The columns needed by card_precision_top_k do not depend on k.
    cpk_cols_present = all(col in predictions_df.columns for col in ['TX_TIME_DAYS', 'CUSTOMER_ID'])

    for top_k in top_k_list:
        metric_name = 'Card Precision@'+str(top_k)

        if not cpk_cols_present:
            print(f"Warning (performance_assessment): Skipping CP@{top_k} calculation due to missing columns (TX_TIME_DAYS, CUSTOMER_ID).")
            performances[metric_name] = np.nan
            continue

        # Best effort: a failing CP@k is reported as NaN instead of aborting.
        try:
            _, _, mean_card_precision_top_k = card_precision_top_k(predictions_df, top_k)
            performances[metric_name] = mean_card_precision_top_k
        except Exception as e:
            print(f"Warning (performance_assessment): Error calculating CP@{top_k}: {e}")
            performances[metric_name] = np.nan

    return performances.round(3) if rounded else performances


# ### performance_assessment_model_collection
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# NOTE: Less relevant with MLflow logging, kept for original function adherence.

# In[ ]:


def performance_assessment_model_collection(fitted_models_and_predictions_dictionary,
                                            transactions_df,
                                            type_set='test',
                                            top_k_list=[100]):
    """Assess the performance of several fitted models on one data set.

    Args:
        fitted_models_and_predictions_dictionary: Maps a classifier name to a
            dictionary holding its predictions under 'predictions_<type_set>'.
        transactions_df: Transactions the predictions refer to.
        type_set: Suffix of the predictions key to use (default 'test').
        top_k_list: Values of k passed to performance_assessment.

    Returns:
        DataFrame with one row per assessed classifier (indexed by classifier
        name). Classifiers without the requested predictions, or whose
        assessment raises, are reported with a printed warning and left out.
    """
    performances = pd.DataFrame()

    for classifier_name, model_and_predictions in fitted_models_and_predictions_dictionary.items():

        # Work on a copy so the predictions column never leaks into the input.
        predictions_df = transactions_df.copy()
        pred_key = 'predictions_'+type_set

        if pred_key in model_and_predictions:
            predictions_df['predictions'] = model_and_predictions[pred_key]

            # Best effort: a failing model is reported and the others still run.
            try:
                performances_model = performance_assessment(predictions_df,
                                                            output_feature='TX_FRAUD',
                                                            prediction_feature='predictions',
                                                            top_k_list=top_k_list)
                performances_model.index = [classifier_name]
                performances = pd.concat([performances, performances_model])
            except ValueError as e:
                print(f"Warning (perf_assess_collection): Error assessing '{classifier_name}' ({type_set}): {e}")
            except Exception as e:
                print(f"Warning (perf_assess_collection): Unexpected error assessing '{classifier_name}' ({type_set}): {e}")
        else:
            print(f"Warning (perf_assess_collection): Predictions '{pred_key}' not found for '{classifier_name}'. Skipping.")

    return performances


# ### execution_times_model_collection
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).
# NOTE: Less relevant with MLflow logging, kept for original function adherence.

# In[ ]:


def execution_times_model_collection(fitted_models_and_predictions_dictionary):
    """Collect the training and prediction times of several fitted models.

    Args:
        fitted_models_and_predictions_dictionary: Maps a classifier name to a
            dictionary that may hold 'training_execution_time' and
            'prediction_execution_time'.

    Returns:
        DataFrame indexed by classifier name with columns 'Training execution
        time' and 'Prediction execution time'; a missing timing is NaN. The
        DataFrame is empty when the input dictionary is empty.
    """
    execution_times = pd.DataFrame()

    for classifier_name, model_and_predictions in fitted_models_and_predictions_dictionary.items():
        timings_row = pd.DataFrame(
            {
                'Training execution time': [model_and_predictions.get('training_execution_time', np.nan)],
                'Prediction execution time': [model_and_predictions.get('prediction_execution_time', np.nan)],
            },
            index=[classifier_name],
        )
        # Rows are stacked in the iteration order of the dictionary.
        execution_times = pd.concat([execution_times, timings_row])

    return execution_times


# ### get_class_from_fraud_probability
#
# First use in [Chapter 4, Threshold Based Metrics](Threshold_Based_Metrics).

# In[ ]:


# Getting classes from a vector of fraud probabilities and a threshold
def get_class_from_fraud_probability(fraud_probabilities, threshold=0.5):
    """Turn fraud probabilities into 0/1 classes with a decision threshold.

    Args:
        fraud_probabilities: Iterable of fraud probabilities.
        threshold: Probabilities strictly below it map to class 0, all other
            values to class 1.

    Returns:
        List of predicted classes (0 or 1), one per input probability.
    """
    # "not <" rather than ">=": a value that is not strictly below the
    # threshold is classified as 1.
    return [int(not fraud_probability < threshold)
            for fraud_probability in fraud_probabilities]


# ### threshold_based_metrics
#
# First use in [Chapter 4, Threshold Based Metrics](Threshold_Based_Metrics).

# In[ ]:


def threshold_based_metrics(fraud_probabilities, true_label, thresholds_list):
    """Compute confusion-matrix based metrics for each decision threshold.

    For every threshold the probabilities are binarized with
    ``get_class_from_fraud_probability`` and a 2x2 confusion matrix (labels
    fixed to [0, 1]) is derived from ``true_label``.

    Args:
        fraud_probabilities: Iterable of predicted fraud scores.
        true_label: Ground-truth labels aligned with ``fraud_probabilities``.
        thresholds_list: Iterable of thresholds to evaluate.

    Returns:
        pd.DataFrame: One row per threshold with the columns 'Threshold',
        'MME', 'TPR', 'TNR', 'FPR', 'FNR', 'BER', 'G-mean', 'Precision',
        'NPV', 'FDR', 'FOR' and 'F1 Score'. Every ratio whose denominator is
        not positive is reported as 0.
    """
    column_names = ['Threshold' ,'MME', 'TPR', 'TNR', 'FPR', 'FNR', 'BER', 'G-mean', 'Precision', 'NPV', 'FDR', 'FOR', 'F1 Score']

    def ratio_or_zero(numerator, denominator):
        # Single convention for all rates: a non-positive denominator gives 0
        return numerator/denominator if denominator > 0 else 0

    rows = []

    for threshold in thresholds_list:

        predicted_classes = get_class_from_fraud_probability(fraud_probabilities, threshold=threshold)

        try:
            # labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted
            confusion = metrics.confusion_matrix(true_label, predicted_classes, labels=[0, 1])
            TN, FP, FN, TP = confusion.ravel()
        except ValueError as e:
            print(f"Warning (threshold_metrics): Could not calculate confusion matrix for threshold {threshold}: {e}")
            # Fall back to all-zero counts so every metric below becomes 0
            TN, FP, FN, TP = 0, 0, 0, 0

        actual_positives = TP+FN
        actual_negatives = TN+FP
        predicted_positives = TP+FP
        predicted_negatives = TN+FN

        MME = ratio_or_zero(FP+FN, TN+FP+FN+TP)

        TPR = ratio_or_zero(TP, actual_positives)
        TNR = ratio_or_zero(TN, actual_negatives)
        FPR = ratio_or_zero(FP, actual_negatives)
        FNR = ratio_or_zero(FN, actual_positives)

        BER = 1/2*(FPR+FNR)
        Gmean = np.sqrt(TPR*TNR)

        precision = ratio_or_zero(TP, predicted_positives)
        FDR = ratio_or_zero(FP, predicted_positives)

        NPV = ratio_or_zero(TN, predicted_negatives)
        FOR = ratio_or_zero(FN, predicted_negatives)

        if (precision+TPR) > 0:
            F1_score = 2*(precision*TPR)/(precision+TPR)
        else:
            F1_score = 0

        rows.append([threshold, MME, TPR, TNR, FPR, FNR, BER, Gmean, precision, NPV, FDR, FOR, F1_score])

    return pd.DataFrame(rows, columns=column_names)


# In[ ]:


# === Summary and Plotting Functions (Copied from Original) ===
def _numeric_column(performances_df, column_name):
    """Return `column_name` coerced to numeric (unparsable entries become NaN)."""
    return pd.to_numeric(performances_df[column_name], errors='coerce')


def _std_at(performances_df, std_column_name, position):
    """Return the std stored at `position`, or 0.0 when the std column is absent."""
    if std_column_name not in performances_df.columns:
        return 0.0
    return _numeric_column(performances_df, std_column_name).iloc[position]


def _parameter_at(performances_df, parameter_column_name, position):
    """Return the parameter label at `position`, or 'N/A' without that column."""
    if parameter_column_name not in performances_df.columns:
        return 'N/A'
    return performances_df[parameter_column_name].iloc[position]


def _format_mean_std(mean_value, std_value):
    """Render 'mean +/- std' (3 and 2 decimals), or 'NaN' when the mean is missing."""
    return f"{mean_value:.3f} +/- {std_value:.2f}" if not pd.isna(mean_value) else 'NaN'


def get_summary_performances(performances_df, parameter_column_name="Parameters summary"):
    """Summarize model-selection results for a fixed set of metrics.

    For each of 'AUC ROC', 'Average precision' and 'Card Precision@100' the
    function reads the columns '<metric> Validation', '<metric> Test' and the
    matching ' Std' columns, then reports the parameters that maximize the
    validation score and the parameters that maximize the test score.

    Args:
        performances_df: DataFrame with one row per parameter set. Its index
            is reset in place (``reset_index(drop=True, inplace=True)``), so
            the caller's frame is renumbered 0..n-1.
        parameter_column_name: Column holding the parameter description.
            When absent, parameters are reported as 'N/A'.

    Returns:
        pd.DataFrame: Columns are the three metrics; rows are
        'Best estimated parameters', 'Validation performance',
        'Test performance', 'Optimal parameter(s)' and
        'Optimal test performance'. Performances are strings of the form
        'mean +/- std', 'NaN' when the score is missing, or 'N/A' when the
        required columns do not exist (or the input frame is empty).

    Notes:
        A missing ' Std' column is reported as a std of 0.0. It used to raise
        AttributeError, because ``pd.to_numeric`` returns a scalar (which has
        no ``.iloc``) when handed the scalar default.
    """
    metrics_list = ['AUC ROC','Average precision','Card Precision@100'] # Assuming CP@100 was used
    performances_results=pd.DataFrame(columns=metrics_list)
    if performances_df.empty:
        print("Warning: Empty performance dataframe passed to get_summary_performances.")
        # Return the usual structure filled with N/A
        na_vals = ['N/A'] * len(metrics_list)
        for row_label in ("Best estimated parameters",
                          "Validation performance",
                          "Test performance",
                          "Optimal parameter(s)",
                          "Optimal test performance"):
            performances_results.loc[row_label]=na_vals
        return performances_results

    # After the reset, idxmax() labels coincide with positions, which is what
    # the .iloc lookups in the helpers rely on.
    performances_df.reset_index(drop=True,inplace=True)

    # --- Parameters selected on the validation set ---
    best_estimated_parameters = []
    validation_performance = []
    test_performance = []

    for metric in metrics_list:
        val_metric_col = metric+' Validation'
        val_std_col = val_metric_col+' Std'
        test_metric_col = metric+' Test'
        test_std_col = test_metric_col+' Std'

        if val_metric_col not in performances_df.columns or test_metric_col not in performances_df.columns:
            print(f"Warning: Missing columns for metric {metric} in performance dataframe.")
            best_estimated_parameters.append('N/A')
            validation_performance.append('N/A')
            test_performance.append('N/A')
            continue

        # Coerce so that NaNs from failed grid-search fits are handled
        validation_scores = _numeric_column(performances_df, val_metric_col)
        test_scores = _numeric_column(performances_df, test_metric_col)

        if validation_scores.isna().all():
            print(f"Warning: All validation scores for metric {metric} are NaN.")
            # Fall back to the best test score, or row 0 if that is all NaN too
            best_index = 0 if test_scores.isna().all() else test_scores.idxmax()
            val_perf_str = 'NaN'
        else:
            best_index = validation_scores.idxmax()
            val_perf_str = _format_mean_std(validation_scores.iloc[best_index],
                                            _std_at(performances_df, val_std_col, best_index))

        test_perf_str = _format_mean_std(test_scores.iloc[best_index],
                                         _std_at(performances_df, test_std_col, best_index))

        best_estimated_parameters.append(_parameter_at(performances_df, parameter_column_name, best_index))
        validation_performance.append(val_perf_str)
        test_performance.append(test_perf_str)

    performances_results.loc["Best estimated parameters"]=best_estimated_parameters
    performances_results.loc["Validation performance"]=validation_performance
    performances_results.loc["Test performance"]=test_performance

    # --- Parameters that are optimal on the test set ---
    optimal_test_performance = []
    optimal_parameters = []
    for metric_base in metrics_list:
        test_metric_col = metric_base+' Test'
        test_std_col = test_metric_col+' Std'

        if test_metric_col not in performances_df.columns:
            optimal_parameters.append('N/A')
            optimal_test_performance.append('N/A')
            continue

        test_scores = _numeric_column(performances_df, test_metric_col)
        if test_scores.isna().all():
            print(f"Warning: All test scores for metric {metric_base} are NaN.")
            optimal_index = 0
            opt_test_perf_str = 'NaN'
        else:
            optimal_index = test_scores.idxmax()
            opt_test_perf_str = _format_mean_std(test_scores.iloc[optimal_index],
                                                 _std_at(performances_df, test_std_col, optimal_index))

        optimal_parameters.append(_parameter_at(performances_df, parameter_column_name, optimal_index))
        optimal_test_performance.append(opt_test_perf_str)

    performances_results.loc["Optimal parameter(s)"]=optimal_parameters
    performances_results.loc["Optimal test performance"]=optimal_test_performance

    return performances_results


# ### model_selection_performances
#
# First use in [Chapter 5, Model Selection](Model_Selection).

# In[ ]:


def _comparable_parameters_summary(param_str):
    """Normalize a 'k1=v1, k2=v2' summary so that item order does not matter."""
    try:
        # Basic parsing, assumes 'key=value' items separated by commas
        return ", ".join(sorted(item.strip() for item in param_str.split(',')))
    except (AttributeError, TypeError):
        # Not a string (e.g. NaN): return it unchanged so it never matches a default
        return param_str


def _parse_mean_std(summary_cell):
    """Parse a 'mean +/- std' summary cell; anything else gives (nan, nan)."""
    if isinstance(summary_cell, str) and '+/-' in summary_cell:
        try:
            mean_value, std_value = map(float, summary_cell.split("+/-"))
        except ValueError:
            return np.nan, np.nan
        return mean_value, std_value
    # 'N/A', 'NaN' or a non-string cell
    return np.nan, np.nan


def model_selection_performances(performances_df_dictionary,
                                 performance_metric='AUC ROC',
                                 model_classes=['Decision Tree', 'Logistic Regression', 'Random Forest', 'XGBoost'],
                                 # ** ALIGNED WITH ORIGINAL NOTEBOOK DEFAULTS USED IN COMPARISON PLOT **
                                 default_parameters_dictionary={
                                     "Decision Tree": 'max_depth=None, random_state=0', # Original compared against max_depth=None
                                     "Logistic Regression": 'C=1.0, random_state=0', # Original compared against C=1.0
                                     "Random Forest": "max_depth=None, n_estimators=100, n_jobs=1, random_state=0", # Original compared against these
                                     "XGBoost": "eval_metric=logloss, learning_rate=0.3, max_depth=6, n_estimators=100, n_jobs=1, random_state=0, use_label_encoder=False, verbosity=0" # Original compared against these
                                 }):
    """Collect test performances for default, best-validation and optimal parameters.

    For every model class three test-set figures are gathered for
    ``performance_metric``: the row whose 'Parameters summary' matches the
    model's default parameter string, the row selected on validation, and the
    row that is best on the test set. The last two come from
    ``get_summary_performances`` and are parsed back from its
    'mean +/- std' strings.

    Args:
        performances_df_dictionary: Mapping from model class name to its
            performance DataFrame.
        performance_metric: Metric display name, e.g. 'AUC ROC'.
        model_classes: Model class names to report, in output order. The
            default list is only read, never modified.
        default_parameters_dictionary: Mapping from model class name to its
            default parameters written as 'key=value, key=value'. Only read.

    Returns:
        tuple: ``(mean_performances_dictionary, std_performances_dictionary)``.
        Both map 'Default parameters', 'Best validation parameters' and
        'Optimal parameters' to a list with one float per model class;
        unavailable values are NaN.

    Notes:
        The comparable summaries are kept in a local Series; no helper
        column is added to the caller's DataFrame. A missing
        '<metric> Test' or '<metric> Test Std' column yields NaN rather than
        a KeyError.
    """
    default_strings_comparable = {model: _comparable_parameters_summary(params)
                                  for model, params in default_parameters_dictionary.items()}

    series_names = ("Default parameters", "Best validation parameters", "Optimal parameters")
    mean_performances_dictionary = {series_name: [] for series_name in series_names}
    std_performances_dictionary = {series_name: [] for series_name in series_names}

    def record(series_name, mean_value, std_value):
        # Keep the mean and std lists aligned, one entry per model class
        mean_performances_dictionary[series_name].append(mean_value)
        std_performances_dictionary[series_name].append(std_value)

    test_col = performance_metric+" Test"
    test_std_col = performance_metric+" Test Std"

    for model_class in model_classes:
        if model_class not in performances_df_dictionary or performances_df_dictionary[model_class].empty:
            print(f"Warning (model_sel_perf): No performance data for {model_class}. Skipping.")
            for series_name in series_names:
                record(series_name, np.nan, np.nan)
            continue

        performances_df = performances_df_dictionary[model_class]

        # --- Default parameters ---
        if 'Parameters summary' not in performances_df.columns or 'Parameters' not in performances_df.columns:
            print(f"Warning (model_sel_perf): Missing 'Parameters summary' or 'Parameters' column for {model_class}. Skipping default param lookup.")
            record("Default parameters", np.nan, np.nan)
        else:
            default_param_comparable_str = default_strings_comparable.get(model_class, 'N/A')
            # Normalize the stored summaries the same way as the defaults before matching
            comparable_summaries = performances_df['Parameters summary'].apply(_comparable_parameters_summary)
            default_performances = performances_df[comparable_summaries == default_param_comparable_str]

            if default_performances.empty:
                print(f"Warning (model_sel_perf): Default parameters comparable string '{default_param_comparable_str}' not found for {model_class} in 'Parameters summary comparable'. Appending NaN.")
                record("Default parameters", np.nan, np.nan)
            elif test_col not in default_performances.columns:
                print(f"Warning (model_sel_perf): Column '{test_col}' missing for {model_class}. Appending NaN for default parameters.")
                record("Default parameters", np.nan, np.nan)
            else:
                default_perf = pd.to_numeric(default_performances[test_col], errors='coerce').values[0]
                if test_std_col in default_performances.columns:
                    default_std = pd.to_numeric(default_performances[test_std_col], errors='coerce').values[0]
                else:
                    default_std = np.nan
                record("Default parameters", default_perf, default_std)

        # --- Best validation and optimal parameters, via the summary function ---
        performances_summary = get_summary_performances(performances_df, parameter_column_name="Parameters summary")
        if performance_metric in performances_summary.columns:
            record("Best validation parameters",
                   *_parse_mean_std(performances_summary.loc["Test performance", performance_metric]))
            record("Optimal parameters",
                   *_parse_mean_std(performances_summary.loc["Optimal test performance", performance_metric]))
        else:
            print(f"Warning (model_sel_perf): Metric '{performance_metric}' not found in summary columns for {model_class}.")
            record("Best validation parameters", np.nan, np.nan)
            record("Optimal parameters", np.nan, np.nan)

    return (mean_performances_dictionary,std_performances_dictionary)


# ## Model selection

# ### prequential_grid_search
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


# ### prequentialSplit_with_dates
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:
def prequentialSplit_with_dates(transactions_df,
                                start_date_training,
                                n_folds=4,
                                delta_train=7,
                                delta_delay=7,
                                delta_assessment=7):
    """
    Build prequential (train, test) index splits and print each fold's date ranges.

    Fold 0 trains from `start_date_training`; each later fold starts
    `delta_assessment` days earlier than the previous one. The rows of every
    fold are selected by `get_train_test_set`.

    Args:
        transactions_df (pd.DataFrame): Transactions, passed unchanged to
            `get_train_test_set`.
        start_date_training (datetime.datetime): Training start date of fold 0
            (the latest one); later folds go back in time.
        n_folds (int): Number of folds to attempt.
        delta_train (int): Length of the training period in days.
        delta_delay (int): Length of the delay period in days.
        delta_assessment (int): Length of the assessment (test) period in days.

    Returns:
        list: `(indices_train, indices_test)` tuples, one per valid fold, in a
              form usable as the `cv` argument of GridSearchCV. A fold is
              dropped when `get_train_test_set` raises or when either of its
              sets is empty; the list is empty when no fold survives.
        Prints: The train, delay and test date ranges of every fold, plus
                warnings for skipped folds.
    """
    def as_day(moment):
        return moment.strftime('%Y-%m-%d')

    one_day = datetime.timedelta(days=1)
    fold_splits = []

    print(f"\n--- Generating Prequential Folds (n_folds={n_folds}) ---")
    print(f"Base Start Date (Fold 0 Train Start): {as_day(start_date_training)}")
    print(f"Deltas: Train={delta_train}, Delay={delta_delay}, Assessment={delta_assessment}")
    print("-" * 60)

    for fold in range(n_folds):
        # Each fold starts one assessment period earlier than the previous one
        train_start = start_date_training - datetime.timedelta(days=fold * delta_assessment)

        # Period boundaries; each *_stop is exclusive (start of the next period)
        train_stop = train_start + datetime.timedelta(days=delta_train)
        delay_start = train_stop
        delay_stop = delay_start + datetime.timedelta(days=delta_delay)
        test_start = delay_stop
        test_stop = test_start + datetime.timedelta(days=delta_assessment)

        # Inclusive last days, only used to make the printout easier to read
        train_last_day = train_stop - one_day
        delay_last_day = delay_stop - one_day
        test_last_day = test_stop - one_day

        print(f"Fold {fold}:")
        print(f"  Train Period: {as_day(train_start)} to {as_day(train_last_day)} ({delta_train} days)")
        print(f"  Delay Period: {as_day(delay_start)} to {as_day(delay_last_day)} ({delta_delay} days)")
        print(f"  Test Period:  {as_day(test_start)} to {as_day(test_last_day)} ({delta_assessment} days)")

        # Row selection is delegated to the original helper
        try:
            (train_df, test_df) = get_train_test_set(transactions_df,
                                                   start_date_training=train_start,
                                                   delta_train=delta_train,
                                                   delta_delay=delta_delay,
                                                   delta_test=delta_assessment)
        except Exception as e:
            print(f"  -> ERROR calling get_train_test_set for fold {fold}: {e}")
            print(f"     Skipping fold {fold}.")
            print("-" * 10)
            continue

        if train_df.empty or test_df.empty:
            print(f"  -> Warning (prequentialSplit): Fold {fold} generated empty train ({train_df.shape}) or test ({test_df.shape}) set for start date {as_day(train_start)}. Skipping fold.")
        else:
            indices_train = list(train_df.index)
            indices_test = list(test_df.index)
            fold_splits.append((indices_train, indices_test))
            print(f"  -> Train size: {len(indices_train)}, Test size: {len(indices_test)}. Added fold indices.")
        print("-" * 10) # Separator between folds

    if not fold_splits:
        print(f"Warning (prequentialSplit): No valid folds generated for start date {as_day(start_date_training)} and {n_folds} folds.")

    print("--- Finished Generating Prequential Folds ---")
    return fold_splits
def _empty_prequential_results(performance_metrics_list, expe_type):
    """Build the zero-row results frame that every failure path returns."""
    cols = [f'{m} {expe_type}' for m in performance_metrics_list] + \
           [f'{m} {expe_type} Std' for m in performance_metrics_list] + \
           ['Parameters', 'Parameters summary', 'Execution time']
    return pd.DataFrame(columns=cols)


def _grid_params_to_summary(params):
    """Turn a GridSearchCV params dict into a sorted 'name=value, ...' string."""
    if not isinstance(params, dict):
        return str(params)
    try:
        # Drop the pipeline step prefix ('clf__') and sort for consistency
        items = [f"{k.split('__')[1]}={v}" for k, v in sorted(params.items())]
    except (IndexError, AttributeError, TypeError):
        # Key without a 'step__name' shape, or unsortable keys: use the raw repr
        return str(params)
    return ", ".join(items)


def prequential_grid_search(transactions_df,
                            classifier,
                            input_features, output_feature,
                            parameters, scoring,
                            start_date_training,
                            n_folds=4,
                            expe_type='Test',
                            delta_train=7,
                            delta_delay=7,
                            delta_assessment=7,
                            performance_metrics_list_grid=['roc_auc'],
                            performance_metrics_list=['AUC ROC'],
                            n_jobs=-1):
    """Run a grid search over prequential folds and tabulate the CV results.

    The classifier is wrapped in a StandardScaler -> classifier pipeline and
    evaluated with GridSearchCV (``refit=False``) on the folds produced by
    ``prequentialSplit_with_dates``.

    Args:
        transactions_df: DataFrame holding both features and label.
        classifier: Estimator used as the 'clf' pipeline step.
        input_features: Feature column names.
        output_feature: Label column name.
        parameters: Parameter grid passed to GridSearchCV.
        scoring: Scoring specification passed to GridSearchCV; must be
            non-empty.
        start_date_training: Training start date of fold 0.
        n_folds: Number of prequential folds.
        expe_type: Label used in the result column names (e.g. 'Test',
            'Validation').
        delta_train, delta_delay, delta_assessment: Period lengths in days.
        performance_metrics_list_grid: Scorer keys looked up in
            ``cv_results_`` as 'mean_test_<key>' / 'std_test_<key>'.
        performance_metrics_list: Display names, aligned by position with
            ``performance_metrics_list_grid``.
        n_jobs: Parallelism passed to GridSearchCV.

    Returns:
        pd.DataFrame: One row per parameter set with the columns
        '<name> <expe_type>', '<name> <expe_type> Std', 'Parameters',
        'Parameters summary' and 'Execution time'. On any failure (empty
        input, empty scoring, no valid fold, fit error) an empty frame
        with the same kind of columns is returned.

    Notes:
        The empty result is built by a helper on every failure path. The
        column list used to be assigned only inside branches that returned
        immediately, so the empty-scoring and fit-failure paths raised
        UnboundLocalError instead of returning.
    """
    import traceback  # only needed to report fit failures

    # Input validation
    if transactions_df.empty:
        print(f"ERROR (prequential_grid_search): Input transactions_df is empty for {expe_type}.")
        return _empty_prequential_results(performance_metrics_list, expe_type)
    if not scoring:
        print(f"ERROR (prequential_grid_search): scoring dictionary is empty for {expe_type}.")
        return _empty_prequential_results(performance_metrics_list, expe_type)

    estimators = [('scaler', sklearn.preprocessing.StandardScaler()), ('clf', classifier)]
    pipe = sklearn.pipeline.Pipeline(estimators)

    prequential_split_indices = prequentialSplit_with_dates(transactions_df,
                                                        start_date_training=start_date_training,
                                                        n_folds=n_folds,
                                                        delta_train=delta_train,
                                                        delta_delay=delta_delay,
                                                        delta_assessment=delta_assessment)

    # Without a single valid split GridSearchCV cannot run
    if not prequential_split_indices:
        print(f"ERROR (prequential_grid_search): No valid prequential splits for {expe_type} starting {start_date_training.strftime('%Y-%m-%d')}. Cannot run GridSearchCV.")
        return _empty_prequential_results(performance_metrics_list, expe_type)

    # refit=False as per original: only the CV results are of interest here
    grid_search = sklearn.model_selection.GridSearchCV(pipe, parameters, scoring=scoring, cv=prequential_split_indices, refit=False, n_jobs=n_jobs, return_train_score=False)

    X=transactions_df[input_features]
    y=transactions_df[output_feature]

    print(f"Starting GridSearchCV for {expe_type} set (Classifier: {classifier.__class__.__name__})...")
    try:
        # Fit directly on X; the pipeline handles scaling. No imputation is
        # done here on purpose, to stay aligned with the original notebook.
        grid_search.fit(X, y)
    except ValueError as ve:
        # Matches both "Input contains NaN" and the newer "Input X contains NaN"
        if 'contains NaN' in str(ve):
            print(f"ERROR (prequential_grid_search): GridSearchCV fit failed for {expe_type} due to NaNs in input data. StandardScaler cannot fit NaNs. Original notebook might have implicitly handled this earlier or used NaN-free data for this step.")
        else:
            print(f"ERROR (prequential_grid_search): GridSearchCV fit failed for {expe_type}: {ve}")
            traceback.print_exc()
        return _empty_prequential_results(performance_metrics_list, expe_type)
    except Exception as e:
        # Best-effort boundary: report the failure and hand back an empty result
        print(f"ERROR (prequential_grid_search): GridSearchCV fit failed unexpectedly for {expe_type}: {e}")
        traceback.print_exc()
        return _empty_prequential_results(performance_metrics_list, expe_type)

    print(f"Finished GridSearchCV for {expe_type} set.")

    performances_df=pd.DataFrame()

    # Extract results using grid keys, checking existence
    cv_results = grid_search.cv_results_
    for i, grid_key in enumerate(performance_metrics_list_grid):
        display_name = performance_metrics_list[i] # Assumes lists align
        mean_score_key = f'mean_test_{grid_key}'
        std_score_key = f'std_test_{grid_key}'

        if mean_score_key in cv_results:
            performances_df[f'{display_name} {expe_type}'] = cv_results[mean_score_key]
        else:
            print(f"Warning: Mean score key '{mean_score_key}' not found in cv_results_ for {expe_type}.")
            performances_df[f'{display_name} {expe_type}'] = np.nan

        if std_score_key in cv_results:
            performances_df[f'{display_name} {expe_type} Std'] = cv_results[std_score_key]
        else:
            print(f"Warning: Std score key '{std_score_key}' not found in cv_results_ for {expe_type}.")
            performances_df[f'{display_name} {expe_type} Std'] = np.nan

    if 'params' in cv_results:
        performances_df['Parameters']=cv_results['params']
        performances_df['Parameters summary'] = performances_df['Parameters'].apply(_grid_params_to_summary)
    else:
        print("Warning: 'params' key not found in cv_results_. Cannot add parameter columns.")
        # Add empty/NA columns to maintain structure
        performances_df['Parameters'] = [{} for _ in range(len(performances_df))] if not performances_df.empty else []
        performances_df['Parameters summary'] = 'N/A'

    if 'mean_fit_time' in cv_results:
        performances_df['Execution time']=cv_results['mean_fit_time']
    else:
        print("Warning: 'mean_fit_time' key not found in cv_results_.")
        performances_df['Execution time'] = np.nan

    return performances_df



# ### model_selection_wrapper
#
# First use in [Chapter 5, Model Selection](Model_Selection).

# In[ ]:


def model_selection_wrapper(transactions_df,
                            classifier,
                            input_features, output_feature,
                            parameters,
                            scoring,
                            start_date_training_for_valid,
                            start_date_training_for_test,
                            n_folds=4,
                            delta_train=7,
                            delta_delay=7,
                            delta_assessment=7,
                            performance_metrics_list_grid=['roc_auc'],
                            performance_metrics_list=['AUC ROC'],
                            n_jobs=-1):
    """Run the prequential grid search for validation and test, then join the results.

    ``prequential_grid_search`` is called twice with identical settings apart
    from the training start date and the ``expe_type`` label ('Validation'
    first, then 'Test').

    Args:
        transactions_df, classifier, input_features, output_feature,
        parameters, scoring: Forwarded unchanged to both searches.
        start_date_training_for_valid: Fold-0 training start date of the
            validation search.
        start_date_training_for_test: Fold-0 training start date of the
            test search.
        n_folds, delta_train, delta_delay, delta_assessment,
        performance_metrics_list_grid, performance_metrics_list, n_jobs:
            Forwarded unchanged to both searches.

    Returns:
        pd.DataFrame: Test results outer-merged with the validation results
        on 'Parameters summary' (the validation 'Parameters' and
        'Execution time' columns are dropped before merging). If one search
        produced nothing, the other's results are returned alone; if both are
        empty, an empty DataFrame; if 'Parameters summary' is missing from
        either side, the test results alone.
    """
    # Settings common to both searches
    shared_settings = dict(n_folds=n_folds,
                           delta_train=delta_train,
                           delta_delay=delta_delay,
                           delta_assessment=delta_assessment,
                           performance_metrics_list_grid=performance_metrics_list_grid,
                           performance_metrics_list=performance_metrics_list,
                           n_jobs=n_jobs)

    print("--- Running Prequential Grid Search for Validation Set ---")
    validation_results = prequential_grid_search(transactions_df, classifier,
                                                 input_features, output_feature,
                                                 parameters, scoring,
                                                 start_date_training=start_date_training_for_valid,
                                                 expe_type='Validation',
                                                 **shared_settings)

    print("--- Running Prequential Grid Search for Test Set Estimation ---")
    test_results = prequential_grid_search(transactions_df, classifier,
                                           input_features, output_feature,
                                           parameters, scoring,
                                           start_date_training=start_date_training_for_test,
                                           expe_type='Test',
                                           **shared_settings)

    test_is_empty = test_results.empty
    validation_is_empty = validation_results.empty

    # Degenerate cases first: nothing to merge
    if test_is_empty and validation_is_empty:
        print("Warning (model_selection_wrapper): Both Test and Validation results are empty.")
        return pd.DataFrame()
    if test_is_empty:
        print("Warning (model_selection_wrapper): Test results are empty. Returning only Validation results.")
        return validation_results
    if validation_is_empty:
        print("Warning (model_selection_wrapper): Validation results are empty. Returning only Test results.")
        return test_results

    # The merge key must be present on both sides
    merge_key = 'Parameters summary'
    if merge_key not in test_results.columns or merge_key not in validation_results.columns:
        print("ERROR (model_selection_wrapper): 'Parameters summary' column missing. Cannot merge results. Returning Test results only.")
        return test_results

    validation_scores_only = validation_results.drop(columns=['Parameters','Execution time'], errors='ignore')
    # Outer merge keeps every tested parameter set, even if one side lacks it
    return pd.merge(test_results, validation_scores_only, on='Parameters summary', how='outer')


# ### kfold_cv_with_classifier
#
# First use in [Chapter 6, Cost-sensitive learning](Cost_Sensitive_Learning).
# NOTE: Not used in this notebook's main flow, kept for original function adherence.

# In[ ]:


def kfold_cv_with_classifier(classifier,
                             X,
                             y,
                             n_splits=5,
                             strategy_name="Basline classifier"):
    """Cross-validate ``classifier`` and return a summary plus the first fold.

    A shuffled ``StratifiedKFold`` (``random_state=0``) is run through
    ``sklearn.model_selection.cross_validate`` with the ``roc_auc``,
    ``average_precision`` and ``balanced_accuracy`` scorers.

    Parameters
    ----------
    classifier : estimator passed unchanged to ``cross_validate``.
    X : pandas DataFrame or numpy array; only its first two columns are copied
        into the returned fold frames.
    y : labels aligned with ``X`` (pandas object or array-like).
    n_splits : number of folds.
    strategy_name : index label of the summary row. The default spelling
        ("Basline") is kept as-is for backward compatibility.

    Returns
    -------
    tuple ``(results_df, classifier_0, train_df, test_df)`` where
    ``results_df`` is a one-row frame of ``"mean+/-std"`` strings,
    ``classifier_0`` is the estimator fitted on the first fold, and
    ``train_df`` / ``test_df`` hold columns ``X1``, ``X2``, ``Y`` for the
    first fold's train / test rows.

    Raises
    ------
    TypeError
        If ``X`` is neither a pandas DataFrame nor a numpy array.
    """
    # Fail fast on unsupported containers before paying for the cross-validation.
    if not isinstance(X, (pd.DataFrame, np.ndarray)):
        raise TypeError("Input X must be a pandas DataFrame or numpy array")

    cv = sklearn.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    cv_results_ = sklearn.model_selection.cross_validate(
        classifier, X, y, cv=cv,
        scoring=['roc_auc', 'average_precision', 'balanced_accuracy'],
        return_estimator=True)

    # Aggregate only the numeric entries, selected by name. The 'estimator'
    # entry holds fitted objects: averaging it raises on pandas >= 2.0, and
    # relying on dict order would silently mislabel the columns below.
    metric_keys = ['fit_time', 'score_time',
                   'test_roc_auc', 'test_average_precision', 'test_balanced_accuracy']
    results = round(pd.DataFrame({key: cv_results_[key] for key in metric_keys}), 3)
    results_mean = results.mean()
    results_std = results.std()

    results_df = pd.DataFrame(
        [[str(round(results_mean[key], 3)) + '+/-' + str(round(results_std[key], 3))
          for key in metric_keys]],
        columns=['Fit time (s)', 'Score time (s)',
                 'AUC ROC', 'Average Precision', 'Balanced accuracy'],
        index=[strategy_name])

    classifier_0 = cv_results_['estimator'][0]

    # The splitter is seeded, so this reproduces the first fold used above.
    (train_index, test_index) = next(cv.split(X, y))

    def _labels_at(row_positions):
        # Positional selection regardless of whether y is pandas or array-like.
        if hasattr(y, 'iloc'):
            return y.iloc[row_positions]
        return np.asarray(y)[row_positions]

    def _fold_frame(row_positions):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame({'X1': X.iloc[row_positions, 0],
                                 'X2': X.iloc[row_positions, 1],
                                 'Y': _labels_at(row_positions)})
        # numpy X: strip any pandas index from y so all columns align by position.
        return pd.DataFrame({'X1': X[row_positions, 0],
                             'X2': X[row_positions, 1],
                             'Y': np.asarray(_labels_at(row_positions))})

    train_df = _fold_frame(train_index)
    test_df = _fold_frame(test_index)

    return (results_df, classifier_0, train_df, test_df)


# ## Plotting (Copied from Original)

# ### get_tx_stats
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).

# In[ ]:


# Compute the number of transactions per day, fraudulent transactions per day and fraudulent cards per day

def get_tx_stats(transactions_df, start_date_df="2018-04-01"):
    """Aggregate per-day transaction, fraud and compromised-card counts.

    Rows are grouped on ``TX_TIME_DAYS``. The result has one row per day with
    the number of transactions, the sum of ``TX_FRAUD``, the number of distinct
    ``CUSTOMER_ID`` values among fraudulent rows (0 when a day has none), and a
    ``tx_date`` column equal to ``start_date_df`` plus ``TX_TIME_DAYS`` days.

    An empty frame, or one without ``TX_TIME_DAYS``, yields an empty frame with
    the statistic columns and a printed warning. If ``start_date_df`` does not
    parse as ``%Y-%m-%d``, ``tx_date`` is filled with ``NaT``.
    """
    day_key = 'TX_TIME_DAYS'

    if transactions_df.empty or day_key not in transactions_df.columns:
        print("Warning (get_tx_stats): Input DataFrame is empty or missing TX_TIME_DAYS.")
        return pd.DataFrame(columns=["tx_date", "nb_tx_per_day", "nb_fraudulent_transactions_per_day", "nb_compromised_cards_per_day"])

    # Series are aligned on the day index; days without fraud become NaN first.
    tx_stats = pd.DataFrame({
        "nb_tx_per_day":
            transactions_df.groupby([day_key])['CUSTOMER_ID'].count(),
        "nb_fraudulent_transactions_per_day":
            transactions_df.groupby([day_key])['TX_FRAUD'].sum(),
        "nb_compromised_cards_per_day":
            transactions_df[transactions_df['TX_FRAUD'] == 1]
            .groupby([day_key]).CUSTOMER_ID.nunique(),
    }).reset_index()

    # Days with no fraudulent rows have no compromised cards.
    tx_stats = tx_stats.fillna(0)

    try:
        origin = datetime.datetime.strptime(start_date_df, "%Y-%m-%d")

        def _day_to_date(day_offset):
            return origin + datetime.timedelta(days=day_offset)

        tx_stats['tx_date'] = tx_stats[day_key].apply(_day_to_date)
    except ValueError as e:
        print(f"Warning (get_tx_stats): Error parsing start_date_df '{start_date_df}': {e}. Cannot create 'tx_date' column.")
        tx_stats['tx_date'] = pd.NaT

    return tx_stats


# ### get_template_tx_stats
#
# First use in [Chapter 3, Baseline Fraud Detection System](Baseline_FDS).

# In[ ]:


# Plot the number of transactions per day, fraudulent transactions per day and fraudulent cards per day

def get_template_tx_stats(ax ,fs,
                          start_date_training,
                          title='',
                          delta_train=7,
                          delta_delay=7,
                          delta_test=7,
                          ylim=300):
    """Decorate ``ax`` as a train / delay / test timeline template.

    Sets the title, axis labels and the y-range ``[0, ylim]``, draws dashed
    vertical lines at the end of the training and delay periods, and writes
    the period names centred over each period.

    Parameters
    ----------
    ax : Matplotlib axes to decorate (modified in place; nothing is returned).
    fs : base font size; the title uses ``1.5*fs`` and tick labels ``0.7*fs``.
    start_date_training : start of the training period. It is added to
        ``datetime.timedelta`` objects, so it must support that arithmetic.
    title : axes title.
    delta_train, delta_delay, delta_test : period lengths in days.
    ylim : upper bound of the y-axis, in data units.
    """
    ax.set_title(title, fontsize=fs*1.5)
    ax.set_ylim([0, ylim])

    ax.set_xlabel('Date', fontsize=fs)
    ax.set_ylabel('Number', fontsize=fs)

    # Style the tick labels of *this* axes. pyplot's xticks/yticks act on the
    # current axes, which is not necessarily `ax` in a multi-panel figure.
    tick_fontsize = fs*0.7
    for tick_label in ax.get_yticklabels():
        tick_label.set_fontsize(tick_fontsize)
    for tick_label in ax.get_xticklabels():
        tick_label.set_fontsize(tick_fontsize)
        tick_label.set_rotation(45)  # rotated for readability
        tick_label.set_horizontalalignment('right')

    train_end_date = start_date_training + datetime.timedelta(days=delta_train)
    delay_end_date = start_date_training + datetime.timedelta(days=delta_train + delta_delay)

    # axvline takes its vertical extent as axes fractions (0..1), not data
    # units, so span the full height explicitly instead of passing `ylim`.
    ax.axvline(train_end_date, 0, 1, color="black", linestyle='--')
    ax.axvline(delay_end_date, 0, 1, color="black", linestyle='--')

    # Period names sit just below the top of the axes, centred on each period.
    text_y_pos = ylim * 0.95
    ax.text(start_date_training + datetime.timedelta(days=delta_train/2), text_y_pos, 'Training', fontsize=fs, ha='center', va='top')
    ax.text(train_end_date + datetime.timedelta(days=delta_delay/2), text_y_pos, 'Delay', fontsize=fs, ha='center', va='top')
    ax.text(delay_end_date + datetime.timedelta(days=delta_test/2), text_y_pos, 'Test', fontsize=fs, ha='center', va='top')

    # Best-effort date formatting. Import locally so this does not depend on a
    # module-level `matplotlib` name being bound.
    try:
        import matplotlib.dates as mdates
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    except Exception as e:
        print(f"Note: Could not set date formatter: {e}")


# ### get_template_roc_curve
#
# First use in [Chapter 4, Threshold Free Metrics](Threshold_Free_Metrics).

# In[ ]:


def get_template_roc_curve(ax, title,fs,random=True):
    """Prepare ``ax`` for a ROC curve.

    Sets the title and axis labels at font size ``fs``, pads both axes to
    ``[-0.01, 1.01]``, enables the grid and, when ``random`` is truthy, draws
    the dashed red diagonal labelled as the random classifier.
    """
    padded_unit_interval = [-0.01, 1.01]

    ax.set_title(title, fontsize=fs)
    ax.set_xlim(padded_unit_interval)
    ax.set_ylim(padded_unit_interval)

    axis_labels = (
        (ax.set_xlabel, 'False Positive Rate'),
        (ax.set_ylabel, 'True Positive Rate'),
    )
    for apply_label, text in axis_labels:
        apply_label(text, fontsize=fs)

    if random:
        # Chance-level reference line.
        ax.plot([0, 1], [0, 1], 'r--', label="AUC ROC Random = 0.5")

    ax.grid(True)


# ### get_template_pr_curve
#
# First use in [Chapter 4, Threshold Free Metrics](Threshold_Free_Metrics).

# In[ ]:


def get_template_pr_curve(ax, title,fs, baseline=0.5):
    """Prepare ``ax`` for a precision-recall curve.

    Sets the title and axis labels at font size ``fs``, pads both axes to
    ``[-0.01, 1.01]``, draws a dashed red horizontal line at ``baseline``
    (labelled with the value to three decimals) and enables the grid.
    """
    padded_unit_interval = [-0.01, 1.01]

    ax.set_title(title, fontsize=fs)
    ax.set_xlim(padded_unit_interval)
    ax.set_ylim(padded_unit_interval)

    ax.set_xlabel('Recall (True Positive Rate)', fontsize=fs)
    ax.set_ylabel('Precision', fontsize=fs)

    # The caller supplies the reference precision (ideally the fraud rate of
    # the plotted dataset).
    reference_label = f'AP Random = {baseline:0.3f}'
    ax.plot([0, 1], [baseline, baseline], 'r--', label=reference_label)

    ax.grid(True)


# ### get_performance_plot
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


# Get the performance plot for a single performance metric
def get_performance_plot(performances_df,
                         ax,
                         performance_metric,
                         expe_type_list=['Test','Validation'],
                         expe_type_color_list=['#008000','#FF0000'], # Green for Test, Red for Validation
                         parameter_name="Parameter summary",
                         summary_performances=None):
    """Plot one performance metric against the tested parameter sets on ``ax``.

    For every experiment type in ``expe_type_list`` the column
    ``"<performance_metric> <expe_type>"`` is drawn as a line over integer
    tick positions, with a shaded band of two standard deviations when the
    matching ``"... Std"`` column exists. Missing columns are reported with a
    printed note and skipped.

    Parameters
    ----------
    performances_df : DataFrame with one row per parameter set.
    ax : Matplotlib axes to draw on (modified in place; nothing is returned).
    performance_metric : metric name used as column prefix, title and y-label.
    expe_type_list : experiment types to plot. The list defaults are only
        read, never mutated.
    expe_type_color_list : colours, indexed in parallel with ``expe_type_list``.
    parameter_name : column holding the parameter labels, rendered with
        ``str``.
        NOTE(review): the merge in ``model_selection_wrapper`` keys on
        'Parameters summary' (plural) - confirm callers pass the column name
        that actually exists.
    summary_performances : optional DataFrame indexed by
        "Best estimated parameters" / "Validation performance" with one column
        per metric. When present, a dashed red line marks the best validation
        parameter set.
    """
    if performances_df.empty:
        print(f"Skipping plot for {performance_metric}: No data.")
        ax.text(0.5, 0.5, 'No performance data', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(performance_metric+'\n', fontsize=14)
        return

    # Check for parameter column
    if parameter_name not in performances_df.columns:
         print(f"Warning (get_performance_plot): Parameter column '{parameter_name}' not found. Cannot plot.")
         ax.text(0.5, 0.5, f'Missing column:\n{parameter_name}', ha='center', va='center', transform=ax.transAxes)
         ax.set_title(performance_metric+'\n', fontsize=14)
         return

    # Parameters may be dicts or lists; stringify them so they can label ticks.
    parameter_summary_col = performances_df[parameter_name].apply(str)
    parameter_ticks = np.arange(len(parameter_summary_col))

    # Plot lines and confidence intervals
    for i, expe_type in enumerate(expe_type_list):
        performance_metric_expe_type = performance_metric+' '+expe_type
        if performance_metric_expe_type not in performances_df.columns:
            print(f"Note: Metric column '{performance_metric_expe_type}' not found. Skipping plot for {expe_type}.")
            continue

        # Non-numeric entries become NaN so they leave gaps instead of failing.
        perf_data = pd.to_numeric(performances_df[performance_metric_expe_type], errors='coerce')
        if perf_data.isna().all():
            continue

        ax.plot(parameter_ticks, perf_data,
                color=expe_type_color_list[i], label = expe_type, marker='o')

        std_col = performance_metric_expe_type+' Std'
        if std_col in performances_df.columns:
            std_data = pd.to_numeric(performances_df[std_col], errors='coerce')
            if not std_data.isna().all():
                conf_min = perf_data - 2*std_data
                conf_max = perf_data + 2*std_data
                ax.fill_between(parameter_ticks, conf_min, conf_max, color=expe_type_color_list[i], alpha=.1)
        else:
             print(f"Note: Std Dev column '{std_col}' not found for {expe_type}.")

    # Add vertical line for best validation parameter
    if summary_performances is not None and not summary_performances.empty and performance_metric in summary_performances.columns:
        best_estimated_parameter_str = str(summary_performances.loc["Best estimated parameters", performance_metric])
        val_perf_str = summary_performances.loc["Validation performance", performance_metric]
        if val_perf_str != 'N/A' and val_perf_str != 'NaN' and best_estimated_parameter_str != 'N/A':
            try:
                # Summary cells are formatted as "mean+/-std"; keep the mean.
                best_estimated_performance=float(val_perf_str.split("+/-")[0])
                ymin, _ = ax.get_ylim()
                # Ticks are positional, so look up the row *position* of the
                # best parameter set. The index label only coincides with the
                # position when the frame has a default RangeIndex.
                param_pos = next(
                    (pos for pos, label in enumerate(parameter_summary_col)
                     if label == best_estimated_parameter_str),
                    None)
                if param_pos is not None:
                    ax.vlines(parameter_ticks[param_pos], ymin, best_estimated_performance, linestyles="dashed", color='red', label='Best Validation Param')
                else:
                    print(f"Warning (get_performance_plot): Best parameter '{best_estimated_parameter_str}' not found for plotting vline.")
            except Exception as e:
                 # The marker is decorative; never let it break the plot.
                 print(f"Warning (get_performance_plot): Error plotting vline: {e}")

    ax.set_title(performance_metric+'\n', fontsize=14)
    ax.set_xlabel(parameter_name, fontsize=12)
    ax.set_ylabel(performance_metric, fontsize=12)
    ax.set_xticks(parameter_ticks) # Use integer positions for ticks
    ax.set_xticklabels(parameter_summary_col, rotation=45, ha='right') # Use original labels
    ax.grid(True, axis='y', linestyle=':') # Add y-axis grid

    # Handle legend: one entry per distinct label
    handles, labels = ax.get_legend_handles_labels()
    if handles: # Only add legend if there are items to show
        by_label = dict(zip(labels, handles))
        ax.legend(by_label.values(), by_label.keys())


# ### get_performances_plots
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


# Get the performance plots for a set of performance metric
def get_performances_plots(performances_df,
                           performance_metrics_list=['AUC ROC', 'Average precision', 'Card Precision@100'],
                           expe_type_list=['Test','Validation'], expe_type_color_list=['#008000','#FF0000'],
                           parameter_name="Parameter summary",
                           summary_performances=None):
    """Draw one ``get_performance_plot`` panel per metric, side by side.

    Returns the created figure, or ``None`` (after printing a message) when
    ``performances_df`` is empty. The figure is not shown; a shared legend with
    de-duplicated labels is placed below the panels.
    """
    if performances_df.empty:
        print("Cannot generate plots: Performance dataframe is empty.")
        return None

    panel_count = len(performance_metrics_list)
    fig, axes = plt.subplots(1, panel_count, figsize=(6*panel_count, 6))
    if panel_count == 1:
        # A single panel comes back as a bare Axes; wrap it so it iterates.
        axes = [axes]

    for panel, metric_name in zip(axes, performance_metrics_list):
        get_performance_plot(performances_df, panel,
                             performance_metric=metric_name,
                             expe_type_list=expe_type_list,
                             expe_type_color_list=expe_type_color_list,
                             parameter_name=parameter_name,
                             summary_performances=summary_performances)

    # Shared legend: keep the first handle seen for each label, in order.
    first_handle_by_label = {}
    for panel in axes:
        panel_handles, panel_labels = panel.get_legend_handles_labels()
        for label, handle in zip(panel_labels, panel_handles):
            first_handle_by_label.setdefault(label, handle)

    if first_handle_by_label:
        legend_labels = list(first_handle_by_label.keys())
        legend_handles = list(first_handle_by_label.values())
        fig.legend(legend_handles, legend_labels, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=len(legend_labels), title="Set Type / Marker")

    # Leave padding at the top and bottom of the figure.
    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    return fig


# ### get_execution_times_plot
#
# First use in [Chapter 5, Validation Strategies](Validation_Strategies).

# In[ ]:


# Get the execution time plot: execution time for each tested parameter set
def get_execution_times_plot(performances_df,
                             title="Mean Fit Time per Parameter Set", # Adjusted title
                             parameter_name="Parameter summary"):
    """Plot the 'Execution time' column against the tested parameter sets.

    Returns the created figure, or ``None`` (after printing a message) when
    the frame is empty or lacks the 'Execution time' or ``parameter_name``
    column. The figure is not shown.
    """
    needed_columns = ('Execution time', parameter_name)
    if performances_df.empty or any(name not in performances_df.columns for name in needed_columns):
        print("Cannot generate execution time plot: Required data missing.")
        return None

    fig, ax = plt.subplots(1,1, figsize=(8, 5))

    # Non-numeric timings are coerced to NaN.
    timings = pd.to_numeric(performances_df["Execution time"], errors='coerce')
    tick_labels = performances_df[parameter_name].apply(str)
    tick_positions = np.arange(len(tick_labels))

    if timings.isna().all():
        ax.text(0.5, 0.5, 'No execution time data', ha='center', va='center', transform=ax.transAxes)
    else:
        ax.plot(tick_positions, timings, color="black", marker='o')

    ax.set_title(title, fontsize=14)
    ax.set_xlabel(parameter_name)
    ax.set_ylabel("Mean Fit Time (seconds)")
    # Integer tick positions carry the stringified parameter labels.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.grid(True, axis='y', linestyle=':')
    plt.tight_layout()
    return fig


# ### get_model_selection_performances_plots (Plotting function)
#
# First use in [Chapter 5, Model Selection](Model_Selection).
# Renamed from original to avoid conflict

# In[ ]:


# ### get_model_selection_performances_plots (Plotting function)
#
# First use in [Chapter 5, Model Selection](Model_Selection).
# Renamed from original to avoid conflict

# In[ ]:

# Get the performance plot for a single performance metric comparison
def plot_model_selection_performance_bar(performances_df_dictionary,
                                         ax,
                                         performance_metric,
                                         ylim=[0,1],
                                         model_classes=['Decision Tree', 'Logistic Regression', 'Random Forest', 'XGBoost']):
    """Draw grouped bars comparing three parameter choices per model class.

    Means and standard deviations come from ``model_selection_performances``.
    For each model class, three bars ('Default parameters', 'Best validation
    parameters', 'Optimal parameters') are drawn with error bars. A series
    missing from the returned dictionaries is treated as all-NaN, and NaN
    standard deviations are drawn as zero-length error bars.
    """
    # Parameter-summary strings identifying each model's default configuration.
    lookup_defaults = {
         "Decision Tree": 'max_depth=None, random_state=0',
         "Logistic Regression": 'C=1.0, random_state=0',
         "Random Forest": "max_depth=None, n_estimators=100, n_jobs=1, random_state=0",
         "XGBoost": "eval_metric=logloss, learning_rate=0.3, max_depth=6, n_estimators=100, n_jobs=1, random_state=0, use_label_encoder=False, verbosity=0"
     }

    mean_by_series, std_by_series = model_selection_performances(
        performances_df_dictionary=performances_df_dictionary,
        performance_metric=performance_metric,
        model_classes=model_classes,
        default_parameters_dictionary=lookup_defaults)

    bar_width = 0.25
    group_centres = np.arange(len(model_classes))
    all_nan = [np.nan]*len(model_classes)

    # (series label, bar positions, fill colour), left to right within a group.
    series_layout = (
        ('Default parameters', group_centres - bar_width, '#CA8035'),
        ('Best validation parameters', group_centres, '#008000'),
        ('Optimal parameters', group_centres + bar_width, '#2F4D7E'),
    )

    # Phase 1: coerce every series to numbers before drawing anything.
    numeric_series = []
    for series_label, _, _ in series_layout:
        means = pd.to_numeric(mean_by_series.get(series_label, all_nan), errors='coerce')
        stds = pd.to_numeric(std_by_series.get(series_label, all_nan), errors='coerce')
        numeric_series.append((means, stds))

    # Phase 2: error bars cannot be NaN, so replace missing std devs with 0.
    numeric_series = [(means, np.nan_to_num(stds, nan=0.0)) for means, stds in numeric_series]

    # Phase 3: draw the bars.
    for (series_label, positions, fill_colour), (means, stds) in zip(series_layout, numeric_series):
        ax.bar(positions, means, width=bar_width, color=fill_colour, edgecolor='black',
               yerr=stds, capsize=5, label=series_label)

    ax.set_ylim(ylim[0],ylim[1])
    ax.set_xticks(group_centres) # Center ticks on the groups
    ax.set_xticklabels(model_classes, rotation = 45, ha="right", fontsize=12)
    ax.set_title(performance_metric+'\n', fontsize=16)
    ax.set_ylabel(performance_metric, fontsize=14)
    ax.grid(True, axis='y', linestyle=':')



# ### plot_decision_boundary_classifier
#
# First use in [Chapter 6, Cost-sensitive learning](Cost_Sensitive_Learning).
# NOTE: Only works for 2 input features, not used in main flow here.

# In[ ]:


def plot_decision_boundary_classifier(ax,
                                      classifier,
                                      train_df,
                                      input_features=['X1','X2'],
                                      output_feature='Y',
                                      title="",
                                      fs=14,
                                      plot_training_data=True):
    """Draw the decision surface of a two-feature classifier on ``ax``.

    The classifier is evaluated on a 0.1-step grid covering the range of the
    two input features in ``train_df``, padded by 1 on each side. The surface
    shows ``predict_proba(...)[:, 1]`` when the classifier offers it (a
    colorbar is then attached to ``ax``), otherwise ``predict(...)``. With
    ``plot_training_data`` the rows of ``train_df`` are scattered on top, one
    series per value of ``output_feature``.

    If ``input_features`` does not have exactly two entries, a warning is
    printed, a notice is written on the axes and nothing else is drawn.
    """
    if len(input_features) != 2:
        print("Warning (plot_decision_boundary): Can only plot decision boundary for exactly 2 input features.")
        ax.text(0.5, 0.5, 'Plot requires 2 features', ha='center', va='center', transform=ax.transAxes)
        ax.set_title(title, fontsize=fs)
        return

    first_feature, second_feature = input_features
    grid_step = 0.1

    # Grid bounds: feature range padded by one unit on each side.
    x1_low = train_df[first_feature].min() - 1
    x1_high = train_df[first_feature].max() + 1
    x2_low = train_df[second_feature].min() - 1
    x2_high = train_df[second_feature].max() + 1

    xx, yy = np.meshgrid(np.arange(x1_low, x1_high, grid_step),
                         np.arange(x2_low, x2_high, grid_step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    has_probabilities = hasattr(classifier, "predict_proba")
    if has_probabilities:
        surface = classifier.predict_proba(grid_points)[:,1]
    else:
        surface = classifier.predict(grid_points)
    surface = surface.reshape(xx.shape)

    cs = ax.contourf(xx, yy, surface, cmap=plt.cm.RdYlBu_r, alpha=0.3)

    if plot_training_data:
        for class_value, members in train_df.groupby(output_feature):
            ax.scatter(members[first_feature], members[second_feature],
                       edgecolors='black', label=str(class_value), s=20)

    ax.set_title(title, fontsize=fs)
    ax.set_xlabel(first_feature, fontsize=fs)
    ax.set_ylabel(second_feature, fontsize=fs)
    ax.grid(True, linestyle=':')

    # A colorbar only makes sense for a probability surface.
    if has_probabilities:
        cbar = plt.colorbar(cs, ax=ax)
        cbar.set_label('Predicted Fraud Probability', rotation=270, labelpad=15)


# ### plot_decision_boundary
#
# First use in [Chapter 6, Cost-sensitive learning](Cost_Sensitive_Learning).
# NOTE: Only works for 2 input features, not used in main flow here.

# In[ ]:


def plot_decision_boundary(classifier_0,
                           train_df,
                           test_df):
    """Plot the decision surface of ``classifier_0`` in three panels.

    Panels, left to right: surface with the training points, surface alone,
    surface with the test points. The first two columns of ``train_df`` are
    used as input features and the third as the class column (falling back to
    'Y' when there is no third column).

    Each panel is drawn by ``plot_decision_boundary_classifier``, which already
    attaches its own colorbar when the classifier exposes ``predict_proba``.
    No extra figure-level colorbar is added and the surface is not re-drawn,
    so the middle panel has the same opacity as the others and nothing
    overlaps the class legend on the right.

    Returns the figure, or ``None`` (after printing a warning) when
    ``train_df`` has fewer than two columns.
    """
    # Assume input features are the first two columns
    input_features_plot = list(train_df.columns[:2])
    output_feature_plot = train_df.columns[2] if len(train_df.columns) > 2 else 'Y'

    if len(input_features_plot) != 2:
         print("Warning (plot_decision_boundary): Cannot plot, requires exactly 2 features.")
         return None

    fig_decision_boundary, ax = plt.subplots(1, 3, figsize=(18, 5))

    # (axes, frame defining grid and points, title, scatter the points?)
    panels = (
        (ax[0], train_df, "Decision surface\n With training data", True),
        (ax[1], train_df, "Decision surface\n", False),
        (ax[2], test_df, "Decision surface\n With test data", True),
    )
    for panel_ax, panel_df, panel_title, show_points in panels:
        plot_decision_boundary_classifier(panel_ax, classifier_0,
                                          panel_df,
                                          input_features=input_features_plot,
                                          output_feature=output_feature_plot,
                                          title=panel_title,
                                          plot_training_data=show_points)

    # Shared class legend, taken from a panel that shows data points.
    handles, labels = ax[2].get_legend_handles_labels()
    if handles:
        fig_decision_boundary.legend(handles, labels, loc='center right',
                                   bbox_to_anchor=(0.98, 0.5),
                                   title="Class")

    # Reserve the right margin for the figure-level legend.
    plt.tight_layout(rect=[0, 0, 0.9, 1])

    return fig_decision_boundary



## 1. Start MLflow Run & Define Parameters (Aligned with Original)

In [None]:
# --- Start Parent MLflow Run for the entire notebook execution ---
# The run is started without a context manager, so it stays active for the
# cells that follow until mlflow.end_run() is called.
mlflow_run = mlflow.start_run(run_name="final_MLOps_Model_Building_Selection_Run") # Adjusted name
run_id = mlflow_run.info.run_id
print(f"Started MLflow Run ID: {run_id}")
if mlflow.get_tracking_uri().startswith("azureml"): # Check if tracking Azure ML
    if ws:
        # Construct Azure ML portal URL (replace with your specific details if needed)
        # Assuming standard Azure public cloud
        azure_portal_url = f"https://ml.azure.com/experiments/guid/{mlflow_run.info.experiment_id}/runs/{run_id}?wsid=/subscriptions/{ws.subscription_id}/resourcegroups/{ws.resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{ws.name}"
        print(f"MLflow Run Azure ML UI: {azure_portal_url}")
    else:
        # Same guard as the workspace tag below: without a workspace object the
        # portal URL cannot be built, but the run itself is still tracked.
        print("MLflow Run is tracked in Azure ML, but no workspace object is available to build the portal URL.")
else:
    print(f"MLflow Run local UI: http://localhost:5000/#/experiments/{mlflow_run.info.experiment_id}/runs/{run_id}") # Assuming default local port

# --- Log Tags and Environment Info ---
mlflow.set_tag("Workflow Step", "Model Building & Selection (Aligned)")
# mlflow.set_tag("Git Commit", GIT_COMMIT)
# mlflow.set_tag("Git Repo URL", GIT_REPO_URL)
# Keep only the part of sys.version before any '|' separator.
mlflow.set_tag("Python Version", sys.version.split('|')[0].strip())
mlflow.set_tag("MLflow Version", mlflow.__version__)
mlflow.set_tag("Scikit-learn Version", sklearn.__version__)
mlflow.set_tag("XGBoost Version", xgboost.__version__)
if ws:
    mlflow.set_tag("Azure ML Workspace", ws.name)

# --- Log Input Data Information ---
# `input_dataset` is expected to be set by an earlier cell; a falsy value is
# recorded as a failed load rather than raising here.
if input_dataset:
    mlflow.set_tag("Input Dataset Name", input_dataset.name)
    mlflow.set_tag("Input Dataset Version", input_dataset.version)
    mlflow.set_tag("Input Dataset ID", input_dataset.id)
else:
    mlflow.set_tag("Input Dataset Name", "ERROR - Not Loaded")
    print("WARNING: Input dataset object not available for MLflow logging.")

# --- Define and Log Core Parameters (ALIGNED WITH ORIGINAL NOTEBOOK cell 86e7ebefca3940668a4b03082ff390e8) ---
# Everything defined in this cell is a module-level name read by later cells,
# and each value is also logged to the active MLflow run as a parameter.

# Define Data Loading Range (Matches original notebook)
# Both bounds are ISO "YYYY-MM-DD" strings.
BEGIN_DATE_LOAD = "2025-06-11"
END_DATE_LOAD = "2025-08-14"

# Define Training / Validation / Test Periods Parameters (Matches original notebook)
# All DELTA_* values are durations in days (they feed datetime.timedelta(days=...)).
DELTA_TRAIN = 7
DELTA_DELAY = 7
DELTA_ASSESSMENT = 7 # Used for validation and test period lengths
DELTA_VALID = DELTA_ASSESSMENT # Original used this implicitly
DELTA_TEST = DELTA_ASSESSMENT # Original used this implicitly
N_FOLDS = 4 # Number of folds for prequential validation
TOP_K_VALUE = 100 # For Card Precision@k metric (used in original)

# Define the _anchor_ training start date (Matches original notebook)
START_DATE_TRAINING_ANCHOR_STR = "2025-07-25"
start_date_training_anchor = datetime.datetime.strptime(START_DATE_TRAINING_ANCHOR_STR, "%Y-%m-%d")

# Calculate start dates relative to the anchor (Matches original notebook calculations)
# Validation starts DELTA_DELAY + DELTA_VALID = 14 days before the anchor, i.e. 2025-07-11.
start_date_training_for_valid = start_date_training_anchor - datetime.timedelta(days=(DELTA_DELAY + DELTA_VALID)) # Used for model_selection_wrapper Validation
start_date_training_for_test_estimation = start_date_training_anchor # Used for model_selection_wrapper Test



# **REVISED Final Split Dates based on original notebook's Step 3/4 usage:**
# The final test set starts once training and delay have elapsed:
# anchor + DELTA_TRAIN + DELTA_DELAY = 2025-08-08.
start_date_training_final = start_date_training_anchor # Use anchor date as start for final training
start_date_test_final = start_date_training_final + datetime.timedelta(days=(DELTA_TRAIN + DELTA_DELAY))

# Log parameters to MLflow
# NOTE: DELTA_VALID and DELTA_TEST are not logged separately; both equal
# DELTA_ASSESSMENT, which is logged as "delta_assessment".
mlflow.log_param("data_load_begin_date", BEGIN_DATE_LOAD)
mlflow.log_param("data_load_end_date", END_DATE_LOAD)
mlflow.log_param("delta_train", DELTA_TRAIN)
mlflow.log_param("delta_delay", DELTA_DELAY)
mlflow.log_param("delta_assessment", DELTA_ASSESSMENT)
mlflow.log_param("prequential_n_folds", N_FOLDS)
mlflow.log_param("card_precision_top_k", TOP_K_VALUE)
mlflow.log_param("anchor_date_train_start", START_DATE_TRAINING_ANCHOR_STR)
# Log the actual start dates used for validation/test estimation loops
mlflow.log_param("validation_gridsearch_start_date", start_date_training_for_valid.strftime('%Y-%m-%d'))
mlflow.log_param("test_estimation_gridsearch_start_date", start_date_training_for_test_estimation.strftime('%Y-%m-%d'))
# Log the final split dates
mlflow.log_param("final_train_start_date", start_date_training_final.strftime('%Y-%m-%d'))
mlflow.log_param("final_test_start_date", start_date_test_final.strftime('%Y-%m-%d'))

# Echo the derived dates so they can be checked against the original notebook.
print(f"Anchor Training Start Date: {start_date_training_anchor.strftime('%Y-%m-%d')}")
print(f"Validation GridSearch Start (Fold 0 Train Start): {start_date_training_for_valid.strftime('%Y-%m-%d')}")
print(f"Test Estimation GridSearch Start (Fold 0 Train Start): {start_date_training_for_test_estimation.strftime('%Y-%m-%d')}")
print(f"--- FINAL SPLIT DATES ---")
print(f"Final Training Set Start Date (for Step 5): {start_date_training_final.strftime('%Y-%m-%d')}")
print(f"Final Test Set Start Date (for Step 6): {start_date_test_final.strftime('%Y-%m-%d')}")
print(f"-------------------------")

# Define Features (Matches original notebook cell 86e7ebefca3940668a4b03082ff390e8)
# OUTPUT_FEATURE is the label column; INPUT_FEATURES lists the 15 model input columns.
OUTPUT_FEATURE = "TX_FRAUD"
INPUT_FEATURES = ['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
                  'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW', 'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
                  'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
                  'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_1DAY_WINDOW',
                  'TERMINAL_ID_RISK_1DAY_WINDOW', 'TERMINAL_ID_NB_TX_7DAY_WINDOW',
                  'TERMINAL_ID_RISK_7DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
                  'TERMINAL_ID_RISK_30DAY_WINDOW']

mlflow.log_param("output_feature", OUTPUT_FEATURE)
# Log input features as a list (log as JSON string for MLflow)
mlflow.log_param("input_features", json.dumps(INPUT_FEATURES))



In [None]:
# mlflow.end_run()
# print(f"\nFinished and closed MLflow Run ID: {run_id}")

## 2. Load Versioned Data (Transformed Features)

Load the pre-processed data generated by the feature engineering notebook. Assumes `dvc pull` has been run if necessary to ensure the correct version (tracked by `simulated-data-transformed.dvc`) is present locally in the path defined by `TRANSFORMED_DATA_LOCAL_PATH`.

In [None]:

# Load the transformed transactions from the mounted dataset, validate the
# result, and record the outcome (metrics + status tags) on the MLflow run.
# Any failure ends the active MLflow run as FAILED and stops the notebook
# by raising SystemExit.
transactions_df = pd.DataFrame()  # Stays empty if loading fails
load_start_time = time.time()

DIR_INPUT = None  # Will be set inside the 'with' block

# Upper bound on the error text sent to the tracking server; tracking
# backends cap the length of param values.
_MAX_LOGGED_ERROR_CHARS = 500

try:
    # Mount the dataset to a temporary local path
    with input_dataset.mount() as mount_context:
        DIR_INPUT = mount_context.mount_point  # Set DIR_INPUT to the mounted path
        print(f"Dataset mounted temporarily at: {DIR_INPUT}")
        print(f"\nLoading data from {DIR_INPUT} between {BEGIN_DATE_LOAD} and {END_DATE_LOAD}...")
        # Use the read_from_files function copied from the original notebook
        transactions_df = read_from_files(DIR_INPUT, BEGIN_DATE_LOAD, END_DATE_LOAD)
        load_exec_time = time.time() - load_start_time

        if transactions_df.empty:
            print("ERROR: Loaded DataFrame is empty. Check input path, date range, and file contents.")
            mlflow.set_tag("Data Loading Status", "Failed - Empty DataFrame")
            # Stop execution if data loading fails
            raise SystemExit("Stopping execution: Failed to load transaction data.")

        # These values do not depend on any particular column being present.
        mlflow.log_metric("loaded_data_rows", len(transactions_df))
        mlflow.log_metric("loaded_data_load_time_sec", load_exec_time)
        mlflow.set_tag("Data Loading Status", "Completed - Success")

        # Validate the schema BEFORE reading any column, so that a missing
        # column is reported as such instead of surfacing as an AttributeError.
        required_cols = INPUT_FEATURES + [OUTPUT_FEATURE, 'TX_DATETIME', 'TX_TIME_DAYS', 'CUSTOMER_ID', 'TRANSACTION_ID']
        missing_cols = [col for col in required_cols if col not in transactions_df.columns]
        if missing_cols:
            print(f"ERROR: Loaded data missing required columns: {missing_cols}")
            mlflow.set_tag("Data Validation", f"Failed - Missing columns: {missing_cols}")
            raise SystemExit(f"Stopping execution: Missing required columns {missing_cols}")
        mlflow.set_tag("Data Validation", "Passed - Required columns present")

        # Safe now: the label column is known to exist.
        fraud_count = transactions_df[OUTPUT_FEATURE].sum()
        print(f"{len(transactions_df)} transactions loaded in {load_exec_time:.2f}s, containing {fraud_count} fraudulent transactions.")
        mlflow.log_metric("loaded_data_fraud_count", fraud_count)

        # Check data types (optional but good practice)
        if not pd.api.types.is_datetime64_any_dtype(transactions_df['TX_DATETIME']):
            print("Warning: TX_DATETIME column is not datetime type after loading.")
            mlflow.set_tag("Data Validation Warning", "TX_DATETIME type incorrect")

# --- Exception handling (outside the 'with', but catches errors raised inside it) ---
except SystemExit as e:
    print(e)
    # Mark the run as failed before stopping, if one is active
    if mlflow.active_run():
        mlflow.end_run(status="FAILED")
    # Re-raise to ensure the notebook stops
    raise
except Exception as e:
    # Catches errors from input_dataset.mount() or from the loading/validation code above
    print(f"ERROR during data loading or mounting: {e}")
    import traceback
    traceback.print_exc()
    # Best-effort bookkeeping: a tracking failure here must not hide the
    # original error or prevent the run from being closed below.
    try:
        mlflow.set_tag("Data Loading Status", f"Failed - Exception: {type(e).__name__}")
        mlflow.log_param("Error Message", str(e)[:_MAX_LOGGED_ERROR_CHARS])
    except Exception as tracking_error:
        print(f"Warning: Could not record the failure in MLflow: {tracking_error}")
    # Stop execution and mark run as failed
    if mlflow.active_run():
        mlflow.end_run(status="FAILED")
    raise SystemExit(f"Stopping execution: Error during data loading: {e}") from e


# --- Display head if loaded successfully ---
if not transactions_df.empty:
    print("\nLoaded Data Head:")
    # Use display if available (Jupyter/IPython), otherwise print
    try:
        display(transactions_df.head())
    except NameError:
        print(transactions_df.head())

# --- Optional: End MLflow run if this is the *only* step in the run ---
# if mlflow.active_run():
#     mlflow.end_run(status="FINISHED") # Mark as finished if successful to this point

# --- The 'transactions_df' is now ready for subsequent steps (splitting, training) ---
print("\nData loading and validation complete.")

## 3. Define Final Training and Test Sets (Aligned with Original)

We create the specific final train/test split based on the **exact dates and logic** from the original notebook (`start_date_training_final`, `start_date_test_final`). This split will be used to train the _final selected_ model (Step 5) and evaluate its performance on unseen data (Step 6).

In [None]:
# Build the final train/test split used to fit (Step 5) and evaluate (Step 6)
# the selected model. Row counts and fraud rates are logged as MLflow metrics;
# the outcome is recorded in the "Final Split Status" tag.
train_df_final = pd.DataFrame()
test_df_final = pd.DataFrame()

# Guard: nothing to split without loaded data.
if transactions_df.empty:
    print("Skipping final train/test split creation as loaded data is empty.")
    raise SystemExit("Stopping execution: Cannot proceed without loaded data.")

print(f"\nCreating final train/test split (using function from original notebook)...")
print(f"Final Train Start: {start_date_training_final.strftime('%Y-%m-%d')}, Delta: {DELTA_TRAIN} days")
# Note: DELTA_TEST here refers to the duration of the final test period, which is DELTA_ASSESSMENT
print(f"Final Test Start: {start_date_test_final.strftime('%Y-%m-%d')}, Delta: {DELTA_TEST} days, Delay: {DELTA_DELAY} days")

try:
    # get_train_test_set is the function copied from the original notebook
    train_df_final, test_df_final = get_train_test_set(
        transactions_df,
        start_date_training=start_date_training_final,
        delta_train=DELTA_TRAIN,
        delta_delay=DELTA_DELAY,
        delta_test=DELTA_TEST,  # delta_test is same as delta_assessment
    )

    print(f"\nFinal training set shape: {train_df_final.shape}")
    print(f"Final test set shape: {test_df_final.shape}")
    mlflow.log_metric("final_train_set_rows", 0 if train_df_final.empty else train_df_final.shape[0])
    mlflow.log_metric("final_test_set_rows", 0 if test_df_final.empty else test_df_final.shape[0])

    # An empty training set is fatal.
    if train_df_final.empty:
        print("ERROR: Final training set is empty.")
        mlflow.set_tag("Final Split Status", "Failed - Empty Train Set")
        raise SystemExit("Stopping execution: Final training set is empty.")

    final_train_fraud_rate = train_df_final[OUTPUT_FEATURE].mean()
    print(f"Final training set fraud rate: {final_train_fraud_rate:.4f}")
    mlflow.log_metric("final_train_set_fraud_rate", final_train_fraud_rate)

    if test_df_final.empty:
        # An empty test set might be acceptable depending on the period, but flag it.
        print("Warning: Final test set is empty.")
        mlflow.set_tag("Final Split Status", "Warning - Empty Test Set")
    else:
        final_test_fraud_rate = test_df_final[OUTPUT_FEATURE].mean()
        print(f"Final test set fraud rate: {final_test_fraud_rate:.4f}")
        mlflow.log_metric("final_test_set_fraud_rate", final_test_fraud_rate)
        # Train set is known to be non-empty here, so both sets are usable.
        mlflow.set_tag("Final Split Status", "Success")

except SystemExit as e:
    print(e)
    raise  # Re-raise to stop notebook
except Exception as e:
    print(f"ERROR creating final train/test split: {e}")
    mlflow.set_tag("Final Split Status", f"Failed - Exception: {e}")
    import traceback
    traceback.print_exc()
    raise SystemExit(f"Stopping execution: Error creating final split: {e}")
    


## 4. Model Selection Process (Aligned with Original)

### 4a. Define Candidate Models & Hyperparameters (Copied from Original)

In [None]:
# Candidate classifiers, keyed by display name
# (copied EXACTLY from original notebook cell e53d98b98f3844519b3f24ef04220d6c).
# Entries are inserted in the same order as the original literal, because
# later cells iterate this dict in insertion order.
classifiers_dictionary = {}
classifiers_dictionary['Logistic Regression'] = sklearn.linear_model.LogisticRegression(
    solver='liblinear',
    random_state=0,
)
classifiers_dictionary['Decision Tree'] = sklearn.tree.DecisionTreeClassifier(
    random_state=0,
)
classifiers_dictionary['Random Forest'] = sklearn.ensemble.RandomForestClassifier(
    random_state=0,
    n_jobs=1,  # Original used n_jobs=1
)
classifiers_dictionary['XGBoost'] = xgboost.XGBClassifier(
    random_state=0,
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=1,  # Original used n_jobs=1
)

# Hyperparameter grids, keyed by the same display names as the classifiers
# (copied EXACTLY from original notebook cell e53d98b98f3844519b3f24ef04220d6c).
# Every key carries the 'clf__' prefix. Single-value entries (random_state,
# n_jobs, verbosity, ...) pin a setting rather than tune it.
# Note the correction made in the original cell for XGBoost verbosity was `verbosity=0` (no comma)
parameters_dictionary = {}

parameters_dictionary['Logistic Regression'] = dict(
    clf__C=[0.1, 1, 10, 100],
    clf__random_state=[0],  # Keep for reproducibility if needed by solver
)

parameters_dictionary['Decision Tree'] = dict(
    clf__max_depth=list(range(2, 11)) + [20, 50],  # 2..10, then 20 and 50
    clf__random_state=[0],
)

parameters_dictionary['Random Forest'] = dict(
    clf__n_estimators=[25, 50, 100],
    clf__max_depth=[5, 10, 20, 50],
    clf__random_state=[0],
    clf__n_jobs=[1],  # Explicitly set as in original
)

parameters_dictionary['XGBoost'] = dict(
    clf__max_depth=[3, 6, 9],
    clf__n_estimators=[25, 50, 100],
    clf__learning_rate=[0.1, 0.3],
    clf__random_state=[0],
    clf__n_jobs=[1],                  # Explicitly set as in original
    clf__verbosity=[0],               # Original definition
    clf__use_label_encoder=[False],   # Explicitly set as in original definition
    clf__eval_metric=['logloss'],     # Explicitly set as in original definition
)

# Grid entries that pin a setting rather than tune it; hidden from the
# console summary below (they are still written to the JSON artifact).
_FIXED_GRID_KEYS = (
    'clf__random_state',
    'clf__n_jobs',
    'clf__verbosity',
    'clf__use_label_encoder',
    'clf__eval_metric',
)

print("Parameter Grids (Aligned with Original Notebook):")
for model, params in parameters_dictionary.items():
    # Print relevant tuning parameters only for clarity
    print_params = {}
    for k, v in params.items():
        if k not in _FIXED_GRID_KEYS:
            print_params[k] = v
    print(f"  {model}: {print_params}")


def make_serializable(val):
    """Return *val* with every None (also inside nested lists) replaced by the string 'None'.

    The grids currently hold only basic types; this keeps the JSON dump
    readable should a None ever be added to a grid.
    """
    if val is None:
        return 'None'
    if isinstance(val, list):
        return [make_serializable(item) for item in val]
    # Add other types if needed (e.g., np.nan)
    return val


# Log the parameter grids as a JSON artifact for reference.
# Best-effort: a failure here only produces a warning.
params_log_path = "parameter_grids_aligned.json"
try:
    serializable_params = {
        grid_name: {key: make_serializable(values) for key, values in grid.items()}
        for grid_name, grid in parameters_dictionary.items()
    }
    with open(params_log_path, 'w') as f:
        json.dump(serializable_params, f, indent=4)
    mlflow.log_artifact(params_log_path)
    print(f"\nAligned parameter grids logged to MLflow artifact: {params_log_path}")
except Exception as e:
    print(f"Warning: Could not log parameter grids artifact: {e}")
        


### 4b & 4c. Define Validation Strategy (Prequential) & Metrics (Aligned with Original)

In [None]:
# Prequential split parameters (N_FOLDS, DELTA_TRAIN, etc.) are defined and logged above
#
# This cell builds:
#   - transactions_df_scorer: column subset handed to the custom scorer
#   - card_precision_top_k_scorer: Card Precision@k scorer (None if unavailable)
#   - scoring / performance_metrics_list_grid / performance_metrics_list:
#     metric keys for GridSearchCV and their display names
#   - primary_metric / primary_metric_grid_key: metric used for model selection
import inspect  # used only to pick the make_scorer keyword the installed sklearn supports

# Create the scorer dataframe subset _only if_ data was loaded
transactions_df_scorer = pd.DataFrame()  # Initialize
if transactions_df.empty:
    print("Error: transactions_df is empty, cannot create scorer helper DataFrame.")
    mlflow.set_tag("Scorer Setup", "Failed - Empty Input Data")
    # Cannot proceed without scorer df if CP@k is used
    raise SystemExit("Stopping: Cannot create scorer dataframe.")

# Only keep columns needed as argument to the custom scoring function
# (to reduce the serialization time of transaction dataset in GridSearchCV)
scorer_cols = ['CUSTOMER_ID', 'TX_FRAUD', 'TX_TIME_DAYS']
missing_scorer_cols = [col for col in scorer_cols if col not in transactions_df.columns]
if missing_scorer_cols:
    # Not fatal: the run continues without the CP@k metric (a warning is printed below).
    print(f"ERROR: Missing columns required for scorer DF: {missing_scorer_cols}")
    mlflow.set_tag("Scorer Setup", f"Failed - Missing columns: {missing_scorer_cols}")
else:
    transactions_df_scorer = transactions_df[scorer_cols].copy()
    print(f"Created scorer helper DataFrame with shape: {transactions_df_scorer.shape}")

# Define the custom scorer for Card Precision@k using the function from original notebook
card_precision_top_k_scorer = None
if not transactions_df_scorer.empty:
    try:
        # Newer scikit-learn selects the estimator output via `response_method`
        # and deprecates `needs_proba`. A version that no longer declares
        # `needs_proba` would forward it to the score function as an ordinary
        # kwarg, so the keyword is chosen from the installed signature.
        if "response_method" in inspect.signature(sklearn.metrics.make_scorer).parameters:
            _proba_kwargs = {"response_method": "predict_proba"}
        else:
            _proba_kwargs = {"needs_proba": True}
        # card_precision_top_k_custom is defined in the shared functions cell;
        # top_k and transactions_df are forwarded to it as keyword arguments.
        card_precision_top_k_scorer = sklearn.metrics.make_scorer(card_precision_top_k_custom,
                                                                  top_k=TOP_K_VALUE,
                                                                  transactions_df=transactions_df_scorer,
                                                                  **_proba_kwargs)
        print(f"Custom scorer 'card_precision@{TOP_K_VALUE}' created successfully.")
        mlflow.set_tag("Scorer Setup", "Success")
    except Exception as e:
        print(f"ERROR creating custom scorer: {e}")
        mlflow.set_tag("Scorer Setup", f"Failed - Scorer Creation Error: {e}")
        # Optionally raise an error if this scorer is essential
        # raise SystemExit("Stopping: Failed to create CP@k scorer.")

# Define the scoring dictionary for GridSearchCV (Aligned EXACTLY with original cell 515762f3dad6410da1375f633164bd06)
scoring = {'roc_auc': 'roc_auc',
           'average_precision': 'average_precision'}

# List versions for accessing results later and for logging (Aligned EXACTLY with original cell 515762f3dad6410da1375f633164bd06)
performance_metrics_list = ['AUC ROC', 'Average precision']  # Display names corresponding to keys

# Only add the custom scorer (and its display name) if it was created successfully
if card_precision_top_k_scorer is not None:
    scoring[f'card_precision@{TOP_K_VALUE}'] = card_precision_top_k_scorer
    performance_metrics_list.append(f'Card Precision@{TOP_K_VALUE}')
else:
    print(f"Warning: Custom scorer 'card_precision@{TOP_K_VALUE}' was not added to the scoring dict. Check for creation errors.")
    # You might want to stop if this metric is critical
    # raise SystemExit("Stopping: CP@k scorer is missing but required.")

performance_metrics_list_grid = list(scoring.keys())  # Keys used in GridSearchCV

# Define primary metric for model selection (Aligned with original choice in cell 515762f3dad6410da1375f633164bd06)
primary_metric = 'Average precision'  # Original notebook used Average Precision
primary_metric_grid_key = 'average_precision'  # Key used in scoring dict

print(f"Validation Strategy: Prequential with {N_FOLDS} folds")
print(f"Performance Metrics for GridSearch: {performance_metrics_list_grid}")
print(f"Performance Metrics for Reporting: {performance_metrics_list}")
print(f"Primary Metric for Selection: {primary_metric}")

# Log scoring setup
mlflow.log_param("scoring_metrics_grid", json.dumps(performance_metrics_list_grid))
mlflow.log_param("scoring_metrics_report", json.dumps(performance_metrics_list))
mlflow.log_param("primary_selection_metric", primary_metric)



### 4d. Train & Assess Candidates via Prequential Validation (Aligned with Original)

*(MLOps Note: This section iterates through each classifier type. For each, it runs the `model_selection_wrapper` (copied from original) which performs `prequential_grid_search` (copied from original) for both validation and test estimation periods using the original dates and parameters. Results are stored and logged to MLflow within nested runs.)*

In [None]:
# Cell ID: Z_hNgWoIDSZI_aligned
# Section 4d: Train & Assess Candidates via Prequential Validation (Aligned with Original)
#
# For every candidate classifier this cell opens a nested MLflow run, calls
# model_selection_wrapper (copied from the original notebook), and stores the
# returned performance table in performances_df_dictionary. The outcome of
# each search is recorded in the nested run's "Grid Search Status" tag.

performances_df_dictionary = {}
model_selection_times = {}
total_selection_start_time = time.time()


def make_serializable(val):
    """Return *val* with every None (also inside nested lists) replaced by the string 'None'."""
    if isinstance(val, list):
        return [make_serializable(item) for item in val]
    if val is None:
        return 'None'
    return val


def params_to_str_fallback(params):
    """Build a 'name=value, ...' summary from a parameter dict.

    Each key is cut at its first '__' and the second piece is used as the
    name. Anything that is not a dict, or that cannot be formatted this way,
    is returned as str(params).
    """
    try:
        # Ensure params is a dict before processing
        if isinstance(params, dict):
            items = [f"{k.split('__')[1]}={v}" for k, v in sorted(params.items())]
            return ", ".join(items)
        return str(params)
    except Exception:  # Catch potential errors during string formatting
        return str(params)


if transactions_df.empty or not scoring:
    print("ERROR: Skipping model selection as transactions_df is empty or scoring dictionary is not defined.")
    mlflow.set_tag("Model Selection Status", "Skipped - No Data or Scoring")
    # Stop execution if this step is critical
    raise SystemExit("Stopping: Cannot perform model selection.")

for classifier_name in classifiers_dictionary:
    print(f"\n===== Running Model Selection for: {classifier_name} ====")

    # Start a nested run for this classifier's grid search
    with mlflow.start_run(run_name=f"GridSearch_{classifier_name}_Aligned", nested=True) as nested_run:
        nested_run_id = nested_run.info.run_id
        print(f"  Starting Nested MLflow Run ID: {nested_run_id}")
        mlflow.set_tag("Classifier", classifier_name)
        # Log the specific parameter grid being searched (best-effort)
        try:
            loggable_params = {k: make_serializable(v) for k, v in parameters_dictionary[classifier_name].items()}
            mlflow.log_param("parameter_grid", json.dumps(loggable_params))
        except Exception as log_e:
            print(f"  Warning: Could not log parameter grid for {classifier_name}: {log_e}")

        classifier = classifiers_dictionary[classifier_name]
        parameters = parameters_dictionary[classifier_name]

        start_time_clf = time.time()

        try:
            # Call the wrapper function COPIED from the original notebook
            # Ensure model_selection_wrapper uses the corrected prequential_grid_search
            performances_df = model_selection_wrapper(transactions_df, classifier,
                                                      INPUT_FEATURES, OUTPUT_FEATURE,
                                                      parameters, scoring,  # Use the EXACT grids/scoring from original
                                                      start_date_training_for_valid,  # Use the EXACT start date from original
                                                      start_date_training_for_test_estimation,  # Use the EXACT start date from original
                                                      n_folds=N_FOLDS,
                                                      delta_train=DELTA_TRAIN,
                                                      delta_delay=DELTA_DELAY,
                                                      delta_assessment=DELTA_ASSESSMENT,
                                                      performance_metrics_list_grid=performance_metrics_list_grid,
                                                      performance_metrics_list=performance_metrics_list,
                                                      n_jobs=5  # Use multiple cores; original used 5
                                                      )

            clf_execution_time = time.time() - start_time_clf
            model_selection_times[classifier_name] = clf_execution_time
            print(f"  Finished {classifier_name} selection in {clf_execution_time:.2f} seconds")
            mlflow.log_metric("model_selection_time_sec", clf_execution_time)

            if performances_df.empty:
                print(f"  Warning: No performance results generated for {classifier_name} by model_selection_wrapper.")
                mlflow.set_tag("Grid Search Status", "Completed - No Results")
            else:
                # Every branch below sets "Grid Search Status", so the parent
                # run never sees a finished search without a status.
                if 'Parameters summary' in performances_df.columns:
                    mlflow.set_tag("Grid Search Status", "Completed - Success")
                else:
                    print("  Warning: 'Parameters summary' column missing after model_selection_wrapper. Cannot log detailed results correctly.")
                    if 'Parameters' in performances_df.columns:
                        # Rebuild the summary column from the raw parameter dicts
                        performances_df['Parameters summary'] = performances_df['Parameters'].apply(params_to_str_fallback)
                        print("  Created fallback 'Parameters summary' column.")
                        mlflow.set_tag("Grid Search Status", "Completed - Fallback Param Summary")
                    else:
                        mlflow.set_tag("Grid Search Status", "Completed - Missing Param Summary")

                # Log detailed results per hyperparameter combination as artifact
                perf_artifact_path = f"{classifier_name}_grid_search_results_aligned.csv"
                try:
                    performances_df.to_csv(perf_artifact_path, index=False)
                    mlflow.log_artifact(perf_artifact_path)
                    print(f"  Logged detailed grid search results to {perf_artifact_path}")
                except Exception as art_e:
                    print(f"  Warning: Failed to log artifact {perf_artifact_path}: {art_e}")

            # Store the (possibly empty, possibly augmented) results table
            performances_df_dictionary[classifier_name] = performances_df

        except SystemExit:
            print(f"SystemExit occurred during model selection for {classifier_name}. Stopping.")
            raise  # Re-raise to stop the notebook
        except Exception as e:
            # One failing classifier must not abort the remaining searches
            print(f"ERROR during model selection wrapper for {classifier_name}: {e}")
            import traceback
            traceback.print_exc()
            mlflow.set_tag("Grid Search Status", f"Failed - Exception: {e}")
            # Store empty df to avoid errors later
            performances_df_dictionary[classifier_name] = pd.DataFrame()

# Cell ID: Z_hNgWoIDSZI_aligned (End of the cell)
#
# Roll the per-classifier "Grid Search Status" tags of the nested runs up
# into a single "Model Selection Status" tag on the parent run.

# Measure once so the printed value and the logged metric agree
total_selection_time = time.time() - total_selection_start_time
print(f"\nTotal Model Selection Time: {total_selection_time:.2f} seconds")
mlflow.log_metric("total_model_selection_time_sec", total_selection_time)

# Check if any grid search failed before marking as completed
print("Checking status of nested grid search runs...")
all_statuses = []
nested_runs = None  # None: search failed; []: search succeeded but found no child runs
try:
    # Fetch ALL runs under the parent first
    nested_runs = mlflow.search_runs(
        experiment_ids=[mlflow.active_run().info.experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{run_id}'",  # Only filter by parent ID
        output_format="list"
    )

    # Filter in Python for the correct run names and get their status
    all_statuses = [
        run.data.tags.get("Grid Search Status", "Unknown")
        for run in nested_runs
        if run.data.tags.get("mlflow.runName", "").startswith("GridSearch_")  # Filter names here
    ]
    print(f"Found {len(all_statuses)} relevant nested runs with statuses: {all_statuses}")

except Exception as search_e:
    print(f"ERROR searching for nested runs: {search_e}")
    # Mark parent status as unknown or failed due to inability to check children
    mlflow.set_tag("Model Selection Status", "Unknown - Failed to Check Nested Runs")

# Set parent status based on children (only if statuses were retrieved)
if all_statuses:
    if any("Failed" in status for status in all_statuses):
        mlflow.set_tag("Model Selection Status", "Completed with Failures")
    elif all(status == "Completed - Success" for status in all_statuses):
        mlflow.set_tag("Model Selection Status", "Completed Successfully")
    else:
        # Anything else ("Completed - No Results", a missing or fallback
        # parameter summary, "Unknown") is surfaced as a warning, not a success.
        mlflow.set_tag("Model Selection Status", "Completed with Issues/Warnings")
elif nested_runs is not None and not nested_runs:  # Search succeeded but found no runs
    mlflow.set_tag("Model Selection Status", "Completed - No Nested Runs Found")
# Else: Status already set if search failed


### 4d.1 Review Execution Times

In [None]:
# Tabulate the per-model selection times, show the table, and attach it to
# the MLflow run as a CSV artifact (best-effort).
if not model_selection_times:
    print("No execution times recorded (model selection likely skipped or failed).")
else:
    exec_times_df = (
        pd.DataFrame.from_dict(
            model_selection_times,
            orient='index',
            columns=['Total Selection Time (s)'],
        )
        .reset_index()
        .rename(columns={'index': 'Model'})  # former index holds the model names
    )
    print("\nModel Selection Execution Times:")
    display(exec_times_df)

    # Log this summary table as an artifact
    exec_times_path = "model_selection_times_aligned.csv"
    try:
        exec_times_df.to_csv(exec_times_path, index=False)
        mlflow.log_artifact(exec_times_path)
        print(f"Execution times summary logged to {exec_times_path}")
    except Exception as e:
        print(f"Warning: Failed to log execution times artifact: {e}")

### 4d.2 Review Performance Summaries and Plots (Aligned with Original)

*(MLOps Note: Generate summaries and plots using functions copied from original. Plots are logged to MLflow.)*

In [None]:
# For each classifier with results: build its summary table, then draw and
# log the performance and execution-time plots. Summaries are collected in
# all_summaries (model name -> summary) and finally dumped to one JSON artifact.
all_summaries = {}
for model_name, perf_df in performances_df_dictionary.items():
    print(f"\n===== Summary for {model_name} (Aligned) ====")

    if perf_df.empty:
        print(f"  No performance data available for {model_name} to display or plot.")
        continue

    # The summary and plot helpers are all keyed on this column
    if 'Parameters summary' not in perf_df.columns:
        print(f"  Error: 'Parameters summary' column missing for {model_name}. Cannot generate summary or plots.")
        continue

    # get_summary_performances is the function COPIED from the original notebook
    summary = get_summary_performances(perf_df, parameter_column_name="Parameters summary")
    display(summary)
    all_summaries[model_name] = summary

    # --- Performance curves (function COPIED from original) ---
    print("\n  Performance Plots:")
    fig_perf = get_performances_plots(
        perf_df,
        performance_metrics_list=performance_metrics_list,  # Use the aligned list
        parameter_name="Parameters summary",  # Use the correct column name
        summary_performances=summary,  # Pass summary for vline
    )
    if not fig_perf:
        print(f"  Could not generate performance plots for {model_name}.")
    else:
        try:
            plt.show()  # Show plot in notebook
            mlflow.log_figure(fig_perf, f"plots/{model_name}_performance_curves_aligned.png")
            plt.close(fig_perf)  # Close figure to free memory
            print(f"  Logged performance plots for {model_name}.")
        except Exception as e:
            print(f"  Warning: Failed to log performance plot for {model_name}: {e}")

    # --- Execution-time plot (function COPIED from original) ---
    fig_time = get_execution_times_plot(
        perf_df,
        title=f"{model_name} Mean Fit Time (Aligned)",
        parameter_name="Parameters summary",  # Use correct column name
    )
    if not fig_time:
        print(f"  Could not generate execution time plot for {model_name}.")
    else:
        try:
            plt.show()
            mlflow.log_figure(fig_time, f"plots/{model_name}_execution_time_aligned.png")
            plt.close(fig_time)
            print(f"  Logged execution time plot for {model_name}.")
        except Exception as e:
            print(f"  Warning: Failed to log execution time plot for {model_name}: {e}")

# Log all summaries combined as an artifact (optional, best-effort)
if all_summaries:
    combined_summary_path = "all_model_summaries_aligned.json"
    try:
        # Convert each summary to a plain dict for JSON serialization
        serializable_summaries = {}
        for summary_name, summary_table in all_summaries.items():
            serializable_summaries[summary_name] = summary_table.to_dict()
        with open(combined_summary_path, 'w') as f:
            json.dump(serializable_summaries, f, indent=4)
        mlflow.log_artifact(combined_summary_path)
        print(f"\nCombined model summaries logged to {combined_summary_path}")
    except Exception as e:
        print(f"Warning: Failed to log combined summaries artifact: {e}")

### 4d.3 Compare Model Performances (Aligned with Original)

*(MLOps Note: Generate comparison plot using function copied from original and log it.)*

### 4e. Select Best Model and Hyperparameters (Aligned with Original)

*(MLOps Note: Select the best model based on the primary validation metric (`Average precision`) using summaries generated by the original function. Log the chosen model name, its parameters, and the corresponding validation score.)*

In [None]:
# Select the winning model from the per-model validation summaries.
#
# Results are left in four notebook-level variables that the final-training
# cell reads:
#   best_model_name               - name of the selected model
#   best_model_params_dict        - parameter dictionary for the selected model
#   best_model_params_summary_str - string summary of those parameters
#   best_validation_score         - validation value of the primary metric
#
# If no model ends up with both a score and a usable parameter dictionary,
# the cell falls back to XGBoost (validated parameters if they can be found,
# otherwise the entry in parameters_dictionary).
best_model_name = None
best_model_params_dict = None  # The actual dictionary for set_params
best_model_params_summary_str = None  # The string summary for reference
best_validation_score = -np.inf

# Use the primary metric defined earlier (aligned with original)
print(f"\n===== Selecting Best Model based on Validation {primary_metric} (Aligned) ====")

if not all_summaries:
    print("ERROR: Cannot select best model: No summary data available from get_summary_performances.")
    mlflow.set_tag("Best Model Selection Status", "Failed - No Summaries")
else:
    for model_name, summary_df in all_summaries.items():
        # Skip summaries that do not report the primary metric at all.
        if primary_metric not in summary_df.columns:
            print(f"  {model_name}: Primary metric '{primary_metric}' not found in summary columns.")
            continue

        # Extract the validation score and parameter string from the summary
        validation_perf_str = summary_df.loc['Validation performance', primary_metric]
        best_param_summary_str_candidate = summary_df.loc['Best estimated parameters', primary_metric]

        # Only string scores other than the 'N/A' / 'NaN' placeholders are comparable.
        if not isinstance(validation_perf_str, str) or validation_perf_str in ['N/A', 'NaN']:
            print(f"  {model_name}: Validation {primary_metric} = {validation_perf_str} (Skipped for best model comparison)")
            continue

        try:
            # The score is the part before '+/-'; anything after it is ignored.
            current_validation_score = float(validation_perf_str.split('+/-')[0])
            print(f"  {model_name}: Validation {primary_metric} = {current_validation_score:.4f} (Params Summary: {best_param_summary_str_candidate})")

            # Compare with the current best score
            if current_validation_score > best_validation_score:
                best_validation_score = current_validation_score
                best_model_name = model_name
                best_model_params_summary_str = best_param_summary_str_candidate

                # Find the full parameter dict corresponding to the best summary string.
                # A failed lookup resets the dict to None, which routes the cell into
                # the XGBoost fallback below even though a best score was recorded.
                perf_df = performances_df_dictionary.get(model_name)
                if perf_df is not None and 'Parameters summary' in perf_df.columns and 'Parameters' in perf_df.columns:
                    best_row = perf_df[perf_df['Parameters summary'] == best_param_summary_str_candidate]
                    if not best_row.empty:
                        best_model_params_dict = best_row['Parameters'].iloc[0]
                        if not isinstance(best_model_params_dict, dict):
                            print(f"  Warning: Retrieved best parameters for {model_name} is not a dictionary: {best_model_params_dict}")
                            best_model_params_dict = None  # Reset if not dict
                    else:
                        print(f"  Warning: Could not find parameter dictionary for best summary string '{best_param_summary_str_candidate}' in {model_name}'s performance df.")
                        best_model_params_dict = None  # Reset if dict not found
                else:
                    print(f"  Warning: Could not retrieve performance df or required columns for {model_name} to find best params dict.")
                    best_model_params_dict = None  # Reset if cannot find dict

        except ValueError as ve:
            print(f"  {model_name}: Could not parse validation score '{validation_perf_str}': {ve}")
        except Exception as e:
            # Best effort: one broken summary must not abort the whole selection.
            print(f"  Error processing {model_name} during selection: {e}")

    # Log results of selection
    if best_model_name and best_model_params_dict is not None:
        print(f"\nSelected Best Model: {best_model_name}")
        print(f"Best Validation {primary_metric}: {best_validation_score:.4f}")
        print(f"Best Hyperparameters Dict: {best_model_params_dict}")
        print(f"Best Hyperparameters Summary Str: {best_model_params_summary_str}")
        mlflow.set_tag("best_model_name", best_model_name)
        mlflow.log_metric(f"best_validation_{primary_metric_grid_key}", best_validation_score)
        # Log the best parameters (remove 'clf__' prefix for clarity in MLflow UI)
        try:
            final_params_to_log = {k.split('__', 1)[1]: make_serializable(v) for k, v in best_model_params_dict.items()}
            mlflow.log_params(final_params_to_log)
            mlflow.set_tag("Best Model Selection Status", "Success")
        except Exception as log_e:
            print(f"Warning: Could not log best parameters: {log_e}")
            mlflow.set_tag("Best Model Selection Status", "Success - Param Logging Failed")

    else:
        # Fallback logic similar to original if no clear winner
        print(f"\nCould not determine the best model based on validation {primary_metric}.")
        # Original defaulted to XGBoost
        print("Defaulting to XGBoost with its best validated parameters (if available) or library defaults.")
        mlflow.set_tag("Best Model Selection Status", "Failed - Defaulting to XGBoost")
        best_model_name = 'XGBoost'
        best_model_params_dict = None  # Reset
        best_model_params_summary_str = 'Default Fallback'

        # Try to get XGBoost's best params from its summary
        if 'XGBoost' in all_summaries and primary_metric in all_summaries['XGBoost'].columns:
            xgb_summary = all_summaries['XGBoost']
            best_param_summary_str_xgb = xgb_summary.loc['Best estimated parameters', primary_metric]
            xgb_perf_df = performances_df_dictionary.get('XGBoost')
            if xgb_perf_df is not None and 'Parameters summary' in xgb_perf_df.columns and 'Parameters' in xgb_perf_df.columns:
                best_row_xgb = xgb_perf_df[xgb_perf_df['Parameters summary'] == best_param_summary_str_xgb]
                if not best_row_xgb.empty:
                    # Apply the same type check as the primary path: the training
                    # cell calls .items() on this value and would crash on a non-dict.
                    candidate_xgb_params = best_row_xgb['Parameters'].iloc[0]
                    if isinstance(candidate_xgb_params, dict):
                        best_model_params_dict = candidate_xgb_params
                        best_model_params_summary_str = best_param_summary_str_xgb
                        print(f"Using best validated XGBoost params: {best_model_params_summary_str}")
                    else:
                        print(f"  Warning: Retrieved best parameters for XGBoost is not a dictionary: {candidate_xgb_params}")

        # If still no params, use the library defaults defined earlier
        if best_model_params_dict is None:
            print("Warning: Could not find best validated XGBoost params. Using library defaults defined in parameters_dictionary.")
            best_model_params_dict = parameters_dictionary['XGBoost']
            # Create summary string for defaults
            try:
                best_model_params_summary_str = ", ".join([f"{k.split('__')[1]}={v}" for k, v in sorted(best_model_params_dict.items())])
            except Exception:
                # Keys without a '__' separator (or a non-dict value) end up here.
                # Catch Exception rather than everything so Ctrl-C / SystemExit
                # still propagate.
                best_model_params_summary_str = str(best_model_params_dict)

        # Log the fallback choice
        mlflow.set_tag("best_model_name", best_model_name)
        if best_model_params_dict:
            try:
                final_params_to_log = {k.split('__', 1)[1]: make_serializable(v) for k, v in best_model_params_dict.items()}
                mlflow.log_params(final_params_to_log)
            except Exception as log_e:
                print(f"Warning: Could not log default XGBoost parameters: {log_e}")



## 5. Train Final Model (Aligned with Original)

Train the selected model (`best_model_name`) with its `best_model_params_dict` on the full final training set (`train_df_final`) using the **exact dates and data split** from the original notebook.

*(MLOps Note: The fitted pipeline (including scaler and classifier with best parameters) will be logged to MLflow and registered in Azure ML.)*

In [None]:
# Cell ID: l7jnHHsmDSZL_aligned
# Section 5: Train Final Model (Aligned with Original)
#
# Builds a StandardScaler + classifier pipeline for the selected model, fits it
# on train_df_final, then logs and registers it through MLflow.
# Every failure path in this cell ends in SystemExit; on success the fitted
# pipeline is left in `final_pipeline` for the evaluation and save cells.

final_pipeline = None # Initialize

if best_model_name and best_model_params_dict is not None:
    print(f"\n===== Training Final Model: {best_model_name} (Aligned) ====")

    if train_df_final.empty:
        print("ERROR: Final training data (train_df_final) is empty. Cannot train final model.")
        mlflow.set_tag("Final Model Training Status", "Failed - Empty Train Data")
        # Stop execution
        raise SystemExit("Stopping: Cannot train final model on empty data.")
    else:
        # 1. Get the base classifier instance - Clone to avoid modifying the dictionary
        try:
             final_classifier_base = sklearn.base.clone(classifiers_dictionary[best_model_name])
        except Exception as clone_e:
             print(f"ERROR: Could not clone classifier '{best_model_name}': {clone_e}")
             raise SystemExit("Stopping: Classifier cloning failed.")

        # 2. Prepare the final parameters (remove 'clf__' prefix)
        # Keys that do not start with 'clf__' are dropped here.
        final_params = {k.split('__', 1)[1]: v for k, v in best_model_params_dict.items() if k.startswith('clf__')}

        # 3. Set the parameters on the classifier instance, filtering invalid ones
        try:
            valid_params_keys = final_classifier_base.get_params().keys()
            # Parameters the classifier does not expose via get_params() are silently discarded.
            final_params_filtered = {k: v for k, v in final_params.items() if k in valid_params_keys}
            # Handle potential None strings coming from JSON logging/retrieval if applicable
            # Only values of existing keys are reassigned, so the dict keeps its size while iterated.
            for k, v in final_params_filtered.items():
                if isinstance(v, str) and v.lower() == 'none':
                    final_params_filtered[k] = None
            print(f"  Using parameters: {final_params_filtered}")
            final_classifier_base.set_params(**final_params_filtered)
            mlflow.set_tag("Final Model Params Used", "Best Validated")
        except Exception as e:
            print(f"  Warning: Error setting parameters for {best_model_name}: {e}")
            print("  Proceeding with default parameters for the final model.")
            # Reset to default if params cause error
            # A fresh clone is taken so a partially applied set_params cannot leak into the final model.
            final_classifier_base = sklearn.base.clone(classifiers_dictionary[best_model_name])
            mlflow.set_tag("Final Model Params Used", "Defaults due to error")

        # 4. Create the final pipeline (Scaler + Classifier)
        final_pipeline = sklearn.pipeline.Pipeline([
            ('scaler', sklearn.preprocessing.StandardScaler()),
            ('clf', final_classifier_base)
        ])

        # 5. Fit the final pipeline on the final training data
        print(f"  Fitting final model on training data shape: {train_df_final.shape}")
        # The timer starts before imputation, so the logged training time includes the imputation step.
        start_fit_time = time.time()

        # Handle potential NaNs in the final training data BEFORE fitting
        # **ALIGNED WITH ORIGINAL NOTEBOOK's Step 3 (cell l7jnHHsmDSZL): Impute with mean**
        train_df_final_imputed = train_df_final.copy()

        # *** CORRECTED NaN CHECK HERE ***
        # NOTE(review): the imputer lives outside final_pipeline and is not persisted
        # (the joblib.dump below is commented out), so the logged model itself performs no imputation.
        if train_df_final_imputed[INPUT_FEATURES].isnull().values.any():
            print("  Warning: NaNs detected in final training data. Imputing with mean (matching original Step 3 logic).")
            # Fit imputer on training data only
            imputer_final_train = sklearn.impute.SimpleImputer(strategy='mean')
            train_df_final_imputed[INPUT_FEATURES] = imputer_final_train.fit_transform(train_df_final_imputed[INPUT_FEATURES])
            mlflow.set_tag("Final Train Imputation", "Mean Imputed")
            # Save the imputer if needed for consistent test imputation later
            # joblib.dump(imputer_final_train, 'final_train_imputer.joblib')
        else:
             mlflow.set_tag("Final Train Imputation", "Not Required")
        # *** END CORRECTION ***

        # This single try covers both fitting and MLflow logging/registration: a failure in
        # either one lands in the same handler, which marks training as failed,
        # discards the pipeline and stops the notebook.
        try:
            final_pipeline.fit(train_df_final_imputed[INPUT_FEATURES], train_df_final_imputed[OUTPUT_FEATURE])
            final_fit_time = time.time() - start_fit_time
            print(f"  Final model fitting completed in {final_fit_time:.2f} seconds.")
            mlflow.log_metric("final_model_train_time_sec", final_fit_time)
            mlflow.set_tag("Final Model Training Status", "Success")

            # 6. Log the fitted pipeline (model) to MLflow & Register in AML
            print(f"  Logging final pipeline to MLflow and registering as '{MODEL_NAME_AML}'...")
            # Define conda environment for the model
            # Versions are read from the libraries loaded in this kernel; the hard-coded
            # values below are used only when a __version__ attribute is missing.
            # NOTE(review): only AttributeError is handled here. If `xgboost` is not imported,
            # the NameError falls through to the outer `except Exception` and aborts training.
            try:
                sklearn_version = sklearn.__version__
                xgboost_version = xgboost.__version__
                pandas_version = pd.__version__
                numpy_version = np.__version__
                mlflow_version = mlflow.__version__
            except AttributeError:
                 print("Warning: Could not get all library versions for conda env.")
                 sklearn_version = "1.1.3" 
                 xgboost_version = "1.7.5" 
                 pandas_version = "1.5.3" 
                 numpy_version = "1.23.5"
                 mlflow_version = "2.9.2" 

            # The Python entry pins the exact major.minor.micro version of the running interpreter.
            conda_env = {
                'channels': ['defaults', 'conda-forge'],
                'dependencies': [
                    f'python={sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}',
                    'pip',
                    {
                        'pip': [
                            f'mlflow=={mlflow_version}',
                            f'scikit-learn=={sklearn_version}',
                            f'xgboost=={xgboost_version}',
                            f'pandas=={pandas_version}',
                            f'numpy=={numpy_version}',
                            'cloudpickle' 
                        ],
                    },
                ],
                'name': 'mlflow-env'
            }

            # Log the model
            mlflow.sklearn.log_model(
                sk_model=final_pipeline,
                artifact_path="model", # Standard path within the run artifacts
                conda_env=conda_env,
                registered_model_name=MODEL_NAME_AML # Register in AML Model Registry
            )
            print(f"  Model logged and registered as '{MODEL_NAME_AML}' in Azure ML.")
            mlflow.set_tag("Final Model Logging Status", "Success")

        # SystemExit does not derive from Exception, so the generic handler below would not
        # catch it anyway; this clause only prints a message before re-raising.
        except SystemExit: # Catch potential exits from within fit/log
             print("SystemExit during final model training/logging.")
             raise
        except Exception as e:
            print(f"  ERROR fitting or logging the final model: {e}")
            import traceback
            traceback.print_exc()
            mlflow.set_tag("Final Model Training Status", f"Failed - Exception: {e}")
            mlflow.set_tag("Final Model Logging Status", "Failed")
            final_pipeline = None # Ensure pipeline is None if fitting failed
            raise SystemExit("Stopping: Final model training failed.")

else:
    # Reached when the selection cell produced no model name or no parameter dictionary.
    print("\nERROR: Skipping final model training as no best model was selected or parameters are missing.")
    mlflow.set_tag("Final Model Training Status", "Skipped - No Best Model Selected")
    raise SystemExit("Stopping: Cannot proceed without a selected model.")




## 6. Evaluate Final Model (Aligned with Original)

Evaluate the performance of the final trained model on the unseen test set (`test_df_final`) using the **exact data split and evaluation functions** from the original notebook.

*(MLOps Note: Log the evaluation metrics calculated on the test set to MLflow.)*

In [None]:


# Section 6: score the fitted final pipeline on test_df_final and log the resulting
# metrics and a CSV summary to MLflow.
# Policy: a missing pipeline or any prediction/evaluation error stops the notebook
# via SystemExit; an empty test set is only tagged and the cell ends normally.
print(f"\n===== Evaluating Final Model on Unseen Test Set (Aligned) ====")

if final_pipeline is None:
    print("ERROR: Skipping evaluation: Final pipeline was not trained successfully.")
    mlflow.set_tag("Final Model Evaluation Status", "Skipped - No Trained Pipeline")
    # If training failed, we likely should stop
    raise SystemExit("Stopping: Cannot evaluate as final pipeline is not available.")

elif test_df_final.empty:
    print("Warning: Skipping evaluation: Final test set (test_df_final) is empty.")
    mlflow.set_tag("Final Model Evaluation Status", "Skipped - Empty Test Set")
    # Depending on requirements, an empty test set might be ok or an error.
    # For now, we allow proceeding but flag it.

else:
    print(f"  Evaluating on test data shape: {test_df_final.shape}")
    # The timer starts before imputation, so the logged prediction time includes the imputation step.
    start_pred_time = time.time()

    # Handle potential NaNs in the final test data BEFORE predicting
    # **ALIGNED WITH ORIGINAL NOTEBOOK's Step 3 logic (cell l7jnHHsmDSZL) for imputation, applied to TEST set**
    # Ideally, use the imputer fitted on the training data. For strict alignment with original's simpler logic, refit on test.
    test_df_final_imputed = test_df_final.copy()

    # *** CORRECTED NaN CHECK HERE ***
    # NOTE(review): the fill values are column means computed from the test set itself,
    # not from the training data; the tag value "(Test Fit)" records that choice.
    if test_df_final_imputed[INPUT_FEATURES].isnull().values.any():
        print("  Warning: NaNs detected in final test data. Imputing with mean (matching original Step 3 logic applied to test).")
        # Refit imputer on test data - less ideal but matches original implicit handling
        imputer_final_test = sklearn.impute.SimpleImputer(strategy='mean')
        test_df_final_imputed[INPUT_FEATURES] = imputer_final_test.fit_transform(test_df_final_imputed[INPUT_FEATURES])
        mlflow.set_tag("Final Test Imputation", "Mean Imputed (Test Fit)")
        # If final_train_imputer was saved:
        # try:
        #    imputer_final_train = joblib.load('final_train_imputer.joblib')
        #    test_df_final_imputed[INPUT_FEATURES] = imputer_final_train.transform(test_df_final_imputed[INPUT_FEATURES])
        #    mlflow.set_tag("Final Test Imputation", "Mean Imputed (Train Fit)")
        # except Exception as imp_e:
        #    print(f"Could not load train imputer ({imp_e}), refitting on test.")
        #    # ... refit logic ...
    else:
        mlflow.set_tag("Final Test Imputation", "Not Required")
    # *** END CORRECTION ***

    # Predict probabilities on the (potentially imputed) final test set
    try:
        # Column 1 of predict_proba is kept as the prediction score.
        final_predictions_test = final_pipeline.predict_proba(test_df_final_imputed[INPUT_FEATURES])[:, 1]
        final_pred_time = time.time() - start_pred_time
        print(f"  Final model prediction completed in {final_pred_time:.2f} seconds.")
        mlflow.log_metric("final_model_predict_time_sec", final_pred_time)

        # Add predictions to the ORIGINAL final test dataframe for evaluation using the original function
        # (the un-imputed copy is used, so performance_assessment sees the raw test rows plus the scores).
        test_df_final_eval = test_df_final.copy()
        test_df_final_eval['predictions'] = final_predictions_test

        # Assess performance on the final test set using function COPIED from original
        final_performance_metrics = performance_assessment(test_df_final_eval,
                                                           output_feature=OUTPUT_FEATURE,
                                                           prediction_feature='predictions',
                                                           top_k_list=[TOP_K_VALUE], # Use the same k as original
                                                           rounded=False) # Get raw values for logging

        print("\n  Final Model Performance on Unseen Test Set (Aligned):")
        # `display` is not defined in this cell - presumably the notebook/IPython built-in; verify when running as a script.
        display(final_performance_metrics.round(4)) # Display rounded for readability

        # Log final metrics to MLflow
        # The first row of each metric column is read.
        auc_roc_test = final_performance_metrics['AUC ROC'].iloc[0]
        ap_test = final_performance_metrics['Average precision'].iloc[0]
        # Use .get with a default Series containing NaN to safely access CP@k
        cp_at_k_test = final_performance_metrics.get(f'Card Precision@{TOP_K_VALUE}', pd.Series([np.nan])).iloc[0]

        # Log only if metric is not NaN
        if not pd.isna(auc_roc_test):
            mlflow.log_metric("final_test_auc_roc", auc_roc_test)
        if not pd.isna(ap_test):
            mlflow.log_metric("final_test_average_precision", ap_test)
        if not pd.isna(cp_at_k_test):
             mlflow.log_metric(f"final_test_card_precision_at_{TOP_K_VALUE}", cp_at_k_test)

        mlflow.set_tag("Final Model Evaluation Status", "Success")

        # Log the performance summary dataframe as an artifact
        # The CSV is written to a relative path (current working directory), rounded to 5 decimals.
        final_perf_path = "final_model_test_performance_aligned.csv"
        # Artifact failures are reported as warnings only; the evaluation status stays "Success".
        try:
            final_performance_metrics.round(5).to_csv(final_perf_path, index=False)
            mlflow.log_artifact(final_perf_path)
            print(f"  Final performance metrics logged to MLflow and saved to {final_perf_path}")
        except Exception as art_e:
            print(f"  Warning: Failed to log performance artifact {final_perf_path}: {art_e}")

    except Exception as e:
        # Any prediction or evaluation error is tagged on the run and converted into SystemExit.
        print(f"  ERROR during final model prediction or evaluation: {e}")
        import traceback
        traceback.print_exc()
        mlflow.set_tag("Final Model Evaluation Status", f"Failed - Exception: {e}")
        # Raise error to stop notebook if evaluation fails critically
        raise SystemExit(f"Stopping: Error during final model evaluation: {e}")


## 7. Optional: Save Final Model Locally

*(MLOps Note: `mlflow.sklearn.log_model` already saved the model in MLflow format and registered it. This step saves a local copy using joblib, similar to the original notebook's potential last step, mainly for local verification or backup.)*

In [None]:
# Keep a joblib copy of the fitted pipeline next to the MLflow-logged model.
# The file name is derived from the selected model's name ('default' when
# no name is set) and placed under MODEL_OUTPUT_DIR.
local_model_filename = "{}_final_pipeline_aligned_local.pkl".format(best_model_name or 'default')
local_model_save_path = os.path.join(MODEL_OUTPUT_DIR, local_model_filename)

if final_pipeline is None:
    # Nothing was trained, so there is nothing to persist; record that on the run.
    print("\nSkipping local model saving as final pipeline is not available.")
    mlflow.set_tag("Local Model Save Status", "Skipped - No Pipeline")
else:
    # Best effort: a failed local save is reported and tagged but never stops the notebook.
    try:
        print(f"\nAttempting to save final pipeline locally to: {local_model_save_path}")
        joblib.dump(final_pipeline, local_model_save_path)
        print("  Final pipeline saved locally successfully.")
        # The MLflow-format model is the preferred artifact; uncomment to also upload this copy.
        # mlflow.log_artifact(local_model_save_path, artifact_path="local_model_backup")
        mlflow.set_tag("Local Model Save Status", "Success")
    except Exception as save_error:
        print(f"  Error saving the final model locally: {save_error}")
        mlflow.set_tag("Local Model Save Status", f"Failed: {save_error}")

## 8. End MLflow Run

In [None]:
# --- End Parent MLflow Run ---
# Close the active MLflow run. The verification cell that follows does not log to MLflow.
mlflow.end_run()
# `run_id` is not assigned in this cell - presumably captured when the parent run was started; verify.
print(f"\nFinished and closed MLflow Run ID: {run_id}")

## 9. Optional: Verify Locally Saved Model

Load the locally saved `.pkl` file and make predictions on the test set to ensure it matches the results obtained before saving.

In [None]:
# Optional: Code to load back the locally saved model and test (for verification)
#
# Reloads the joblib file written by the local-save cell, scores the test set
# with it and, when the earlier evaluation result is still in memory, shows
# both results side by side. Failures are printed, never raised.
print("\n===== Verifying Locally Saved Model (Optional) ====")

# Resolve the path once. If the save cell was never run the name does not
# exist, and referencing it directly (even inside the "not found" message)
# would raise NameError.
local_model_path_to_verify = locals().get('local_model_save_path')

if local_model_path_to_verify is not None and os.path.exists(local_model_path_to_verify):
    try:
        # The file was written by this notebook; do not point this at untrusted pickles.
        loaded_local_model = joblib.load(local_model_path_to_verify)
        print(f"  Model loaded successfully from: {local_model_path_to_verify}")

        # Ensure test data is available and handle imputation consistently
        if not test_df_final.empty:
            print("  Testing the loaded local model on unseen test data...")
            test_df_verify_imputed = test_df_final.copy()
            # Same NaN check and test-fitted mean imputation as the evaluation cell,
            # so both evaluations score identical inputs.
            if test_df_verify_imputed[INPUT_FEATURES].isnull().values.any():
                print("  Warning: NaNs detected in verification test data. Imputing with mean (Test Fit).")
                imputer_verify = sklearn.impute.SimpleImputer(strategy='mean')
                test_df_verify_imputed[INPUT_FEATURES] = imputer_verify.fit_transform(test_df_verify_imputed[INPUT_FEATURES])

            # Predict using loaded model
            local_predictions = loaded_local_model.predict_proba(test_df_verify_imputed[INPUT_FEATURES])[:, 1]

            # Create a DataFrame for evaluation
            test_df_verify_eval = test_df_final.copy()
            test_df_verify_eval['predictions'] = local_predictions

            # Assess the performance using the original function
            local_model_performance = performance_assessment(test_df_verify_eval,
                                                             output_feature=OUTPUT_FEATURE,
                                                             prediction_feature='predictions',
                                                             top_k_list=[TOP_K_VALUE],
                                                             rounded=True)
            print("\n  Loaded Local Model Performance on Test Set:")
            display(local_model_performance)

            # Compare with previously calculated final_performance_metrics if available
            if 'final_performance_metrics' in locals() and not final_performance_metrics.empty:
                print("\n  Comparison with original final evaluation:")
                comparison_df = pd.concat([
                    final_performance_metrics.round(4).rename(index={0: 'Original Final Eval'}),
                    local_model_performance.round(4).rename(index={0: 'Loaded Local Model Eval'})
                ])
                display(comparison_df)

        else:
            print("  Test dataset is empty. Cannot test the loaded local model.")

    except Exception as e:
        print(f"  Error loading or testing the local model: {e}")
        import traceback
        traceback.print_exc()
else:
    print(f"  Local model file not found at '{local_model_path_to_verify}' or path not defined. Cannot verify.")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=68fe131d-9402-41f5-ab34-383fc691ce88' target="_blank">
<img alt='Created in deepnote.com' style='display:inline;max-height:16px;margin:0px;margin-right:7.5px;' src='image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iODBweCIgaGVpZ2h0PSI4MHB4IiB2aWV3Qm94PSIwIDAgODAgODAiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDU0LjEgKDc2NDkwKSAtIGh0dHBzOi8vc2tldGNoYXBwLmNvbSAtLT4KICAgIDx0aXRsZT5Hcm91cCAzPC90aXRsZT4KICAgIDxkZXNjPkNyZWF0ZWQgd2l0aCBTa2V0Y2guPC9kZXNjPgogICAgPGcgaWQ9IkxhbmRpbmciIHN0cm9rZT0ibm9uZSIgc3Ryb2tlLXdpZHRoPSIxIiBmaWxsPSJub25lIiBmaWxsLXJ1bGU9ImV2ZW5vZGQiPgogICAgICAgIDxnIGlkPSJBcnRib2FyZCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTEyMzUuMDAwMDAwLCAtNzkuMDAwMDAwKSI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0zIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgxMjM1LjAwMDAwMCwgNzkuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8cG9seWdvbiBpZD0iUGF0aC0yMCIgZmlsbD0iIzAyNjVCNCIgcG9pbnRzPSIyLjM3NjIzNzYyIDgwIDM4LjA0NzY2NjcgODAgNTcuODIxNzgyMiA3My44MDU3NTkyIDU3LjgyMTc4MjIgMzIuNzU5MjczOSAzOS4xNDAyMjc4IDMxLjY4MzE2ODMiPjwvcG9seWdvbj4KICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0zNS4wMDc3MTgsODAgQzQyLjkwNjIwMDcsNzYuNDU0OTM1OCA0Ny41NjQ5MTY3LDcxLjU0MjI2NzEgNDguOTgzODY2LDY1LjI2MTk5MzkgQzUxLjExMjI4OTksNTUuODQxNTg0MiA0MS42NzcxNzk1LDQ5LjIxMjIyODQgMjUuNjIzOTg0Niw0OS4yMTIyMjg0IEMyNS40ODQ5Mjg5LDQ5LjEyNjg0NDggMjkuODI2MTI5Niw0My4yODM4MjQ4IDM4LjY0NzU4NjksMzEuNjgzMTY4MyBMNzIuODcxMjg3MSwzMi41NTQ0MjUgTDY1LjI4MDk3Myw2Ny42NzYzNDIxIEw1MS4xMTIyODk5LDc3LjM3NjE0NCBMMzUuMDA3NzE4LDgwIFoiIGlkPSJQYXRoLTIyIiBmaWxsPSIjMDAyODY4Ij48L3BhdGg+CiAgICAgICAgICAgICAgICA8cGF0aCBkPSJNMCwzNy43MzA0NDA1IEwyNy4xMTQ1MzcsMC4yNTcxMTE0MzYgQzYyLjM3MTUxMjMsLTEuOTkwNzE3MDEgODAsMTAuNTAwMzkyNyA4MCwzNy43MzA0NDA1IEM4MCw2NC45NjA0ODgyIDY0Ljc3NjUwMzgsNzkuMDUwMzQxNCAzNC4zMjk1MTEzLDgwIEM0Ny4wNTUzNDg5LDc3LjU2NzA4MDggNTMuNDE4MjY3Nyw3MC4zMTM2MTAzIDUzLjQxODI2NzcsNTguMjM5NTg4NSBDNTMuNDE4MjY3Nyw0MC4xMjg1NTU3IDM2LjMwMzk1NDQsMzcuNzMwNDQwNSAyNS4yMjc0MTcsMzcuNzMwNDQwNSBDMTcuODQzMDU4NiwzNy43MzA0NDA1IDkuNDMzO
TE5NjYsMzcuNzMwNDQwNSAwLDM3LjczMDQ0MDUgWiIgaWQ9IlBhdGgtMTkiIGZpbGw9IiMzNzkzRUYiPjwvcGF0aD4KICAgICAgICAgICAgPC9nPgogICAgICAgIDwvZz4KICAgIDwvZz4KPC9zdmc+' > </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>