# <center><b>Classification template</b></center>

## <b>Step 0:</b> Context
*Explain the problem you're trying to solve, the nature of the dataset, the goal of the model, and the impact of the solution.*

## <b>Step 1:</b> Imports and configurations

### <b>Libraries</b>
*Import all necessary Python libraries (e.g., numpy, pandas, scikit-learn, tensorflow/pytorch for deep learning).*

In [None]:
#- Workflow
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn

#- Data manipulation
import pandas as pd
import numpy as np

#- Data visualisation
import lux
import matplotlib.pyplot as plt
import seaborn as sns
import logging

#- Preprocessing for model fitting
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

#- ML models
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

#- Model assessment
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, roc_auc_score, roc_curve, confusion_matrix, auc, precision_recall_curve
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from scipy import stats


### <b>Dataset</b>
*If you have several data sets, you need to duplicate the cells below. If your dataset is in a different format, create a new cell and write the code to import it. Don't forget Chat GPT and Google are your friends. ;-)*

*If your data is in a CSV file use the cell below to import it.*

In [None]:
#- Load the dataset from a CSV file
data_path = ".csv"          # Placeholder: replace with the path to your CSV file.
data = pd.read_csv(data_path)
raw_data = data.copy()      # Independent copy of the data exactly as loaded.
data.head(3)

*If your data is in an excel file use the cell below to import it instead.*

In [None]:
# data_path = ".xlsx"
# data = pd.read_excel(data_path)
# data.head(3)

### <b>Configurations</b>
*Set any global configurations such as random seed for reproducibility, backend settings for computation libraries, and any project-specific parameters.*

In [None]:
SEED = 42

#- Show all the columns
pd.set_option('display.max_columns', None)

#- Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

#- Smoke-test the MLflow server.
# The test parameter is logged inside an explicit run that is closed when the
# `with` block exits. Calling mlflow.log_param() with no active run would
# implicitly start one and leave it open, which makes every later
# `mlflow.start_run()` in this notebook fail with "run ... is already active".
# set_experiment() is inside the try as well, since it also contacts the server.
try:
    mlflow.set_experiment("test_mlflow_setup")      # This is just to test if the mlflow server is working
    with mlflow.start_run(run_name="connectivity_check"):
        mlflow.log_param("test_param", "test_value")
    print("MLflow is configured correctly!")
except Exception as e:
    # Deliberately broad: this is a best-effort connectivity check, not a hard requirement.
    print(f"MLflow configuration failed: {e}")


## <b>Step 2:</b> Data inspection and cleaning

### <b>Inspection</b>
*The goal of data inspection is to understand the dataset's characteristics, quality, and any potential issues that need addressing before analysis.*

In [None]:
def basic_inspection(df, visualize_nulls=True, detailed_summary=True, check_duplicates=True):
    """
    Print a quick overview of a pandas DataFrame and return a per-column summary.

    The printed overview covers the size of the frame, the column data types,
    the columns that contain null values and, optionally, the number of
    duplicate rows. A heatmap of missing values can also be drawn.

    Parameters:
    - df: pandas DataFrame.
    - visualize_nulls: Bool, if True, visualizes null values using a heatmap.
    - detailed_summary: Bool, if True, adds 'Most_common' (filled for object
      columns, 'unknown' otherwise) and 'Mean', 'Std', 'Min', 'Max' (filled for
      numeric columns, NaN otherwise) to the summary table.
    - check_duplicates: Bool, if True, checks and reports the number of duplicate rows.

    Returns:
    - A pandas DataFrame indexed by column name with the columns
      'Unique_values', 'Data_type', 'Null_count' and 'Null_percentage', plus the
      detailed columns described above when requested and applicable.
    """
    n_rows, n_cols = df.shape
    print(f'Number of rows: {n_rows}\nNumber of columns: {n_cols}\n')
    print('Data types:\n', df.dtypes)

    # Computed once and reused for the printout and the summary table.
    null_counts = df.isnull().sum()
    print('\nColumns with null values:\n', null_counts[null_counts > 0])

    if check_duplicates:
        duplicate_rows = df.duplicated().sum()
        print(f'\nNumber of duplicate rows: {duplicate_rows}\n')

    if visualize_nulls:
        plt.figure(figsize=(15, 7))
        sns.heatmap(df.isna(), cbar=True, cmap='viridis')
        plt.title('Heatmap of Missing Values')
        plt.show()

    summary_table = pd.DataFrame({
        "Unique_values": df.nunique(),
        "Data_type": df.dtypes,
        "Null_count": null_counts,
        "Null_percentage": (null_counts / n_rows * 100).round(2)
    })

    if detailed_summary:
        def _most_common(column):
            # Only object columns get a mode; everything else is reported as 'unknown'.
            if column.dtype != object:
                return 'unknown'
            counts = column.value_counts()
            # An object column holding only nulls has no counts at all;
            # calling idxmax() on it would raise, so report 'unknown' instead.
            return counts.idxmax() if not counts.empty else 'unknown'

        if any(dtype == object for dtype in df.dtypes):
            summary_table['Most_common'] = df.apply(_most_common)

        # Every numeric dtype is summarised (int32, float32, ... as well as
        # int64/float64). Rows for non-numeric columns are left as NaN by the
        # index alignment of the assignments below.
        numeric = df.select_dtypes(include=np.number)
        if numeric.shape[1] > 0:
            summary_table['Mean'] = numeric.mean()
            summary_table['Std'] = numeric.std()
            summary_table['Min'] = numeric.min()
            summary_table['Max'] = numeric.max()

    return summary_table

*For the following cell, modify the parameters according to the particularities of your database. For example, if you have intentional duplicates, it is not necessary to check them.*

In [None]:
basic_inspection(data, visualize_nulls=True, detailed_summary=True, check_duplicates=True)

In [None]:
def identify_outliers(df, plot=True, z_threshold=3, iqr_multiplier=1.96):
    """
    Identify outliers in the numeric columns of a pandas DataFrame.

    Two rules are applied to each numeric column and their results are combined:
    - Z-score: values whose absolute Z-score exceeds `z_threshold`.
    - Interquartile Range (IQR): values below Q1 - iqr_multiplier*IQR or above
      Q3 + iqr_multiplier*IQR.

    Both rules report outliers as *index labels* of `df`, so the result can be
    passed directly to `df.drop(index=...)` or `df.loc[...]`, also when the
    column contains missing values or the index is not 0..n-1.

    Parameters:
    - df (pd.DataFrame): A pandas DataFrame containing the data to be analyzed. Only numeric
      columns will be considered for outlier detection. The DataFrame is not modified.
    - plot (bool): If True, prints the column name and shows a boxplot for each numeric column.
    - z_threshold (float): Absolute Z-score above which a value is an outlier. Defaults to 3.
    - iqr_multiplier (float): Width of the IQR fences. Defaults to 1.96
      (the conventional Tukey fence uses 1.5).

    Returns:
    - dict: A dictionary where each key is the name of a numeric feature in the DataFrame. Each value
      is another dictionary containing two keys: 'num_outliers', which is the number of unique outliers
      identified in the feature, and 'outliers_index', an array of index labels of these outliers.
    """
    outlier_summary = {}

    for column in df.select_dtypes(include=np.number).columns:  # Focus on numeric columns
        series = df[column]
        valid = series.dropna()

        # Z-score rule. Scores are computed on the non-null values and mapped
        # back to index labels (not positions in the NaN-dropped series).
        if valid.empty:
            z_outliers = valid.index
        else:
            z_scores = np.abs(np.asarray(stats.zscore(valid.to_numpy(dtype=float))))
            z_outliers = valid.index[z_scores > z_threshold]

        # IQR rule
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        outside_fences = (series < (Q1 - iqr_multiplier * IQR)) | (series > (Q3 + iqr_multiplier * IQR))
        iqr_outliers = series.index[outside_fences.to_numpy()]

        # Combine unique outliers from both methods
        combined_outliers = np.union1d(z_outliers, iqr_outliers)

        # Optionally plot
        if plot:
            print(column)
            plt.figure(figsize=(8, 4))
            sns.boxplot(x=series)
            plt.title(f'Boxplot of {column} (Outliers highlighted)')
            plt.show()

        # Summary
        outlier_summary[column] = {
            'num_outliers': len(combined_outliers),
            'outliers_index': combined_outliers
        }

    return outlier_summary

*This function does not modify the original DataFrame. I recommend inspecting the identified outliers and then deciding how to handle them (removal, replacement, or keeping them as they are), depending on the analysis requirements and domain knowledge.*

In [None]:
outlier_info = identify_outliers(data)
print(outlier_info)

### <b> Cleaning</b>
*Following the inspection, data cleaning aims to rectify issues identified, improving the dataset's quality and making it suitable for further analysis and modeling.*

In [None]:
data.head(3)

In [None]:
#- Feature Selection

# data.drop(columns=[], inplace=True)

In [None]:
#- Remove duplicates

# data.drop_duplicates(inplace=True)

In [None]:
def data_cleaning(data):
    """
    Template cleaning routine: imputation, type correction, binning and encoding.

    Every column name used below ('numerical_column', 'categorical_column',
    'numeric_column_as_string', 'date_column', 'column_to_bin',
    'col_name_numb') is a placeholder to be replaced with columns of your
    dataset. Delete the steps you do not need.

    Parameters:
    - data (pd.DataFrame): The frame to clean. New and corrected columns are
      assigned onto this frame, so the object passed in is modified.

    Returns:
    - pd.DataFrame: The cleaned frame. Always use the returned value: when
      `categorical_cols` is filled in, one-hot encoding produces a new frame.
    """
    #- Handle missing values
    # Plain assignment rather than `data[col].fillna(..., inplace=True)`: the
    # chained in-place form acts on a temporary object and leaves `data`
    # untouched when pandas Copy-on-Write is active.
    data['numerical_column'] = data['numerical_column'].fillna(data['numerical_column'].mean())
    modes = data['categorical_column'].mode()
    if not modes.empty:     # a column holding only nulls has no mode to impute with
        data['categorical_column'] = data['categorical_column'].fillna(modes[0])

    #- Correct data type
    data['numeric_column_as_string'] = pd.to_numeric(data['numeric_column_as_string'], errors='coerce')
    data['date_column'] = pd.to_datetime(data['date_column'])

    #- Binning
    #- Fixed width binning
    data['value_bin'] = pd.cut(data['column_to_bin'], bins=3, labels=["Low", "Medium", "High"])

    #- Quantile based binning
    data['value_quantile_bin'] = pd.qcut(data['column_to_bin'], q=3, labels=["Low", "Medium", "High"])

    #- Custom binning
    bins = []       # Add the bin edges: 4 edges are needed for the 3 labels below.
    if bins:        # Skipped while the placeholder is empty (pd.cut would raise).
        data['value_custom_bin'] = pd.cut(data['column_to_bin'], bins=bins, labels=["Low", "Medium", "High"], right=False)

    #- Turn categorical variables into numerical variables

    #- Label encoding
    # Define a mapping of categories to numerical values
    checking_mapping = {'None': 0, 'little': 1, 'moderate': 2, 'rich': 3}
    # Map the categories to their numerical equivalents
    data['col_name'] = data['col_name_numb'].map(checking_mapping)

    #- One-hot encoding of categorical variables
    categorical_cols = []       # Add the columns of interest
    if categorical_cols:        # Nothing to encode while the placeholder is empty.
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    return data

In [None]:
data = data_cleaning(data)

<i>When deciding between one-hot encoding and simple encoding (label encoding) for categorical data, consider the nature of your categories:

- One-Hot Encoding: Use this when your categorical variable does not have a meaningful order or hierarchy. One-hot encoding creates a new binary column for each category, which is ideal for nominal data (e.g., color, city names). This approach prevents the model from assuming a natural ordering between categories, which is helpful to avoid misleading interpretations.

- Label Encoding: Use this when your categorical variable has a meaningful order or ranking (ordinal data). Label encoding assigns a unique integer to each category, preserving the order. However, be cautious, as this can introduce unintended ordinal relationships that may not exist in the data.</i>

Do not clean the target feature inside the `data_cleaning` function: the same function is also applied to the prediction dataset, which does not contain the target column. Apply the transformations for the target column in the cell below instead.

In [None]:
#- Handle missing values
# Plain assignment rather than `data[col].fillna(..., inplace=True)`: the
# chained in-place form acts on a temporary object and leaves `data` untouched
# when pandas Copy-on-Write is active.
data['numerical_column'] = data['numerical_column'].fillna(data['numerical_column'].mean())
modes = data['categorical_column'].mode()
if not modes.empty:     # a column holding only nulls has no mode to impute with
    data['categorical_column'] = data['categorical_column'].fillna(modes[0])

#- Correct data type
data['numeric_column_as_string'] = pd.to_numeric(data['numeric_column_as_string'], errors='coerce')
data['date_column'] = pd.to_datetime(data['date_column'])

#- Binning
#- Fixed width binning
data['value_bin'] = pd.cut(data['column_to_bin'], bins=3, labels=["Low", "Medium", "High"])
#- Quantile based binning
data['value_quantile_bin'] = pd.qcut(data['column_to_bin'], q=3, labels=["Low", "Medium", "High"])
#- Custom binning
bins = []       # Add the bin edges: 4 edges are needed for the 3 labels below.
if bins:        # Skipped while the placeholder is empty (pd.cut would raise).
    data['value_custom_bin'] = pd.cut(data['column_to_bin'], bins=bins, labels=["Low", "Medium", "High"], right=False)

#- Turn categorical variables into numerical variables
#- Label encoding
# Define a mapping of categories to numerical values
checking_mapping = {'None': 0, 'little': 1, 'moderate': 2, 'rich': 3}
# Map the categories to their numerical equivalents
data['col_name'] = data['col_name_numb'].map(checking_mapping)
#- One-hot encoding of categorical variables
categorical_cols = []       # Add the columns of interest
if categorical_cols:        # Nothing to encode while the placeholder is empty.
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [None]:
#- Outlier Treatment

# for col, info in outlier_info.items():
#     # errors='ignore': a row flagged as an outlier for several columns has already been dropped.
#     data = data.drop(index=info['outliers_index'], errors='ignore')


In [None]:
data.head(3)

## <b>Step 3:</b> Exploratory Data Analysis (problem navigation)

### <b> Univariate analysis</b>

In [None]:
def quant_univariate_analysis(df, col_name, visualize=True, **kwargs):
    """
    Describe a numerical column and, optionally, plot its distribution.

    The descriptive statistics are printed and returned. When `visualize` is
    True a histogram with a KDE overlay and a boxplot are shown, drawn with a
    red / yellow / black colour scheme.

    Parameters:
    df (DataFrame): Input DataFrame containing the data.
    col_name (str): Name of the numerical column to analyze.
    visualize (bool): Whether to draw the plots (default=True).
    **kwargs: Optional plot settings: 'hist_color', 'boxplot_color',
        'figsize', 'bins' and 'bw_adjust'.

    Returns:
    pandas.Series: Descriptive statistics of the column.

    Raises:
    ValueError: If `col_name` is not a column of `df`.
    """
    if col_name not in df.columns:
        raise ValueError(f"Column '{col_name}' not found in the DataFrame.")

    values = df[col_name]

    # 1. Descriptive statistics
    summary = values.describe()
    print("\n"*2, col_name, "\n"*2, summary)

    # Nothing else to do when no plots are requested.
    if not visualize:
        return summary

    red, yellow, black = 'red', 'yellow', 'black'
    fig_size = kwargs.get('figsize', (12, 6))

    # 2. Histogram with KDE: bars in red (by default), KDE line in black, grid in yellow
    plt.figure(figsize=fig_size)
    sns.histplot(
        values,
        kde=True,
        bins=kwargs.get('bins', 30),
        color=kwargs.get('hist_color', red),
        kde_kws={'bw_adjust': kwargs.get('bw_adjust', 1)},
        line_kws={'color': black, 'lw': 2},
    )
    plt.title(f'Histogram and KDE of {col_name}')
    plt.xlabel(col_name)
    plt.ylabel('')
    plt.grid(True, linestyle='--', linewidth=0.5, color=yellow)
    plt.show()

    # 3. Boxplot: box in yellow (by default), grid in black
    plt.figure(figsize=fig_size)
    sns.boxplot(x=values, color=kwargs.get('boxplot_color', yellow))
    plt.title(f'Boxplot of {col_name}')
    plt.xlabel(col_name)
    plt.grid(True, linestyle='--', linewidth=0.5, color=black)
    plt.show()

    return summary

In [None]:
for quant_col in []:        # Add in the brackets the name of the quantitative variables in your dataset that you want to visualize.
    quant_univariate_analysis(data, quant_col)

In [None]:
def qual_univariate_analysis(df, col_name, palette="viridis", show_grid=True, figsize=(12, 6)):
    """
    Performs and visualizes a univariate analysis for a qualitative (categorical) variable,
    highlighting and annotating the most and least common categories.

    Parameters:
    - df (DataFrame): The pandas DataFrame containing the data.
    - col_name (str): The name of the column to analyze.
    - palette (str, optional): Color palette for the plots. Defaults to 'viridis'.
    - show_grid (bool, optional): Whether to show the grid in the bar plot. Defaults to True.
    - figsize (tuple, optional): Figure size for the plots. Defaults to (12, 6).

    Returns:
    - DataFrame: One row per category with its 'Frequency' (count) and
      'Percentage' (share of the non-null values, 0-100).
    """

    # Frequency Table
    freq_table = df[col_name].value_counts()
    # Percentage Table
    percent_table = df[col_name].value_counts(normalize=True) * 100
    combined_table = pd.DataFrame({'Frequency': freq_table, 'Percentage': percent_table})
    print("\n"*2, col_name, "\n"*2, combined_table)

    # Identify most and least common categories and their position on the y axis
    most_common = freq_table.idxmax()
    least_common = freq_table.idxmin()
    most_common_pos = freq_table.index.get_loc(most_common)
    least_common_pos = freq_table.index.get_loc(least_common)

    # Bar Plot with Highlighting
    plt.figure(figsize=figsize)
    barplot = sns.countplot(y=df[col_name], order=freq_table.index, palette=palette)

    # Highlighting
    for patch in barplot.patches:
        # Horizontal bars are centred on the integer category positions, while
        # get_y() is the bar's bottom edge. The centre is therefore what has
        # to be compared with the category position.
        center = patch.get_y() + patch.get_height() / 2
        if not np.isfinite(center):
            continue
        position = round(center)
        if position == most_common_pos:
            patch.set_facecolor('green')  # Highlight most common category in green
        elif position == least_common_pos:
            patch.set_facecolor('red')  # Highlight least common category in red

    # Annotations
    plt.title(f'Bar Plot of {col_name}')
    plt.xlabel('Count')
    plt.ylabel(col_name)
    if show_grid:
        plt.grid(axis='x', linestyle='--', linewidth=0.5)

    # Adding annotations for the most and least common categories
    plt.text(freq_table.max(), most_common_pos, 'Most common', fontsize=12, va='center')
    plt.text(freq_table.min(), least_common_pos, 'Least common', fontsize=12, va='center')

    plt.show()

    return combined_table

In [None]:
for qual_col in []:         # Add in the brackets the name of the qualitative variables in your dataset that you want to visualize.
    qual_univariate_analysis(data, qual_col)

### <b> Bivariate analysis</b>

In [None]:
#- Calculate the correlation matrix
# Add the names of the columns to correlate to the list below. They must be
# numeric; qualitative columns can be encoded as numbers in the cleaning step.
correlation_cols = []
correlation = data[correlation_cols].corr(method='pearson')

#- Visualize the correlation matrix
fig, ax = plt.subplots(figsize=(10, 10))

# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(correlation, dtype=bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    correlation,
    ax=ax,
    cmap=cmap,
    mask=mask,
    square=True,
    linewidths=.5,
    annot=True,
    annot_kws={'size': 14},
)

plt.show()

In [None]:
# Keep only the coefficients whose absolute value exceeds 0.7; the rest become NaN.
strong_corr = correlation.where(correlation.abs() > 0.7)
print("Strong correlations:\n", strong_corr)

## <b>Step 4:</b> Feature Engineering
*The Feature engineering phase involves transforming raw data into meaningful features that effectively represent the underlying problem.*  
*<b>Feature Creation:</b>* Develop new features from the existing data to better capture the underlying patterns.  
*<b>Feature Transformation:</b>* Apply transformations (e.g., scaling, encoding) to make the data suitable for modeling.  
*<b>Feature Selection:</b>* Use statistical tests and selection algorithms to reduce dimensionality and focus on relevant features.

In [None]:
def do_feature_eng(df):
    """
    Placeholder for the feature-engineering step.

    As written, it returns `df` unchanged. Add your feature creation,
    transformation and selection code here and return the resulting frame.
    """
    return df

In [None]:
data = do_feature_eng(data)

## <b>Step 5:</b> Data preparation

### <b>Define Features and Target</b>

In [None]:
#- Features:
features_cols = []          # Add the names of the feature columns.
X = data[features_cols]

#- Target:
target_name = ""            # Replace with the name of your target variable in your dataset.
y = data[target_name]

#- Class names:
# Used further down for the confusion-matrix tick labels, the classification
# report and to decode predictions. Derived from the sorted target values;
# replace with readable names if you prefer.
# NOTE(review): decoding predictions with `class_names[label]` assumes the
# target is encoded as integers 0..k-1 - confirm for your dataset.
class_names = [str(label) for label in sorted(y.unique())]

### <b>Scaling</b>

In [None]:
# NOTE(review): `ss` is not referenced by any other cell visible in this
# notebook; the pipelines in Step 6 create their own StandardScaler.
ss = StandardScaler()

### <b>Train validation and Test data</b>

In [None]:
# First split: hold out 20% of the data as the test set.
pre_X_train, X_test, pre_y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)
# Second split: 25% of the remaining 80% becomes the validation set, giving a
# 60/20/20 train/validation/test split. SEED is used here too, so that
# changing the global seed affects both splits.
X_train, X_val, y_train, y_val = train_test_split(pre_X_train, pre_y_train, test_size=0.25, random_state=SEED, stratify=pre_y_train)

In [None]:
# Report the shape of every split: features first, then targets.
split_shapes = [
    ("Size of the X training database: ", X_train),
    ("Size of the X validation database: ", X_val),
    ("Size of the X testing database: ", X_test),
    ("Size of the y training database: ", y_train),
    ("Size of the y validation database: ", y_val),
    ("Size of the y testing database: ", y_test),
]
for message, split in split_shapes:
    print(message, split.shape)

### <b>Dealing with imbalanced data</b>

In [None]:
# Oversample the training split only; validation and test data are left untouched.
# NOTE(review): the training cells visible in this notebook fit on
# X_train / y_train, not on X_resampled / y_resampled - pass the resampled
# data to `fit` if the oversampling is meant to take effect.
smote = SMOTE(random_state=SEED)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

## <b>Step 6:</b> Training & evaluation

### <b>Fitting a base model</b>

In [None]:
#- Select the MLflow experiment under which the baseline run below is logged
#  (the run itself is started by `mlflow.start_run()` in the next cell).
mlflow.set_experiment("baseline_model")

In [None]:
# Train a scaled random-forest baseline, evaluate it on the validation split
# and log metrics, curves and the fitted pipeline to MLflow.
# NOTE: the ROC and precision-recall sections use the probability of the
# second class only, i.e. they assume a binary target.
with mlflow.start_run():
    #- Define the baseline model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=SEED, n_jobs=-1))
    ])

    # Fit the pipeline to training data
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    y_pred_proba = pipeline.predict_proba(X_val)[:, 1]  # For ROC and PR curves

    # Calculate evaluation metrics (weighted averages across classes)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')
    cm = confusion_matrix(y_val, y_pred)

    print(f"\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}")

    # Log metrics with MLflow
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1_score', f1)

    # Generate ROC Curve
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    # The AUC was previously only visible in the plot legend; log it so runs
    # can be compared on it in the MLflow UI.
    mlflow.log_metric('roc_auc', roc_auc)

    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color="blue")
    plt.plot([0, 1], [0, 1], linestyle="--", color="red")  # chance level
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.savefig("roc_curve.png")  # save before show(): show() may clear the figure
    plt.show()

    # Log ROC Curve as artifact
    mlflow.log_artifact("roc_curve.png")

    # Generate Precision-Recall Curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_val, y_pred_proba)

    plt.figure(figsize=(10, 6))
    plt.plot(recall_vals, precision_vals, label="PR Curve", color="green")
    plt.title("Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc="lower left")  # without a legend the curve label is never displayed
    plt.grid(True)
    plt.savefig("pr_curve.png")
    plt.show()

    # Log PR Curve as artifact
    mlflow.log_artifact("pr_curve.png")

    # Save the model and log it with MLflow
    mlflow.sklearn.log_model(pipeline, "baseline_model")

### <b>Fitting a hyperparameter-optimized model</b>

In [None]:
#- Select the MLflow experiment under which the tuning runs below are logged
#  (each Optuna trial starts its own run inside `optuna_mlflow_tuner`).
mlflow.set_experiment("optimized_model")

In [None]:
#- Define the candidate classifiers explored by the tuner
# Their hyperparameters are set per trial in `optuna_mlflow_tuner`. Estimators
# with a stochastic component are seeded with SEED, like the baseline model,
# so that tuning results are reproducible.
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=SEED),
    "Gaussian Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "XGBoost": xgb.XGBClassifier(random_state=SEED),
    "Support Vector Machine": SVC(probability=True, random_state=SEED),
}

In [None]:
def optuna_mlflow_tuner(trial):
    """
    Optuna objective: pick a classifier and its hyperparameters, train it,
    evaluate it on the validation set and log everything to MLflow.

    Each trial runs inside its own MLflow run. The trial chooses one entry of
    the module-level `classifiers` dict, samples hyperparameters for it, fits a
    pipeline (feature engineering -> standard scaling -> classifier) on
    `X_train` / `y_train` and scores it on `X_val` / `y_val`. Metrics, the
    classifier parameters and a confusion-matrix image are logged to MLflow.

    Side effects:
    - Sets the global `best_model` to this trial's fitted pipeline when its
      accuracy beats the best value recorded so far in the study.
    - Writes "confusion_matrix.png" to the working directory.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        The trial object provided by Optuna to suggest hyperparameters and track the results.

    Returns:
    --------
    float
        The main evaluation metric (accuracy) of the model on the validation set.
    """
    global best_model

    # Local import: scikit-learn is already a dependency of this notebook.
    from sklearn.base import clone

    # Start an MLflow run for this trial
    with mlflow.start_run():
        # Select a classifier
        model_name = trial.suggest_categorical("model", list(classifiers.keys()))
        # Work on a fresh, unfitted copy. Reusing the shared instance would let
        # a later trial re-configure and re-fit the very estimator object held
        # by `best_model`, silently replacing the best model found so far.
        classifier = clone(classifiers[model_name])

        # Suggest hyperparameters specific to the selected classifier.
        # `suggest_float(..., log=True)` replaces the deprecated
        # `suggest_loguniform`; plain `suggest_float` replaces `suggest_uniform`.
        if model_name == "Logistic Regression":
            solver = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "saga", "liblinear"])
            C = trial.suggest_float("C", 0.001, 100.0, log=True)
            classifier.set_params(solver=solver, C=C)

        elif model_name == "Random Forest":
            n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)
            max_depth = trial.suggest_int("max_depth", 2, 100, step=5)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 50, step=5)
            min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 50, step=5)
            classifier.set_params(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                n_jobs=-1
            )

        elif model_name == "Gaussian Naive Bayes":
            var_smoothing = trial.suggest_float("var_smoothing", 1e-12, 1e-2, log=True)
            classifier.set_params(var_smoothing=var_smoothing)

        elif model_name == "K-Nearest Neighbors":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
            weights = trial.suggest_categorical("weights", ["uniform", "distance"])
            algorithm = trial.suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
            p = trial.suggest_int("p", 1, 5)  # Minkowski distance metric
            classifier.set_params(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, p=p)

        elif model_name == "Decision Tree":
            max_depth = trial.suggest_int("max_depth", 2, 100, step=5)
            min_samples_split = trial.suggest_int("min_samples_split", 2, 50, step=5)
            min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 50, step=5)
            criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
            classifier.set_params(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                criterion=criterion
            )

        elif model_name == "XGBoost":
            max_depth = trial.suggest_int("max_depth", 3, 20)
            learning_rate = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
            n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)
            subsample = trial.suggest_float("subsample", 0.5, 1.0)
            colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
            gamma = trial.suggest_float("gamma", 1e-8, 10.0, log=True)
            classifier.set_params(
                max_depth=max_depth,
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
                gamma=gamma
            )

        elif model_name == "Support Vector Machine":
            kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
            C = trial.suggest_float("C", 0.001, 100.0, log=True)
            classifier.set_params(kernel=kernel, C=C)

        # Build the pipeline
        # NOTE(review): `do_feature_eng` is also applied to `data` before the
        # train/validation/test split, so it runs twice on the training data
        # once it is more than the identity placeholder - confirm this is intended.
        pipeline = Pipeline([
            ("feature_engineering", FunctionTransformer(do_feature_eng, validate=False)),
            ("scaler", StandardScaler()),
            ("classifier", classifier)
        ])

        # Train the pipeline
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)

        # Calculate evaluation metrics (weighted averages across classes)
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred, average='weighted')
        recall = recall_score(y_val, y_pred, average='weighted')
        f1 = f1_score(y_val, y_pred, average='weighted')
        # Use the classes the model was trained on for both the matrix and its
        # tick labels, so they always line up even if a class is absent from
        # the validation split.
        labels = pipeline.classes_
        cm = confusion_matrix(y_val, y_pred, labels=labels)

        # Log metrics in MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_param("model", model_name)

        # Log hyperparameters
        params = classifier.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)

        # Log the confusion matrix
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
        plt.title(f"Confusion Matrix - {model_name}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Save the model if it's the best one found so far. The study is taken
        # from the trial itself rather than from a global `study` variable.
        try:
            previous_best = trial.study.best_value
        except ValueError:
            # Raised while no trial has completed yet (e.g. on the first trial).
            previous_best = None
        if previous_best is None or accuracy > previous_best:
            best_model = pipeline

        return accuracy


In [None]:
# Reset the model kept by the tuner so a previous study cannot leak into this one.
best_model = None

# Seed the sampler so that the sequence of suggested models and
# hyperparameters is reproducible from run to run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(optuna_mlflow_tuner, n_trials=15)

print("\nBest hyperparameters found:", study.best_params)

### <b>Testing</b>

In [None]:
def evaluate_model_on_test(model, X_test, y_test, class_names=None):
    """
    Evaluate a trained model on the test set: print metrics and a
    classification report, and plot the confusion matrix.

    Parameters:
    -----------
    model : sklearn model
        The trained machine learning model (pipeline or standalone model).
    X_test : array-like
        The test set features.
    y_test : array-like
        The true labels of the test set.
    class_names : list, optional
        List of class names for the report and the confusion-matrix tick
        labels. Default is None, in which case default labels are used.

    Returns:
    --------
    dict
        The weighted metrics, under the keys "accuracy", "precision",
        "recall" and "f1_score".
    """
    # Generate predictions
    y_pred = model.predict(X_test)

    # Calculate performance metrics (weighted averages across classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Generate the classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # seaborn's heatmap does not accept None as tick labels; fall back to its
    # automatic labels when no class names are given.
    tick_labels = class_names if class_names is not None else "auto"

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.show()

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }


In [None]:
evaluate_model_on_test(best_model, X_test, y_test, class_names)

## <b>Step 7:</b> Prediction

In [None]:
def make_predictions(model, X_input, class_names=None):
    """
    Predict classes with a trained model, optionally decoding them to names.

    Parameters:
    -----------
    model : sklearn model
        The trained machine learning model (pipeline or standalone model).
    X_input : array-like
        The input data for which predictions need to be made.
    class_names : list, optional
        Lookup used to decode each predicted label via `class_names[label]`.
        Default is None (no decoding).

    Returns:
    --------
    predictions : array-like
        The raw output of `model.predict` when `class_names` is None,
        otherwise a list with the decoded class name of every prediction.
    """
    predicted_labels = model.predict(X_input)

    # Without a lookup the raw predictions are returned untouched.
    if class_names is None:
        return predicted_labels

    # Translate every predicted label into its class name.
    decoded = []
    for label in predicted_labels:
        decoded.append(class_names[label])
    return decoded

In [None]:
#- Example usage:
X_to_predict = [[]]     # Placeholder: one inner list of feature values per sample to predict.
predictions = make_predictions(best_model, X_to_predict, class_names)
print("Predictions:", predictions)

In [None]:
def make_prob_predictions(model, X_input, class_names=None):
    """
    Predict class probabilities with a trained model.

    Parameters:
    -----------
    model : sklearn model
        The trained machine learning model (pipeline or standalone model).
        It must provide `predict_proba`.
    X_input : array-like
        The input data for which probability predictions need to be made.
    class_names : list, optional
        Names to pair with the probabilities, in the order of the probability
        columns. Default is None (no pairing).

    Returns:
    --------
    probabilities : array-like
        The raw output of `model.predict_proba` when `class_names` is None,
        otherwise a list with one `{class name: probability}` dict per sample.
    """
    probabilities = model.predict_proba(X_input)

    # Without names the raw probability matrix is returned untouched.
    if class_names is None:
        return probabilities

    # One dict per sample, keyed by the class name at the same column position.
    named_probabilities = []
    for sample_probs in probabilities:
        entry = {}
        for position, probability in enumerate(sample_probs):
            entry[class_names[position]] = probability
        named_probabilities.append(entry)
    return named_probabilities

In [None]:
# Example usage:
proba_predictions = make_prob_predictions(best_model, X_to_predict, class_names)
print("Probability Predictions:", proba_predictions)

In [None]:
#- Load the dataset to predict on from a CSV file
data_path_2 = ".csv"            # Placeholder: replace with the path to your CSV file.
data_2 = pd.read_csv(data_path_2)
raw_data_2 = data_2.copy()      # Independent copy as loaded; the predictions are appended to it at export time.
data_2.head(3)

In [None]:
basic_inspection(data_2, visualize_nulls=True, detailed_summary=True, check_duplicates=True)

In [None]:
X_features = ["feature1", "feature2", "feature3"]  # Replace with actual feature names
# Clean an independent copy of the selected columns and predict on the
# *returned* frame. `data_cleaning` may return a new object (one-hot
# encoding), so the frame passed in is not guaranteed to hold the cleaned data.
data_2 = data_cleaning(data_2[X_features].copy())
X_to_predict = data_2
# NOTE(review): the training data also went through `do_feature_eng` and the
# `features_cols` selection before fitting - confirm the columns of
# X_to_predict match what the model was trained on.

In [None]:
predictions = make_predictions(best_model, X_to_predict, class_names=class_names)
print("Predictions:")
print(predictions)

In [None]:
# Optional: If you want probabilities instead of class labels
probability_predictions = make_prob_predictions(best_model, X_to_predict, class_names=class_names)
print("\nProbability Predictions:")
print(probability_predictions)

In [None]:
# Add predictions to the original data
results_df = raw_data_2.copy()  # Copy the original data for context
results_df["Predicted Class"] = predictions  # Add predicted classes as a new column

# Optional: Add probabilities as new columns (if applicable).
# Per-class columns can only be built when class names are available, since
# `make_prob_predictions` then returns one {class name: probability} dict per
# row. The explicit length check also avoids evaluating the truth value of a
# NumPy array, which is ambiguous.
if class_names is not None and len(probability_predictions) > 0:
    for class_name in class_names:
        results_df[f"Probability ({class_name})"] = [prob[class_name] for prob in probability_predictions]

results_file_path = "predictions_results.csv"
results_df.to_csv(results_file_path, index=False)

print(f"Results exported successfully to {results_file_path}")