In [None]:
import pickle

import numpy as np
import pandas as pd

from constants import get_feature_sets

# Load the cached feature matrix and the cached targets.
# NOTE: unpickling can execute arbitrary code — only load cache files produced by this project.
X = pd.read_pickle('data/cached_X_imputed.pckl')
y = pd.read_pickle('data/cached_y.pckl')

# Later cells index X and y with the same positional fold indices, so the two
# objects must have the same number of rows. Fail here rather than deep inside a fit.
if len(X) != len(y):
    raise ValueError(
        f"cached features and targets are misaligned: {len(X)} feature rows vs {len(y)} target rows"
    )

# Named groups of feature columns, as provided by the project's `constants` module.
feature_sets = get_feature_sets(X)

In [None]:
# Per-"Source" overview: number of rows, then number of distinct BALANCE_ID values.
rows_by_source = X.groupby("Source")
print(rows_by_source.size())
print(rows_by_source["BALANCE_ID"].nunique())

# Overall size of the feature frame and overall number of distinct BALANCE_ID values.
n_distinct_balance_ids = X["BALANCE_ID"].nunique()
print(X.shape)
print(n_distinct_balance_ids)

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# This splitter stratifies on the "Source" column and groups on the "BALANCE_ID" column.
# Each split should therefore have a similar distribution of "Source" values,
# and all rows sharing a BALANCE_ID end up on the same side of a split.
class SourceStratifiedKFold:
    """Cross-validation splitter that stratifies on ``Source`` and groups on ``BALANCE_ID``.

    Thin adapter around ``StratifiedGroupKFold``: the stratification labels and the
    groups are read from columns of the feature frame instead of from the ``y`` and
    ``groups`` arguments, so it can be passed as ``cv=`` to estimators that only
    forward ``X`` and ``y``.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds.
    shuffle : bool, default=True
        Whether the underlying splitter shuffles before splitting.
    random_state : int or None, default=42
        Seed used when ``shuffle`` is true; ignored otherwise.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=42):
        # StratifiedGroupKFold raises if a random_state is given while shuffle is off,
        # so the seed is only forwarded when it actually has an effect.
        self.kfold = StratifiedGroupKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state if shuffle else None,
        )

    def split(self, df_features, df_targets=None, groups=None):
        """Yield ``(train_idx, test_idx)`` positional index pairs.

        ``df_features`` must expose ``Source`` and ``BALANCE_ID`` columns.
        ``df_targets`` and ``groups`` are accepted for API compatibility but are
        not used: stratification always follows ``Source`` and grouping always
        follows ``BALANCE_ID``.
        """
        return self.kfold.split(df_features, df_features.Source, df_features.BALANCE_ID)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; arguments mirror the scikit-learn splitter protocol."""
        return self.kfold.get_n_splits(X, y, groups)

def validate(df_features=None, df_targets=None, splitter=None):
    """Check that no BALANCE_ID appears in both the train and the test part of any fold.

    Parameters
    ----------
    df_features : DataFrame, optional
        Frame with a ``BALANCE_ID`` column. Defaults to the notebook-level ``X``.
    df_targets : optional
        Targets handed to the splitter. Defaults to the notebook-level ``y``.
    splitter : optional
        Object with a ``split(features, targets)`` method yielding positional
        ``(train_idx, test_idx)`` pairs. Defaults to ``SourceStratifiedKFold()``.

    Raises
    ------
    ValueError
        If any fold has a BALANCE_ID on both sides of the split.
    """
    if df_features is None:
        df_features = X
    if df_targets is None:
        df_targets = y
    if splitter is None:
        splitter = SourceStratifiedKFold()

    # Walk every fold lazily: leakage in a later fold is just as harmful as in the first.
    for fold_number, (train_idx, test_idx) in enumerate(splitter.split(df_features, df_targets), start=1):
        train_ids = set(df_features.iloc[train_idx].BALANCE_ID.values)
        test_ids = set(df_features.iloc[test_idx].BALANCE_ID.values)

        shared_ids = train_ids & test_ids
        if shared_ids:
            raise ValueError(
                f"train and test data overlap! (fold {fold_number}: {len(shared_ids)} shared BALANCE_ID values)"
            )

    display("all good!")
# Run the train/test BALANCE_ID overlap check once, before any model is trained.
validate()        

# Training models

In [None]:
from sklearn.calibration import CalibratedClassifierCV
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score, 
    recall_score, average_precision_score, brier_score_loss
)
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer

# Keep only the features listed in feature_sets['all'] that are actually present in X.
selected_features = [feature for feature in feature_sets['all'] if feature in X.columns]

# One pass-through transformer per selected column; every other column is dropped.
column_select = ColumnTransformer(
    [(feature, 'passthrough', [feature]) for feature in selected_features],
    remainder='drop',
)

# The 'scaler' and 'classifier' steps are placeholders: the grid below fills them in.
model = Pipeline(steps=[
    ('columnSelect', column_select),
    ('scaler', None),
    ('classifier', None),
])

# Candidate estimators, all seeded for repeatable runs.
candidate_classifiers = [
    ExtraTreesClassifier(random_state=42),
    LogisticRegression(random_state=42, max_iter=1000),
    HistGradientBoostingClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

# Candidate scalers; None leaves the 'scaler' step empty.
candidate_scalers = [
    StandardScaler(),
    MinMaxScaler(),
    None,
]

# Single grid: every classifier is combined with every scaler option.
params = [
    {
        'classifier': candidate_classifiers,
        'scaler': candidate_scalers,
    }
]
# Metrics computed for every grid-search candidate. Each name is a scikit-learn
# scorer string and also the suffix of the matching cv_results_ keys.
metric_names = [
    'f1_weighted', 'f1', 'roc_auc', 'accuracy',
    'balanced_accuracy', 'recall_weighted', 'average_precision',
]

# results[target][classifier class name][metric or metric + '_sd'] -> list of scores,
# one entry per grid candidate that used this classifier (i.e. one per scaler option).
results = {}

# NOTE(review): `targets` is not defined in the visible part of this notebook; it is
# presumably a list of column names of `y` — confirm it is defined before this cell.
for target in targets:
    display(f"Processing target: {target}")

    results[target] = {}

    # Grid search over classifier x scaler with the Source-stratified, BALANCE_ID-grouped
    # folds; the candidate with the best mean ROC AUC is refit on all rows.
    cv = GridSearchCV(
        model, params, refit="roc_auc", cv=SourceStratifiedKFold(), verbose=3, n_jobs=1,
        scoring={name: name for name in metric_names},
    )
    cv.fit(X, y[target])

    # Persist the best (uncalibrated) pipeline.
    best_model = cv.best_estimator_
    with open(f'best_model_{target}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Post-hoc sigmoid calibration. The calibration folds use the same grouped splitter
    # as the grid search so rows sharing a BALANCE_ID never sit on both sides of a fold
    # (the default CV would ignore the grouping).
    calibrated_model = CalibratedClassifierCV(
        best_model, method='sigmoid', cv=SourceStratifiedKFold()
    )
    calibrated_model.fit(X, y[target])

    with open(f'calibrated_model_{target}.pkl', 'wb') as file:
        pickle.dump(calibrated_model, file)

    # One entry per grid candidate: the classifier instance that candidate used.
    classifiers = cv.cv_results_['param_classifier']

    for idx, classifier in enumerate(classifiers):
        classifier_name = type(classifier).__name__

        # Create the per-classifier score lists on first sight only ...
        if classifier_name not in results[target]:
            results[target][classifier_name] = {
                key: []
                for name in metric_names
                for key in (name, f'{name}_sd')
            }

        # ... but record the scores of EVERY candidate, so all scaler variants of a
        # classifier are kept (previously only the first candidate was stored).
        classifier_scores = results[target][classifier_name]
        for name in metric_names:
            classifier_scores[name].append(cv.cv_results_[f'mean_test_{name}'][idx])
            classifier_scores[f'{name}_sd'].append(cv.cv_results_[f'std_test_{name}'][idx])

# Storing results

In [None]:
# Turn the nested results dictionary into one row per (target, classifier) pair.
# Every metric list is collapsed to its mean, rounded to 3 decimals, and becomes a column.
rows = []
for target_name, per_classifier in results.items():
    for classifier_name, metrics in per_classifier.items():
        averaged = {
            metric_name: round(np.mean(scores), 3)
            for metric_name, scores in metrics.items()
        }
        rows.append({'Target': target_name, 'Classifier': classifier_name, **averaged})

# Tabulate, show in the notebook, and write the same table to disk.
results_df = pd.DataFrame(rows)
display(results_df)
results_df.to_csv('results_classifier_selection.csv', index=False)

# Creating documentation 

In [None]:
# Shell escape: write the output of `conda list` to conda_packages.txt.
!conda list > conda_packages.txt

In [None]:
import pickle

def inspect_pipeline_steps(targets, steps_to_print=['classifier', 'scaler']):
    """Print selected named steps of the pipeline saved for each target.

    For every entry of ``targets`` the file ``best_model_<target>.pkl`` in the
    current working directory is unpickled. If the loaded object exposes
    ``named_steps``, each name in ``steps_to_print`` is printed together with the
    step stored under it, or a notice when the pipeline has no such step.

    Problems are reported on stdout instead of being raised, so one missing or
    broken file does not stop the remaining targets. Returns ``None``.

    NOTE: unpickling can execute arbitrary code — only inspect files you created.
    """
    for target in targets:
        try:
            # Read the pickled model for this target.
            with open(f'best_model_{target}.pkl', 'rb') as handle:
                loaded = pickle.load(handle)

            print(f"\n Model for target: {target}")

            if hasattr(loaded, "named_steps"):
                # Report each requested step, or say that it is absent.
                for step in steps_to_print:
                    if step not in loaded.named_steps:
                        print(f"   {step} step not found in the pipeline.")
                    else:
                        print(f"   {step}: {loaded.named_steps[step]}")
            else:
                # Anything without named_steps cannot be inspected step by step.
                print(" Loaded model is not a Pipeline.")

        except FileNotFoundError:
            print(f" Model file not found: best_model_{target}.pkl")
        except Exception as e:
            print(f" Error loading model for {target}: {e}")

# Print the classifier and scaler steps of the saved best model for every target.
# NOTE(review): `targets` is not defined in the visible notebook — confirm it exists before this cell runs.
inspect_pipeline_steps(targets)