## Load libraries, define paths and iterables

In [None]:
import os

from sklearn.base import clone

from modules.training import fit_model
from modules.utils import load_dataset, load_object, save_object

In [None]:
# Create dataset and model configurations
# Directories for training data and saved models
training_dir = '../data/processed/train'
models_dir = '../models'

# List all training dataset files.
# os.listdir returns entries in arbitrary order and also reports
# subdirectories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store),
# so keep only regular, non-hidden files and sort them to make the
# processing order reproducible between runs and machines.
dataset_paths = sorted(
    entry
    for entry in os.listdir(training_dir)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(training_dir, entry))
)

# Extract dataset base names (portion before the first '-')
dataset_names = [path.split('-')[0] for path in dataset_paths]

# Pair dataset names with their corresponding file paths
dataset_name_paths = list(zip(dataset_names, dataset_paths))

# Define model identifiers used across experiments
model_types = ['logreg', 'dt', 'rf', 'xgb', 'svc']

## Retrain models outside of grid search and compress

In [None]:
# Lists to track successfully retrained and failed models
# (both hold the path of the original '-full.pickle' model file)
retrained_models = []
failed_models = []

# Retrain each model as a standalone model (no grid search) and save with compression
for name, dataset_path in dataset_name_paths:
    print(f'Loading {name} dataset.')

    # Load training dataset and split into features and target:
    # the last column is taken as the target, every other column as a feature
    train_dataset = load_dataset(os.path.join(training_dir, dataset_path))
    X_train = train_dataset.iloc[:, :-1]
    y_train = train_dataset.iloc[:, -1]

    # Directory for saving retrained models
    # (underscores in the dataset name are replaced by hyphens)
    model_subdir = os.path.join(models_dir, name.replace('_', '-'))

    # Loop through all model types
    for model_type in model_types:
        model_path = os.path.join(model_subdir, f'{name}-{model_type}-full.pickle')
        try:
            # Load previously trained model and extract the estimator stored
            # under the `model_type` key of the best estimator's parameters
            # NOTE(review): presumably best_estimator_ is a Pipeline whose
            # final step is named after the model type - confirm
            model = load_object(model_path).best_estimator_.get_params()[model_type]
            print(f'Retraining {model_type} on {name}.')
            save_path = os.path.join(model_subdir, f'{name}-{model_type}-full')

            print(model.get_params())

            # Retrain model without grid search and save using LZMA compression;
            # clone() yields an unfitted estimator with the same hyperparameters
            fit_model(
                X_train=X_train,
                y_train=y_train,
                model_name='',
                model=clone(model),
                grid_search=False,
                save=True,
                save_path=save_path,
                compression='lzma',
            )
        except Exception as e:
            # Deliberately broad: one failing model must not abort the batch.
            # Report which model failed and the exception type, since str(e)
            # alone is ambiguous (e.g. a KeyError prints only the missing key).
            print(f'Failed to retrain {model_type} on {name} ({model_path}): {e!r}')
            failed_models.append(model_path)
        else:
            # Track successfully retrained models
            retrained_models.append(model_path)