# Description: 
##### This script performs a standard machine learning workflow. For each of the 6 datasets, it first splits the data into a training set (80%) and a hold-out test set (20%).
* It then runs a 5-fold cross-validation on the training set to find the best model.
* Finally, it evaluates the best model on the hold-out test set to get the final performance score and saves this result.

# Instructions:
1. Make sure you have PyCaret and its dependencies installed.
If not, run the command below once (for example, in a notebook cell).
> !pip install pycaret pandas

2. Place the 6 CSV data files (named `{group}_{variable}.csv`, e.g. `Total_arousal.csv`) in a `data/` directory next to this script.

3. Run the script.

In [None]:
import os
import pandas as pd
from pycaret.classification import *

In [4]:
!pwd

/Users/leechangmin/Desktop/Project/ETRI-Emotion/src


In [None]:
def experiment_1_standard_cv_with_holdout():
    """
    Run the standard CV workflow with a single hold-out test set.

    For every ``{group}_{variable}.csv`` file found in ``./data`` this:

    1. loads the CSV and lets PyCaret split it 80/20 into a training part
       and a hold-out part,
    2. compares models with 5-fold CV on the training part,
    3. scores the top-ranked model on the hold-out part, and
    4. writes that single row of hold-out metrics to
       ``./res/exp1_test_performance_{group}_{variable}.csv`` with the model
       name as the row index.

    Datasets whose CSV file is missing are skipped with a message.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # --- Configuration ---
    TARGET_COLUMN_NAME = 'label'
    DATA_DIR = './data'
    RESULT_DIR = './res'
    GROUPS = ['Total', 'High', 'Low']
    VARIABLES = ['arousal', 'valence']

    # Create the result directory if it doesn't exist
    os.makedirs(RESULT_DIR, exist_ok=True)

    print("=====================================================================")
    print("===  STARTING EXPERIMENT 1: Standard CV with a Single Hold-Out  ===")
    print("=====================================================================")

    for group in GROUPS:
        for variable in VARIABLES:
            experiment_name = f"{group}_{variable}"
            file_path = os.path.join(DATA_DIR, f"{experiment_name}.csv")

            if not os.path.exists(file_path):
                print(f"\n--- Skipping {experiment_name}: File not found at {file_path} ---")
                continue

            print(f"\n--- Processing: {experiment_name} ---")

            # 1. Load the full dataset
            full_dataset = pd.read_csv(file_path)

            # 2. Setup PyCaret environment.
            # It splits the data into 80% train and 20% hold-out; the 5-fold
            # CV is only performed on the 80% training portion.
            setup(data=full_dataset,
                  target=TARGET_COLUMN_NAME,
                  train_size=0.8,  # Use 80% for training, 20% is held out as test
                  fold=5,          # Specify 5-fold cross-validation
                  index=False,
                  session_id=123,
                  verbose=False)

            # 3. Compare models on the training set using 5-fold CV
            print("  > Comparing models using 5-fold CV on the training set...")
            best_model = compare_models(verbose=False)

            # Capture the model name right away: pull() always returns the
            # most recent score grid, so it must be read before predict_model
            # replaces it.
            cv_leaderboard = pull()
            if cv_leaderboard.empty:
                print(f"  > No model could be trained for {experiment_name}; skipping.")
                continue
            best_model_name = cv_leaderboard.iloc[0, 0]
            print(f"  > Best model by CV: {best_model_name}")

            # 4. Evaluate the best model on the unseen hold-out test set (20%).
            # The model is deliberately NOT passed through finalize_model()
            # first: finalizing refits on train + hold-out, which would leak
            # the test data into the model being scored.
            print("  > Evaluating the final model on the hold-out test set...")
            predict_model(best_model)  # no `data` argument -> scores the hold-out set

            # predict_model returns per-row predictions; the hold-out metrics
            # are in the score grid, which is retrieved with pull().
            test_metrics = pull().copy()
            test_metrics = test_metrics.drop(columns='Model', errors='ignore')
            test_metrics.index = [best_model_name]  # Set index to the model name

            # 5. Save the test set performance to a CSV file
            output_path = os.path.join(RESULT_DIR, f"exp1_test_performance_{experiment_name}.csv")
            test_metrics.to_csv(output_path)
            print(f"  > Test performance saved to {output_path}")


In [None]:
if __name__ == "__main__":
    # Entry point for this cell: run the whole experiment, then report completion.
    experiment_1_standard_cv_with_holdout()
    print("\n\nExperiment 1 is complete.")

### Load Results

In [1]:
def load_all_results(groups=None, variables=None, result_dir='./res'):
    """
    Load the per-experiment classification summary CSVs into DataFrames.

    For every group/variable combination this looks for
    ``{result_dir}/results_{group}_{variable}_classification_summary.csv``
    and reads it with the first column as the index. Files that do not exist
    are reported and skipped.

    Args:
        groups: Iterable of group names. Defaults to
            ``['Total', 'High', 'Low']``.
        variables: Iterable of variable names. Defaults to
            ``['arousal', 'valence']``.
        result_dir: Directory that holds the result CSV files.

    Returns:
        dict mapping ``"{group}_{variable}"`` to the loaded DataFrame, in
        iteration order; combinations without a file are absent.
    """
    # None sentinels avoid sharing mutable default lists between calls.
    if groups is None:
        groups = ['Total', 'High', 'Low']
    if variables is None:
        variables = ['arousal', 'valence']

    # dictionary to hold all results
    all_results = {}

    print("Reading all result CSV files...")

    for group in groups:
        for variable in variables:
            experiment_name = f"{group}_{variable}"
            filename = f"{result_dir}/results_{experiment_name}_classification_summary.csv"

            # Keep the try body to the one call that can raise.
            try:
                # index_col=0 to use the first column as index
                df = pd.read_csv(filename, index_col=0)
            except FileNotFoundError:
                print(f"  - File not found, skipping: {filename}")
                continue

            all_results[experiment_name] = df
            print(f"  - Successfully loaded: {filename}")

    return all_results

In [None]:
if __name__ == "__main__":
    # Load every summary that exists on disk, then preview each one.
    loaded_results = load_all_results()
    print("\n--- Summary of Loaded Results ---")

    if loaded_results:
        # DataFrame.head() shows the first five rows of each summary.
        for name, result_df in loaded_results.items():
            print(f"\n--- Top 5 Models for [{name}] ---")
            print(result_df.head())
    else:
        print("No result files were found.")