# Description: 
##### This script performs a 5-fold cross-validation on the ENTIRE dataset for each of the 6 data files.
* It does not create a separate hold-out test set. 
* The result is the average performance of models across the 5 folds. 
* This is useful for understanding how well models perform on the dataset as a whole when using cross-validation.

# Instructions:
1. Make sure you have PyCaret and its dependencies installed.
If not, run the command below once in a notebook cell.
> !pip install pycaret pandas

2. Place your 6 CSV data files (named `{group}_{variable}.csv`, e.g. `Total_arousal.csv`) in a `data` folder next to this script; results are written to a `res` folder.

3. Run the script.

In [None]:
import os
import pandas as pd
from pycaret.classification import *

In [4]:
!pwd

/Users/leechangmin/Desktop/Project/ETRI-Emotion/src


In [None]:
def experiment_2_full_data_cv():
    """
    Run 5-fold cross-validation over every row of each group/variable dataset.

    For each `{group}_{variable}.csv` found in `./data`, the file is loaded,
    a PyCaret classification experiment is set up on it, all candidate models
    are compared with 5-fold CV, and the resulting score grid (mean metrics
    across the folds) is written to
    `./res/exp2_cv_performance_{group}_{variable}.csv`.

    Files that are missing, or that lack the target column, are reported and
    skipped so that the remaining datasets are still processed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # --- Configuration ---
    TARGET_COLUMN_NAME = 'label'
    DATA_DIR = './data'
    RESULT_DIR = './res'
    GROUPS = ['Total', 'High', 'Low']
    VARIABLES = ['arousal', 'valence']

    # Create the result directory if it doesn't exist
    os.makedirs(RESULT_DIR, exist_ok=True)

    print("==========================================================")
    print("===  STARTING EXPERIMENT 2: 5-Fold CV on Full Dataset  ===")
    print("==========================================================")

    for group in GROUPS:
        for variable in VARIABLES:
            experiment_name = f"{group}_{variable}"
            file_path = os.path.join(DATA_DIR, f"{experiment_name}.csv")

            if not os.path.exists(file_path):
                print(f"\n--- Skipping {experiment_name}: File not found at {file_path} ---")
                continue

            print(f"\n--- Processing: {experiment_name} ---")

            # 1. Load the full dataset
            full_dataset = pd.read_csv(file_path)

            # A file without the target column cannot be used; skip it rather
            # than letting setup() raise and abort the remaining datasets.
            if TARGET_COLUMN_NAME not in full_dataset.columns:
                print(f"\n--- Skipping {experiment_name}: "
                      f"Column '{TARGET_COLUMN_NAME}' not found in {file_path} ---")
                continue

            # 2. Setup PyCaret environment using the entire dataset.
            # Without `test_data`, setup() splits off a hold-out set
            # (train_size defaults to 0.7) and cross-validation only sees the
            # remaining rows. Per the PyCaret docs, supplying `test_data`
            # makes all of `data` the training set, so the 5 folds cover every
            # row. The nominal hold-out is just a copy of the same rows and is
            # not used for the CV scores saved below. `index=False` resets the
            # index, which is required because both frames share index labels.
            setup(data=full_dataset,
                  test_data=full_dataset,
                  target=TARGET_COLUMN_NAME,
                  fold=5,  # Specify 5-fold cross-validation
                  index=False,
                  session_id=123,  # Fixed seed for reproducible folds
                  verbose=False)

            # 3. Compare models using 5-fold CV on the entire dataset
            print("  > Comparing models using 5-fold CV...")
            compare_models(verbose=False)

            # 4. Pull the results grid, which shows the average performance across the 5 folds
            cv_results = pull()

            # 5. Save the cross-validation results to a CSV file
            output_path = os.path.join(RESULT_DIR, f"exp2_cv_performance_{experiment_name}.csv")
            cv_results.to_csv(output_path)
            print(f"  > Average 5-fold CV performance saved to {output_path}")


In [None]:
# Entry point: run Experiment 2 only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    experiment_2_full_data_cv()
    print("\n\nExperiment 2 is complete.")

### Load Results

In [1]:
def load_all_results(*, result_dir='./res',
                     filename_template='results_{experiment_name}_classification_summary.csv'):
    """
    Load per-experiment result CSV files into a dictionary of DataFrames.

    One file is looked up for every combination of group
    ('Total', 'High', 'Low') and variable ('arousal', 'valence'). Missing
    files are reported and skipped.

    Args:
        result_dir: Directory that contains the result files.
        filename_template: File name pattern, formatted with the fields
            `experiment_name` ("{group}_{variable}"), `group` and `variable`.
            The default matches
            'results_{group}_{variable}_classification_summary.csv'. To load
            the files written by `experiment_2_full_data_cv`, pass
            'exp2_cv_performance_{experiment_name}.csv'.

    Returns:
        dict mapping "{group}_{variable}" to the DataFrame read from the
        corresponding file (first CSV column used as the index). Only
        experiments whose file exists are included.
    """
    # --- Configuration (can be adjusted as needed) ---
    groups = ['Total', 'High', 'Low']
    variables = ['arousal', 'valence']

    # dictionary to hold all results
    all_results = {}

    print("Reading all result CSV files...")

    for group in groups:
        for variable in variables:
            experiment_name = f"{group}_{variable}"
            # A plain "/" join keeps the default path (and the printed
            # messages) identical to "./res/results_..._summary.csv".
            filename = f"{result_dir}/" + filename_template.format(
                experiment_name=experiment_name,
                group=group,
                variable=variable,
            )

            try:
                # index_col=0 to use the first column as index
                df = pd.read_csv(filename, index_col=0)
            except FileNotFoundError:
                print(f"  - File not found, skipping: {filename}")
                continue

            # Store the DataFrame in the dictionary with the experiment name as key
            all_results[experiment_name] = df
            print(f"  - Successfully loaded: {filename}")

    return all_results

In [None]:
# Entry point: load every available result file and preview each table.
if __name__ == '__main__':
    loaded_results = load_all_results()
    print("\n--- Summary of Loaded Results ---")

    if loaded_results:
        # head() prints the first five rows of each loaded results table.
        for name, result_df in loaded_results.items():
            print(f"\n--- Top 5 Models for [{name}] ---")
            print(result_df.head())
    else:
        print("No result files were found.")