In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt

In [3]:
# Directory where the per-subject CSV files are stored.
# The path is relative, so it is resolved against the current working directory.
DIRECTORY = 'minmax_subjects/'  # Adjust this path as needed

# Fail early with an actionable message instead of a bare OS error from
# os.listdir(); the exception type (FileNotFoundError) is unchanged.
if not os.path.isdir(DIRECTORY):
    raise FileNotFoundError(
        f"Data directory {DIRECTORY!r} not found (resolved to "
        f"{os.path.abspath(DIRECTORY)!r}; current working directory is "
        f"{os.getcwd()!r}). Adjust DIRECTORY to point at the folder that "
        f"contains the subject CSV files."
    )

# Columns removed from every subject before the feature matrix is built.
# This includes every column the targets below are derived from
# (stim_pos, change_rate_x, user_pos), which keeps them out of the features.
drop_ls = [
    "expected_time",
    "flip_time",
    "stim_pos",
    "user_pos",
    "lambda_val",
    "change_rate_x",
]

# All subject CSV files. os.listdir() returns entries in arbitrary order, so the
# listing is sorted to make "the first 3 subjects" (and the fold order) reproducible.
arr = sorted(f for f in os.listdir(DIRECTORY) if f.endswith('.csv'))

# Subjects used as held-out test subjects: the first 3 files of the sorted listing.
first_three = arr[:3]

# Regression targets, keyed by target name.
# For "y_pos" and "y_speed" the value is the source column name.
# For "y_pos_dif" the value is only a descriptive label: the target is computed
# downstream as user_pos - stim_pos rather than read from a column.
targets = {
    "y_pos": "stim_pos",                  # Stimulus position
    "y_speed": "change_rate_x",           # Change rate in x
    "y_pos_dif": "user_pos - stim_pos"    # Difference between user position and stimulus position
}

# Leave-one-subject-out evaluation of a random forest for every target in `targets`.
#
# For each target, each file in `first_three` is held out in turn as the test
# subject while the model is trained on every other file in `arr`. The
# per-subject R2 scores are written to "loo_cv_<target>_scores.csv" and plotted.


def _target_values(frame, target_name, target_column):
    """Return the regression target of one subject as a NumPy array.

    "y_pos_dif" is a derived target (user_pos - stim_pos); every other target
    is read directly from the column named by ``target_column``.
    """
    if target_name == "y_pos_dif":
        return frame["user_pos"].to_numpy() - frame["stim_pos"].to_numpy()
    return frame[target_column].to_numpy()


# Read every subject exactly once. The data does not depend on the target or on
# the fold, so re-reading each CSV inside the loops only repeats disk I/O.
subject_features = {}  # file name -> feature matrix (columns in drop_ls removed)
subject_targets = {}   # file name -> {target name -> target array}
for file_name in arr:
    frame = pd.read_csv(os.path.join(DIRECTORY, file_name))
    subject_features[file_name] = frame.drop(columns=drop_ls).to_numpy()
    subject_targets[file_name] = {
        name: _target_values(frame, name, column)
        for name, column in targets.items()
    }

# Perform LOO-CV for each target
for target_name, target_column in targets.items():
    scores = []
    print(f"Running LOO-CV for target: {target_name}")

    # One fold per held-out test subject
    for i, subject_file in enumerate(first_three):
        # Test data (X, y) of the held-out subject
        X_test = subject_features[subject_file]
        y_test = subject_targets[subject_file][target_name]

        # Train on all other subjects. The test subject is excluded by file name
        # rather than by list position, so the split stays correct even if
        # `first_three` is not a prefix of `arr`.
        train_files = [name for name in arr if name != subject_file]
        if not train_files:
            raise ValueError(
                "Leave-one-out CV needs at least two subject files, but "
                f"{subject_file!r} is the only CSV found in {DIRECTORY!r}."
            )

        # Rows are stacked positionally, so all CSVs are assumed to share the
        # same column order after `drop_ls` is removed.
        X_train = np.vstack([subject_features[name] for name in train_files])
        y_train = np.concatenate(
            [subject_targets[name][target_name] for name in train_files]
        )

        # Initialize and train the model (fixed seed for reproducible forests)
        rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=23)
        rf.fit(X_train, y_train)

        # Predict on the test subject and calculate R^2 score
        y_pred = rf.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        scores.append(r2)

        # Report progress against the number of folds actually evaluated
        print(f"Subject {i+1}/{len(first_three)} - {target_name} R2 Score: {r2}")

    # Save scores to a CSV file
    scores_df = pd.DataFrame(scores, columns=[f"{target_name} R2 Score"])
    scores_df.to_csv(f"loo_cv_{target_name}_scores.csv", index=False)

    # Plot LOO-CV R2 scores
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(range(1, len(scores) + 1), scores, marker='o', linestyle='-', alpha=0.7)
    ax.set_xlabel("Subject", fontsize=15)
    ax.set_ylabel("R2 Score", fontsize=15)
    ax.set_title(f"Leave-One-Out Cross-Validation R2 Scores for |{target_column}|", fontsize=15)
    ax.grid(True)
    plt.show()
    # Release the figure; one is created per target inside this loop
    plt.close(fig)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'minmax_subjects/'