# In [None]:
import os
import numpy as np
import pandas as pd
import utilityFunctions
from itertools import combinations

# --- Analysis settings -------------------------------------------------------
folder_path = 'raw_data/202411_data_1'  # Folder holding the per-subject CSV files
response_type = 'similarity'            # Only rows with this response_type are analysed
n_eps = 40                              # How many epsilon values are tried
eps_range = [0.1, 2]                    # [smallest, largest] epsilon searched

# Epsilon grid: n_eps points spaced evenly on a log10 scale across eps_range
_eps_min, _eps_max = eps_range
epsilons = np.logspace(np.log10(_eps_min), np.log10(_eps_max), n_eps)

# --- Colour stimuli ----------------------------------------------------------
# Hex codes of the colours; a colour's position here is its matrix row/column.
unique_colours = np.array([
    '#d2b700', '#db8b08', '#c7512c', '#c13547',
    '#a03663', '#753a7a', '#4b488e', '#005692',
    '#006a8b', '#007b75', '#008a52', '#9aa400',
])
matrix_size = len(unique_colours)
# Lookup from hex code to its position in unique_colours
colour_index = dict(zip(unique_colours, range(matrix_size)))

# Build a colour-by-colour response matrix from one subject's CSV file
def load_csv_to_matrix(file_path, response_type, colour_index, matrix_size):
    """Read a trial CSV and arrange its responses in a square integer matrix.

    Rows whose ``practice_trial`` column equals 1, or whose ``response_type``
    column differs from *response_type*, are discarded. For each remaining row
    the ``response`` value is written at
    ``[colour_index[colour1], colour_index[colour2]]``.

    Parameters
    ----------
    file_path : path or file-like
        Passed straight to ``pandas.read_csv``.
    response_type : str
        Value of the ``response_type`` column to keep.
    colour_index : mapping
        Maps each colour value found in ``colour1``/``colour2`` to a
        row/column position.
    matrix_size : int
        Side length of the returned matrix.

    Returns
    -------
    numpy.ndarray
        ``(matrix_size, matrix_size)`` array cast with ``astype(int)``. Cells
        with no matching trial stay 0; if a colour pair occurs more than once,
        the last occurrence wins.

    Raises
    ------
    KeyError
        If a required column is missing or a colour is not in *colour_index*.
    """
    trials = pd.read_csv(file_path)

    # Keep only real (non-practice) trials of the requested response type
    is_real_trial = trials['practice_trial'] != 1
    is_requested_type = trials['response_type'] == response_type
    trials = trials[is_real_trial & is_requested_type]

    first_colours = trials['colour1']
    second_colours = trials['colour2']
    responses = trials['response']

    # Start from zeros so colour pairs that were never presented read as 0
    response_matrix = np.zeros((matrix_size, matrix_size))
    for first, second, value in zip(first_colours, second_colours, responses):
        response_matrix[colour_index[first], colour_index[second]] = value

    return response_matrix.astype(int)

# Helper: recognise file names this script has already produced
def _parse_renamed_csv(file_name):
    """Split ``subject_<digits>_<prefix>.csv`` into ``(subject_number, prefix)``.

    Returns ``None`` when *file_name* (which must end in ``.csv``) does not
    follow the naming scheme written by ``sort_and_rename_files``.
    """
    stem = file_name[:-len('.csv')]
    parts = stem.split('_', 2)
    if len(parts) == 3 and parts[0] == 'subject' and parts[1].isdecimal():
        return int(parts[1]), parts[2]
    return None


# Function to sort files, rename them, and assign subject IDs
def sort_and_rename_files(folder_path):
    """Rename every ``.csv`` in *folder_path* to ``subject_NN_<prefix>.csv``.

    Files that do not yet follow the scheme are sorted by name and take the
    first 6 characters of their current name as ``<prefix>``. Files that
    already follow the scheme keep their stored prefix and are ordered by
    their existing subject number, ahead of any new files. Running the
    function again on an already-processed folder therefore leaves names and
    subject IDs unchanged instead of replacing every prefix with ``subjec``.

    Parameters
    ----------
    folder_path : str
        Directory whose ``.csv`` entries are renamed in place.

    Returns
    -------
    tuple[list[str], dict[str, int]]
        The new file names in subject order, and a mapping from each new
        file name to its 1-based subject ID.

    Raises
    ------
    FileExistsError
        If a rename would overwrite a different existing file.
    """
    def sort_key(file_name):
        parsed = _parse_renamed_csv(file_name)
        if parsed is None:
            # Not renamed yet: plain name order, after all renamed files
            return (1, 0, file_name)
        # Already renamed: numeric order, so subject_100 follows subject_99
        return (0, parsed[0], file_name)

    csv_files = sorted(
        (f for f in os.listdir(folder_path) if f.endswith('.csv')),
        key=sort_key,
    )
    subject_ids = {}

    for subject_id, file in enumerate(csv_files, start=1):
        parsed = _parse_renamed_csv(file)
        # Reuse the stored prefix on re-runs; otherwise take the first 6 characters
        prefix = file[:6] if parsed is None else parsed[1]
        new_name = f"subject_{subject_id:02d}_{prefix}.csv"

        if new_name != file:
            old_path = os.path.join(folder_path, file)
            new_path = os.path.join(folder_path, new_name)
            # os.rename silently replaces an existing file on POSIX; refuse instead
            if os.path.exists(new_path):
                raise FileExistsError(
                    f"Cannot rename {old_path!r}: {new_path!r} already exists"
                )
            os.rename(old_path, new_path)

        subject_ids[new_name] = subject_id

    return list(subject_ids), subject_ids

# Rename the CSVs to the subject_NN_* scheme and keep the name -> subject ID lookup
csv_files, subject_ids = sort_and_rename_files(folder_path)

# Every unordered pair of subject files
file_pairs = list(combinations(csv_files, 2))

# Result accumulators: each list receives one entry per processed pair
file1_list, file2_list = [], []
subject_id1_list, subject_id2_list = [], []
epsilon_range_list = []
gwds_list, ot_plans_list = [], []
matching_rates_list = []
RSA_correlations_list = []
min_gwds_list = []
best_epsilons_list = []
best_matching_rates_list = []

# For every pair of subjects: plot heatmaps, compute RSA, run GWOT over all epsilons
for file1, file2 in file_pairs:
    print(f"Processing pair: {file1} and {file2}")

    # Response matrices for the two subjects (first file, then second)
    matrix_1, matrix_2 = (
        load_csv_to_matrix(os.path.join(folder_path, name), response_type, colour_index, matrix_size)
        for name in (file1, file2)
    )

    # Human-readable labels, used as heatmap titles and stored with the results
    subject_id1, subject_id2 = (f"Subject {subject_ids[name]}" for name in (file1, file2))

    # Side-by-side heatmaps of both matrices
    utilityFunctions.show_heatmaps(
        0,
        7,
        matrices=[matrix_1, matrix_2],
        titles=[subject_id1, subject_id2],
        cbar_label="similarity",
        color_labels=unique_colours,
    )

    # RSA correlation between the two matrices
    RSA_corr = utilityFunctions.RSA(matrix_1, matrix_2)
    print(f'RSA correlation coefficient between {file1} and {file2}: {RSA_corr}')

    # GWOT across the epsilon grid; keep the entry with the smallest GWD
    OT_plan, gwds, matching_rates = utilityFunctions.GWD_and_plot(matrix_1, matrix_2, epsilons)
    # NOTE(review): np.argmin returns the position of a NaN if gwds contains
    # one — confirm GWD_and_plot never yields NaN distances.
    min_gwd_index = np.argmin(gwds)
    min_gwd, best_epsilon, best_matching_rate = (
        gwds[min_gwd_index],
        epsilons[min_gwd_index],
        matching_rates[min_gwd_index],
    )

    print(f"Minimum GWD for {file1} and {file2}: {min_gwd} at epsilon {best_epsilon} with matching rate {best_matching_rate}")

    # Record this pair's results, one value per accumulator list
    for accumulator, value in (
        (file1_list, file1),
        (file2_list, file2),
        (subject_id1_list, subject_id1),
        (subject_id2_list, subject_id2),
        (epsilon_range_list, eps_range),
        (gwds_list, gwds),
        (ot_plans_list, OT_plan),
        (matching_rates_list, matching_rates),
        (RSA_correlations_list, RSA_corr),
        (min_gwds_list, min_gwd),
        (best_epsilons_list, best_epsilon),
        (best_matching_rates_list, best_matching_rate),
    ):
        accumulator.append(value)

# Convert lists to numpy arrays.
# Object dtype keeps file names, labels and per-pair results as stored;
# the three per-pair "best" summaries are converted to float arrays.
file1_array = np.array(file1_list, dtype=object)
file2_array = np.array(file2_list, dtype=object)
subject_id1_array = np.array(subject_id1_list, dtype=object)
subject_id2_array = np.array(subject_id2_list, dtype=object)
epsilon_range_array = np.array(epsilon_range_list, dtype=object)
gwds_array = np.array(gwds_list, dtype=object)
ot_plans_array = np.array(ot_plans_list, dtype=object)
matching_rates_array = np.array(matching_rates_list, dtype=object)
RSA_correlations_array = np.array(RSA_correlations_list, dtype=object)
min_gwds_array = np.array(min_gwds_list, dtype=float)
best_epsilons_array = np.array(best_epsilons_list, dtype=float)
best_matching_rates_array = np.array(best_matching_rates_list, dtype=float)

# Combine all arrays into a single numpy array.
# The arrays share their leading length (one entry per pair) but not their
# trailing shape (e.g. file names are 1-D, epsilon ranges are pairs-by-2).
# Passing them to np.array(..., dtype=object) makes numpy try to broadcast
# them into one rectangular block and raise ValueError, so a 1-D object array
# is allocated up front and each result array is stored in its own slot.
_result_columns = [
    file1_array,
    file2_array,
    subject_id1_array,
    subject_id2_array,
    epsilon_range_array,
    gwds_array,
    ot_plans_array,
    matching_rates_array,
    RSA_correlations_array,
    min_gwds_array,
    best_epsilons_array,
    best_matching_rates_array,
]
all_results_array = np.empty(len(_result_columns), dtype=object)
for _slot, _column in enumerate(_result_columns):
    # Element-wise assignment stores the array itself, with no broadcasting
    all_results_array[_slot] = _column

# Write every result array into one .npz archive next to the input data.
# Each array is stored under its own key. Object-dtype arrays are pickled by
# numpy, so reading them back needs np.load(..., allow_pickle=True).
output_file = os.path.join(folder_path, 'pairwise_results_extended.npz')
_arrays_to_save = {
    'file1': file1_array,
    'file2': file2_array,
    'subject_id1': subject_id1_array,
    'subject_id2': subject_id2_array,
    'epsilon_range': epsilon_range_array,
    'gwds': gwds_array,
    'ot_plans': ot_plans_array,
    'matching_rates': matching_rates_array,
    'RSA_correlations': RSA_correlations_array,
    'min_gwds': min_gwds_array,
    'best_epsilons': best_epsilons_array,
    'best_matching_rates': best_matching_rates_array,
}
np.savez(output_file, **_arrays_to_save)

print(f"Results saved to {output_file}")
