### Conduct raw data quality check - visual inspection

In [None]:
import os
import pickle as pkl
import sys
sys.path.append('/Users/kana/Library/Mobile Documents/com~apple~CloudDocs/Codes/GWOT_colorprefrencequalia')
import numpy as np
import pandas as pd
import sklearn
import csv
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import MDS
import seaborn as sns
import ot
import plotly.graph_objs as go
import plotly.express as px
from mpl_toolkits.axes_grid1 import make_axes_locatable
import utilityFunctions
print(sys.path)
from itertools import combinations

In [65]:
# Hex codes of the colour stimuli; the position in this array is the
# row/column index used for each colour.
unique_colours = np.array([
    '#d2b700', '#db8b08', '#c7512c', '#c13547',
    '#a03663', '#753a7a', '#4b488e', '#005692',
    '#006a8b', '#007b75', '#008a52', '#9aa400',
])
# Lookup from hex code to its position in `unique_colours`.
colour_index = dict(zip(unique_colours, range(len(unique_colours))))
matrix_size = unique_colours.shape[0]

### configuration
# Epsilon grid: `n_eps` values spaced evenly on a log scale across `eps_range`.
n_eps = 15  # number of epsilon values tried
eps_range = [0.04, 5]  # [lowest, highest] epsilon searched
epsilons = np.logspace(*(np.log10(bound) for bound in eps_range), num=n_eps)  # epsilon values

In [None]:
def process_csv(file_path: str) -> None:
    """Plot repeated similarity responses from one CSV file for visual inspection.

    The file must contain the columns ``practice_trial``, ``response_type``,
    ``trials.thisIndex`` and ``response``. Practice rows and non-similarity
    rows are dropped. The ``trials.thisIndex`` values found in the first 25
    remaining rows select the trials to show; every remaining row carrying one
    of those values is plotted at a sequential x position (1, 2, ...) assigned
    in order of first appearance.

    Trials that have exactly two responses are joined by a red line, and the
    Pearson correlation between their first and second responses is shown in
    the figure title. When no trial has exactly two responses the title reads
    ``Double pass r: N/A``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    df = pd.read_csv(file_path)

    # Keep only non-practice similarity trials.
    is_real_trial = df['practice_trial'] != 1
    is_similarity = df['response_type'] == "similarity"
    df = df[is_real_trial & is_similarity]

    # The trial indices seen in the first 25 remaining rows define which
    # trials are inspected.
    first_25_indices = df['trials.thisIndex'].iloc[:25].tolist()

    # Take an explicit copy: a column is added below, and writing to a
    # filtered view of `df` raises SettingWithCopyWarning and may not stick.
    filtered_df = df[df['trials.thisIndex'].isin(first_25_indices)].copy()

    # Map each original trial index to 1, 2, ... in order of first appearance.
    # At most 25 distinct values can occur because they all come from
    # `first_25_indices`.
    unique_indices = filtered_df['trials.thisIndex'].unique()
    index_mapping = {original: new for new, original in enumerate(unique_indices, start=1)}
    filtered_df['transformed_index'] = filtered_df['trials.thisIndex'].map(index_mapping)

    plt.figure(figsize=(10, 6))

    response_pairs = []  # (first, second) responses of trials answered exactly twice

    for transformed_index, group in filtered_df.groupby('transformed_index'):
        y = group['response'].tolist()
        x = [transformed_index] * len(y)

        # Overlapping points would be indistinguishable, so draw them larger
        # when every response for this trial is identical.
        scatter_size = 100 if len(set(y)) == 1 else 50
        plt.scatter(x, y, color='blue', s=scatter_size)

        # Connect the two responses of a repeated trial and keep the pair for
        # the correlation.
        if len(y) == 2:
            plt.plot([transformed_index, transformed_index], y, color='red', linewidth=1)
            response_pairs.append(y)

    # Correlation between first and second responses across repeated trials.
    if response_pairs:
        first_responses = [pair[0] for pair in response_pairs]
        second_responses = [pair[1] for pair in response_pairs]
        pair_correlation = pd.Series(first_responses).corr(pd.Series(second_responses))
        title = f'Double pass r: {pair_correlation:.2f}'
    else:
        # Formatting None with ':.2f' raises TypeError, so use a placeholder.
        title = 'Double pass r: N/A'

    plt.xlabel('Trial Index')
    plt.ylabel('Response')
    plt.title(title)

    # Render the figure (it is shown, not written to disk).
    plt.tight_layout()
    plt.show()

# Run the visual quality check on every CSV file directly inside the data
# folder; entries with any other extension are skipped.
folder_path = '/Users/kana/Library/Mobile Documents/com~apple~CloudDocs/Codes/GWOT_raw_data/202411_data'
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)
    process_csv(file_path)