In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import plotly.graph_objs as go
from numpy import array, mean, std
import biosignalsnotebooks as bsnb

In [None]:
# Add z-score columns to every processed recording (10 subjects x 6 movies).
# The subject folders are expected in the current working directory.
dirs = [f"Subject {i}" for i in range(1, 11)]

# Columns standardised within each recording; hoisted because it never changes.
columns_to_zscore = ['average_DisplayArea_y', 'average_DisplayArea_x', 'pupil Diameter_average']

# Named `subject_dir` rather than `dir` so the builtin dir() stays usable
# in every later cell of the notebook.
for subject_dir in dirs:
    for i in range(1, 7):
        # Load the processed file
        filepath = os.path.join(subject_dir, f"movie_{i}_processed.csv")
        df = pd.read_csv(filepath)

        # z = (x - mean) / std with pandas' default sample std (ddof=1).
        # NOTE(review): a constant column has std == 0 and produces NaN/inf here.
        for col in columns_to_zscore:
            df[f'z_score_{col}'] = (df[col] - df[col].mean()) / df[col].std()

        # Overwrite the file with the new z-score columns. index=False keeps the
        # row index out of the CSV; writing it would add one more "Unnamed: N"
        # column on every re-run of this cell, because the file is read back
        # without an index column.
        df.to_csv(filepath, index=False)

In [None]:
# Signal columns compared across subjects: the three "*_cleaned" physiological
# channels plus the z-scored eye-tracking measures.
# NOTE(review): the following cell rebinds `columns_of_interest`, so when the
# notebook is run top to bottom this list is replaced before it is used.
columns_of_interest = [
    'BVP_cleaned',
    'EDA_cleaned',
    'PZT_cleaned',
    'z_score_pupil Diameter_average',
    'z_score_average_DisplayArea_y',
    'z_score_average_DisplayArea_x',
]

In [None]:
# Signal columns compared across subjects, all taken in z-scored form.
columns_of_interest = [
    # physiological channels
    'z_score_BVP',
    'z_score_EDA',
    'z_score_PZT',
    # eye-tracking measures
    'z_score_pupil Diameter_average',
    'z_score_average_DisplayArea_y',
    'z_score_average_DisplayArea_x',
]

In [None]:
# Merge and Store Data
# Builds movie_dataframes[movie_number][subject_folder] -> DataFrame restricted
# to `columns_of_interest`, combining the OpenSignals and processed CSV files
# of each recording.
dirs = [f"Subject {i}" for i in range(1, 11)]
movie_dataframes = {}
for i in range(1, 7):
    subject_dfs = {}
    # Named `subject_dir` rather than `dir` so the builtin dir() is not
    # shadowed for the rest of the kernel session.
    for subject_dir in dirs:
        filepath_opensignals = os.path.join(subject_dir, f"movie_{i}_opensignals.csv")
        filepath_processed = os.path.join(subject_dir, f"movie_{i}_processed.csv")

        df_opensignals = pd.read_csv(filepath_opensignals)
        df_processed = pd.read_csv(filepath_processed)

        # DataFrame.join aligns on the row index (a left join on the OpenSignals
        # rows); column names present in both files receive the suffixes below.
        merged_df = df_opensignals.join(df_processed, lsuffix='_opensignals', rsuffix='_processed')
        # Raises KeyError if any column of interest is missing after the merge.
        sub_df = merged_df[columns_of_interest]
        subject_dfs[subject_dir] = sub_df

    movie_dataframes[i] = subject_dfs
# movie number -> subject folder -> correlation matrix between that subject's
# signal columns (DataFrame.corr() of the per-recording frame).
movie_subject_correlations = {
    movie: {subject: df.corr() for subject, df in subjects.items()}
    for movie, subjects in movie_dataframes.items()
}

# For each movie, reduce every subject's correlation matrix to one value per
# signal (the column-wise mean of that matrix), then correlate those per-signal
# profiles between subjects. Result: movie number -> subject x subject matrix.
overall_correlation_per_movie = {}

for movie, correlations in movie_subject_correlations.items():
    subject_profiles = {}
    for subject, corr in correlations.items():
        subject_profiles[subject] = corr.mean()
    overall_correlation_per_movie[movie] = pd.DataFrame(subject_profiles).corr()

# Between-subject matrix of the first movie, bound to `_` (so the cell echoes nothing)
_ = overall_correlation_per_movie[1]


In [None]:
# Visualize the overall correlation between subjects for all movies in one plot:
# one heatmap per movie, stacked vertically.

num_movies = len(overall_correlation_per_movie)
# squeeze=False makes subplots() always return a 2-D array of Axes. Without it a
# single movie yields a bare Axes object and indexing it would raise TypeError.
fig, axes = plt.subplots(nrows=num_movies, figsize=(10, 8 * num_movies), squeeze=False)
axes = axes[:, 0]  # single column of panels -> 1-D array, one Axes per movie

for ax, (movie, correlation) in zip(axes, overall_correlation_per_movie.items()):
    sns.heatmap(correlation, annot=True, cmap='viridis', ax=ax)
    ax.set_title(f"Movie {movie}")

    # Add caption below each plot (placed in axes coordinates, under the x axis)
    ax.text(0.5, -0.3, f"Parameters Tested: {', '.join(columns_of_interest)}",
            transform=ax.transAxes, fontsize=9, ha="center", wrap=True)

plt.tight_layout(pad=4.0)  # Increased padding to accommodate the caption
# plt.show()

In [None]:
# Randomly split the subjects into two disjoint groups: half of them (rounded
# down) go to group 1, everyone else to group 2.
# A dedicated seeded generator makes the split - and every figure derived from
# it - reproducible under Restart & Run All without touching the global
# `random` state. Set GROUP_SPLIT_SEED to None to draw a fresh split each run.
GROUP_SPLIT_SEED = 42
all_subjects = dirs
group_1 = random.Random(GROUP_SPLIT_SEED).sample(all_subjects, len(all_subjects) // 2)
group_2 = [subj for subj in all_subjects if subj not in group_1]


def get_aggregated_data_for_group(group, dataframes=None, columns=None):
    """Average each signal over the subjects of a group, movie after movie.

    For every movie the frames of the group's subjects are stacked and rows
    sharing an index label are averaged, giving one mean value per sample
    position. The per-movie results are appended in the iteration order of
    `dataframes`, so each returned list covers all movies back to back.

    Parameters
    ----------
    group : iterable of subject keys
        Keys into each per-movie mapping. Pass a list (it is traversed once per
        movie).
    dataframes : mapping, optional
        movie -> {subject -> DataFrame}. Defaults to the notebook-level
        `movie_dataframes`.
    columns : sequence of str, optional
        Columns to aggregate. Defaults to the notebook-level
        `columns_of_interest`.

    Returns
    -------
    dict
        column name -> list of group-mean values.

    Raises
    ------
    KeyError
        If a subject or column is missing from a movie's data.
    ValueError
        If `group` is empty (pandas has nothing to concatenate).
    """
    # Fall back to the notebook globals only when the caller supplies nothing,
    # so existing one-argument calls behave exactly as before.
    if dataframes is None:
        dataframes = movie_dataframes
    if columns is None:
        columns = columns_of_interest
    columns = list(columns)

    aggregated_data = {col: [] for col in columns}
    for subjects_data in dataframes.values():
        # One concat/groupby per movie covers every column at once; the mean is
        # still taken independently for each column.
        stacked = pd.concat([subjects_data[subject][columns] for subject in group])
        group_mean = stacked.groupby(level=0).mean()
        for column in columns:
            aggregated_data[column].extend(group_mean[column])
    return aggregated_data

# Group-mean series for each group (column name -> list of values)
group_1_data = get_aggregated_data_for_group(group_1)
group_2_data = get_aggregated_data_for_group(group_2)

# Tabulate the series per group, then correlate the signal columns with each
# other inside each group.
df_group_1 = pd.DataFrame(group_1_data)
corr_matrix_group_1 = df_group_1.corr()

df_group_2 = pd.DataFrame(group_2_data)
corr_matrix_group_2 = df_group_2.corr()

# Plain-text report: heading line followed by the matrix, for each group
print("Correlation Matrix for Group 1:", corr_matrix_group_1, sep="\n")
print("\nCorrelation Matrix for Group 2:", corr_matrix_group_2, sep="\n")

# Visualization: one plotly figure per signal, overlaying the two group means.

# Cumulative row counts taken from the first subject of each movie. After
# dropping the final total, the list holds the sample offset at which each
# movie begins (0, then the end of movies 1..n-1).
# NOTE(review): the lengths come from the first subject only - confirm that all
# recordings of a movie have the same number of rows.
movie_end_points = [0]
for movie_num, subjects_data in movie_dataframes.items():
    movie_end_points.append(movie_end_points[-1] + len(subjects_data[next(iter(subjects_data))]))
movie_end_points = movie_end_points[:-1]

for column in columns_of_interest:
    fig = go.Figure()
    fig.add_trace(go.Scatter(y=group_1_data[column], mode='lines', name=f'Group 1 - {column}'))
    fig.add_trace(go.Scatter(y=group_2_data[column], mode='lines', name=f'Group 2 - {column}'))

    # Vertical extent of the marker lines, computed once per column instead of
    # once per line. nanmin/nanmax ignore NaN samples; the builtin min()/max()
    # return order-dependent results as soon as a NaN is present.
    combined_values = np.concatenate([
        np.asarray(group_1_data[column], dtype=float),
        np.asarray(group_2_data[column], dtype=float),
    ])
    y_low = float(np.nanmin(combined_values))
    y_high = float(np.nanmax(combined_values))

    for end in movie_end_points:
        fig.add_shape(type="line", x0=end, x1=end, y0=y_low, y1=y_high,
                      line=dict(color="gray", width=1, dash="dot"))

    fig.update_layout(
        title=f'Comparison for {column}',
        xaxis_title='Sample Number',
        yaxis_title='Average Value',
        annotations=[
            dict(x=0.5, y=-0.3, showarrow=False, text="Gray dashed lines indicate the end of each movie.", xref="paper", yref="paper", font=dict(size=14)),
            dict(x=0.5, y=1.2, showarrow=False, text=f"Subjects in Group 1: {', '.join(group_1)}", xref="paper", yref="paper", font=dict(size=14)),
            dict(x=0.5, y=1.1, showarrow=False, text=f"Subjects in Group 2: {', '.join(group_2)}", xref="paper", yref="paper", font=dict(size=14))
        ],
        margin=dict(t=100, b=80)
    )
    # fig.show()


In [None]:
def pad_with_zeros(a, target_length):
    """Return `a` extended on the right with zeros up to `target_length` items.

    `a` is a 1-D sequence or array. The result is a new NumPy array; when `a`
    already has `target_length` items nothing is appended. A `target_length`
    shorter than `a` makes `np.zeros` raise ValueError.
    """
    n_missing = target_length - len(a)
    trailing_zeros = np.zeros(n_missing)
    return np.concatenate((a, trailing_zeros))

def get_padded_data(group_1_data, group_2_data):
    """Zero-pad the shorter of two 1-D series so both have the same length.

    Returns the pair `(group_1_data, group_2_data)` after padding each one to
    the length of the longer input via `pad_with_zeros`.
    """
    common_length = max(len(group_1_data), len(group_2_data))
    return (
        pad_with_zeros(group_1_data, common_length),
        pad_with_zeros(group_2_data, common_length),
    )

# Cross-group correlation matrix: entry [i, j] is the Pearson correlation
# between group 1's mean series for column i and group 2's mean series for
# column j.

# Aggregate each group once (the result does not depend on the column pair, so
# there is no reason to recompute it inside the double loop) and replace
# non-finite samples the way np.nan_to_num does (NaN -> 0).
group_1_series = {
    col: np.nan_to_num(values)
    for col, values in get_aggregated_data_for_group(group_1).items()
}
group_2_series = {
    col: np.nan_to_num(values)
    for col, values in get_aggregated_data_for_group(group_2).items()
}

# Initialize a matrix to store correlation coefficients
correlation_matrix = np.zeros((len(columns_of_interest), len(columns_of_interest)))

# Calculate the correlation for each combination of columns
for i, col_name_1 in enumerate(columns_of_interest):
    for j, col_name_2 in enumerate(columns_of_interest):
        # Local names on purpose: the dict-valued `group_1_data` / `group_2_data`
        # built earlier in the notebook are left intact.
        # NOTE(review): zero-padding the shorter series affects the coefficient
        # when the two groups have different total lengths.
        series_1, series_2 = get_padded_data(group_1_series[col_name_1], group_2_series[col_name_2])

        # The [0, 1] index from np.corrcoef() gives the correlation between the two sets
        correlation_matrix[i, j] = np.corrcoef(series_1, series_2)[0, 1]


# Plot the correlation matrix. The colour scale spans the full [-1, 1] range of
# a correlation coefficient; a lower bound of 0 would render every negative
# correlation with the same colour as "no correlation".
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, ax=ax, cmap='coolwarm', vmin=-1, vmax=1, square=True,
            xticklabels=columns_of_interest, yticklabels=columns_of_interest)
ax.set_title("Correlation Matrix - Group 1 vs. Group 2")
plt.tight_layout()
plt.show()

In [None]:

def pad_with_zeros(a, target_length):
    """Right-pad the 1-D sequence `a` with zeros until it has `target_length` items.

    Same definition as the `pad_with_zeros` in the preceding cell. Returns a
    new NumPy array; raises ValueError (from `np.zeros`) when `target_length`
    is smaller than `len(a)`.
    """
    shortfall = target_length - len(a)
    return np.concatenate([a, np.zeros(shortfall)])

def get_padded_data(group_1_data, group_2_data):
    """Bring two 1-D series to a common length by zero-padding the shorter one.

    Same definition as the `get_padded_data` in the preceding cell. Returns
    the two padded series as a pair, in the order they were passed.
    """
    longest = max(len(group_1_data), len(group_2_data))
    first_padded = pad_with_zeros(group_1_data, longest)
    second_padded = pad_with_zeros(group_2_data, longest)
    return first_padded, second_padded

def is_empty_or_all_zeros(data):
    """Return True when `data` has no elements or every element equals zero.

    Accepts any array-like, including plain Python lists. The input is turned
    into an array first: comparing a list to 0 with `==` yields the single
    value False rather than an element-wise result, so an all-zero list would
    otherwise never be recognised.
    """
    values = np.asarray(data)
    return bool(values.size == 0 or np.all(values == 0))

def plot_correlation(group_1_data, group_2_data, data_name):
    """Show the 2x2 correlation matrix of two group series as a heatmap.

    The series are first brought to a common length with `get_padded_data`,
    then `np.corrcoef` is evaluated on the pair. `data_name` appears in the
    figure title. The figure is displayed with `plt.show()`; nothing is
    returned.
    """
    padded_1, padded_2 = get_padded_data(group_1_data, group_2_data)
    corr_matrix = np.corrcoef(padded_1, padded_2)

    # Tick positions 0.5 and 1.5 are the centres of the two heatmap cells.
    cell_centres = [0.5, 1.5]
    group_labels = ['Group 1', 'Group 2']

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(corr_matrix, annot=True, ax=ax, cmap='coolwarm', vmin=-1, vmax=1, square=True)
    ax.set_title(f"{data_name} Correlation - Group 1 vs. Group 2")

    ax.set_xticks(cell_centres)
    ax.set_yticks(cell_centres)
    ax.set_xticklabels(group_labels)
    ax.set_yticklabels(group_labels)

    plt.tight_layout()
    plt.show()


# Per-signal comparison: correlate group 1's mean series with group 2's mean
# series for the same column and plot the resulting 2x2 matrix.

# Aggregate each group once; the result does not depend on the column.
aggregated_group_1 = get_aggregated_data_for_group(group_1)
aggregated_group_2 = get_aggregated_data_for_group(group_2)

for col_name in columns_of_interest:
    # Replace NaN values with 0 and convert to numpy array.
    # np.nan_to_num tests the values themselves; an identity test such as
    # `x is np.nan` never matches the NaN floats that come out of pandas,
    # so it would leave every NaN in place.
    series_1 = np.nan_to_num(np.asarray(aggregated_group_1[col_name], dtype=float))
    series_2 = np.nan_to_num(np.asarray(aggregated_group_2[col_name], dtype=float))

    if is_empty_or_all_zeros(series_1) or is_empty_or_all_zeros(series_2):
        print(f"Skipped plotting for column '{col_name}' as it's empty or contains only zeros.")
    else:
        plot_correlation(series_1, series_2, col_name)


In [None]:


# Passed as `fs` to the low-pass filter below (presumably Hz - confirm against
# the acquisition settings).
sampling_rate = 300
# Raw channel columns read from each *_opensignals.csv file.
raw_signals = ["BVP", "EDA", "PZT"]

def plot_original_vs_cleaned_z_scores(original_z, cleaned_z, signal_name, subject_name, movie_num):
    """Overlay the z-scored original and cleaned versions of one signal.

    Parameters
    ----------
    original_z, cleaned_z : array-like
        Values plotted against their sample position (red and green lines).
    signal_name : str
        Channel name shown in the title.
    subject_name : str or int
        Subject identifier. A value that already starts with "subject"
        (case-insensitive), such as the folder name "Subject 1", is used as is;
        anything else is prefixed with "Subject ".
    movie_num : int
        Movie number shown in the title.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    # The caller passes folder names like "Subject 1"; prefixing those again
    # would render "Subject Subject 1" in the title.
    subject_label = str(subject_name)
    if not subject_label.lower().startswith("subject"):
        subject_label = f"Subject {subject_label}"

    # Single figure holding both traces
    fig = go.Figure()

    # Add the original z-scored plot
    fig.add_trace(
        go.Scatter(y=original_z, mode='lines', name='Original z-scored', line=dict(color='red'))
    )

    # Add the cleaned z-scored plot
    fig.add_trace(
        go.Scatter(y=cleaned_z, mode='lines', name='Cleaned z-scored', line=dict(color='green'))
    )

    # Layout customization
    fig.update_layout(
        title=f'{subject_label} - Movie {movie_num} - {signal_name}',
        xaxis_title="Data Points",
        yaxis_title="Value",
        legend_title="Legend"
    )

    # Display the figure
    fig.show()

# Clean the raw physiological channels of every recording: centre, low-pass
# filter, z-score, store the result as "<channel>_cleaned" in the same CSV file
# and plot it against the z-scored original.
root_dir = os.getcwd()
# Every entry of the working directory whose name contains "subject" (any case)
subject_dirs = [entry for entry in os.listdir(root_dir) if 'subject' in entry.lower()]

for subject in subject_dirs:
    subject_path = os.path.join(root_dir, subject)

    # `num_movies` is not defined in this cell; it is assigned in the earlier
    # heatmap cell as len(overall_correlation_per_movie).
    for i in range(1, num_movies + 1):
        filepath_opensignals = os.path.join(subject_path, f"movie_{i}_opensignals.csv")

        # Guard clause: report the missing recording and move on to the next movie
        if not os.path.exists(filepath_opensignals):
            print(f"File not found: {filepath_opensignals}")
            continue

        df = pd.read_csv(filepath_opensignals)

        for column in raw_signals:
            signal = df[column].values
            centered = signal - np.mean(signal)

            # z-score of the untouched signal
            original_z = centered / np.std(signal)

            # Low-pass filter the centred signal; PZT and BVP use f=1, any other
            # channel f=0.5
            f_value = {'PZT': 1, 'BVP': 1}.get(column, 0.5)
            signal_cleaned = bsnb.lowpass(centered, f=f_value, order=3, fs=sampling_rate)

            # z-score of the filtered signal
            cleaned_z = (signal_cleaned - np.mean(signal_cleaned)) / np.std(signal_cleaned)

            # Keep the cleaned, z-scored channel next to the raw one
            df[f"{column}_cleaned"] = cleaned_z

            # One figure per subject / movie / channel
            plot_original_vs_cleaned_z_scores(original_z, cleaned_z, column, subject, i)

        # Write the frame, including the new columns, back to the same file
        df.to_csv(filepath_opensignals, index=False)
