In [33]:
import pandas as pd
import numpy as np
import statistics

## BEBRASK DATASET COUNT

In [34]:
# Load the raw BEBRASK task file, keep only the variables relevant for the analysis,
# and discard every row that has no trial number.
BEBRASK_tasks = (
    pd.read_excel('../Datasets/BEBRASK_task.xlsx')
    .loc[:, ["DataFile.Basename", "Rating0.RESP", "Rating.RESP", "EvokedEmotion",
             "Expression", "ScenarioPick", "TrialCount", "Fulfilled"]]
    .dropna(subset=["TrialCount"])
)

# Distinct subject identifiers that remain after the filtering above
subjects_id = BEBRASK_tasks["DataFile.Basename"].unique()

In [35]:
# We pivot based on the trial count, so that each row corresponds to 1 subject
BEBRASK_tasks_long = BEBRASK_tasks.pivot(columns='TrialCount', index="DataFile.Basename")

# Flatten the two-level column labels (variable, trial) into '<variable>_<trial>' strings
BEBRASK_tasks_long.columns = [
    '_'.join(map(str, label_pair))
    for label_pair in BEBRASK_tasks_long.columns.values
]
BEBRASK_tasks_long.head(5)

Unnamed: 0_level_0,Rating0.RESP_1.0,Rating0.RESP_2.0,Rating0.RESP_3.0,Rating0.RESP_4.0,Rating0.RESP_5.0,Rating0.RESP_6.0,Rating0.RESP_7.0,Rating0.RESP_8.0,Rating0.RESP_9.0,Rating0.RESP_10.0,...,Fulfilled_36.0,Fulfilled_37.0,Fulfilled_38.0,Fulfilled_39.0,Fulfilled_40.0,Fulfilled_41.0,Fulfilled_42.0,Fulfilled_43.0,Fulfilled_44.0,Fulfilled_45.0
DataFile.Basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PREDWELL_RETOS-1001-1,3.0,3.0,1.0,4.0,2.0,3.0,4.0,1.0,3.0,4.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
PREDWELL_RETOS-1002-1,3.0,3.0,1.0,1.0,2.0,1.0,3.0,4.0,3.0,2.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
PREDWELL_RETOS-1003-1,2.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,2.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
PREDWELL_RETOS-1004-1,2.0,3.0,4.0,4.0,1.0,3.0,1.0,1.0,3.0,2.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
PREDWELL_RETOS-1005-1,2.0,1.0,2.0,1.0,1.0,4.0,3.0,4.0,1.0,3.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0


In [36]:
# Independent snapshot of the wide table (used later for the RETOS/BEBRASK merge)
BEBRASK_tasks_long_copy = BEBRASK_tasks_long.copy()

# NOTE(review): index=False leaves the DataFile.Basename index (the subject id) out of the
# exported file, so rows cannot be traced back to subjects — confirm this is intended.
BEBRASK_tasks_long.to_excel(
    '../Clustering_Predictive_Processing/BEBRASK_long.xlsx',
    index=False,
)

In [37]:
import numpy as np

def creation_dictionary_from_df(df, n_subjects, n_trials=45):
    """
    Group each subject's per-trial data by the emotion evoked in that trial.

    The DataFrame is expected in wide format: one row per subject and, for every trial
    number i, the columns 'Rating0.RESP_{i}.0', 'Rating.RESP_{i}.0', 'EvokedEmotion_{i}.0',
    'Expression_{i}.0' and 'Fulfilled_{i}.0'.

    Parameters:
    - df (pandas.DataFrame): Wide-format data, one row per subject; the row index is used
      as the subject identifier.
    - n_subjects (int): Number of rows (taken from the top of df) to process.
    - n_trials (int): Number of trials per subject; trials 1..n_trials are read.
      Defaults to 45.

    Returns:
    - dict_df (dict): Maps each subject identifier to a dictionary with the keys 'Happy',
      'Sad' and 'Fear'. Each of those maps to a dictionary with the keys 'Rating0'
      (Rating0.RESP), 'Rating' (Rating.RESP), 'Expression' (Expression) and 'Fulfill'
      (Fulfilled), each holding a 1-D object array with one entry per trial of that
      emotion, in trial order. Missing ratings are stored as None.

    Notes:
    - A trial is assigned to 'Happy' when its EvokedEmotion is "happiness" and to 'Sad'
      when it is "sadness". Every other value, including a missing one, is assigned to
      'Fear'.
    """
    def _none_if_missing(value):
        # pd.isna also copes with None and strings, where np.isnan raises TypeError
        return None if pd.isna(value) else value

    emotion_keys = ('Happy', 'Sad', 'Fear')
    field_keys = ('Rating0', 'Rating', 'Expression', 'Fulfill')

    # Initialize the dictionary to hold the processed data
    dict_df = {}

    # Loop through each subject up to the specified number
    for j in range(0, n_subjects):
        # Extract the subject data and their identifier
        subject = df.iloc[j]
        subject_id = df.index.values[j]

        # One list per (emotion, field) pair, filled in trial order
        collected = {
            emotion: {field: [] for field in field_keys}
            for emotion in emotion_keys
        }

        # Loop through each trial (1 to n_trials)
        for i in range(1, n_trials + 1):
            evoked = subject[f'EvokedEmotion_{i}.0']

            # Categorize the trial based on the evoked emotion
            if evoked == "happiness":
                target = collected['Happy']
            elif evoked == "sadness":
                target = collected['Sad']
            else:  # Default case for fear (also catches missing/unknown labels)
                target = collected['Fear']

            target['Rating0'].append(_none_if_missing(subject[f'Rating0.RESP_{i}.0']))
            target['Rating'].append(_none_if_missing(subject[f'Rating.RESP_{i}.0']))
            # Expression and fulfillment values are stored as-is (NaN is kept)
            target['Expression'].append(subject[f'Expression_{i}.0'])
            target['Fulfill'].append(subject[f'Fulfilled_{i}.0'])

        # Assign the processed data to the dictionary for the current subject
        dict_df[subject_id] = {
            emotion: {
                field: np.array(values, dtype=object)
                for field, values in fields.items()
            }
            for emotion, fields in collected.items()
        }

    return dict_df

# Build the per-subject emotion dictionary from every row of the BEBRASK wide table
subject_dict_BEBRASK = creation_dictionary_from_df(
    df=BEBRASK_tasks_long,
    n_subjects=BEBRASK_tasks_long.shape[0],
)


## RETOS DATASET COUNT


In [38]:
# Load the raw RETOS task file, keep only the variables relevant for the analysis,
# and discard every row that has no trial number.
RETOS_tasks = (
    pd.read_excel('../Datasets/RETOS_task.xlsx')
    .loc[:, ["DataFile.Basename", "Rating0.RESP", "Rating.RESP", "EvokedEmotion",
             "Expression", "ScenarioPick", "TrialCount", "Fulfilled"]]
    .dropna(subset=["TrialCount"])
)

# Distinct subject identifiers that remain after the filtering above
subjects_id = RETOS_tasks["DataFile.Basename"].unique()

In [39]:
# Pivot on the trial count so that each row corresponds to one subject
RETOS_tasks_long = RETOS_tasks.pivot(columns='TrialCount', index="DataFile.Basename")

# Flatten the two-level column labels (variable, trial) into '<variable>_<trial>' strings
RETOS_tasks_long.columns = [
    '_'.join(map(str, label_pair))
    for label_pair in RETOS_tasks_long.columns.values
]

RETOS_tasks_long.head(5)

Unnamed: 0_level_0,Rating0.RESP_1.0,Rating0.RESP_2.0,Rating0.RESP_3.0,Rating0.RESP_4.0,Rating0.RESP_5.0,Rating0.RESP_6.0,Rating0.RESP_7.0,Rating0.RESP_8.0,Rating0.RESP_9.0,Rating0.RESP_10.0,...,Fulfilled_36.0,Fulfilled_37.0,Fulfilled_38.0,Fulfilled_39.0,Fulfilled_40.0,Fulfilled_41.0,Fulfilled_42.0,Fulfilled_43.0,Fulfilled_44.0,Fulfilled_45.0
DataFile.Basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PREDWELL_RETOS-1-1,1.0,3.0,4.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
PREDWELL_RETOS-10-1,4.0,2.0,2.0,4.0,3.0,4.0,3.0,1.0,1.0,3.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
PREDWELL_RETOS-101-1,2.0,3.0,3.0,4.0,4.0,1.0,1.0,4.0,3.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-102-1,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,4.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-103-1,3.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,2.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [40]:
# Data from PREDWELL_RETOS-307-1 and PREDWELL_RETOS-307-3 are from the same subject,
# therefore they are combined and treated as a single row: values already present in the
# -1 row are kept, and its missing values are filled from the -3 row.
primary_id = 'PREDWELL_RETOS-307-1'
duplicate_id = 'PREDWELL_RETOS-307-3'

# Only merge while both rows exist, so re-running this cell after the duplicate row has
# been removed is a harmless no-op instead of assigning an empty result.
if primary_id in RETOS_tasks_long.index and duplicate_id in RETOS_tasks_long.index:
    combined_row = RETOS_tasks_long.loc[primary_id].combine_first(
        RETOS_tasks_long.loc[duplicate_id]
    )
    # Label-based assignment: values are aligned on the column names
    RETOS_tasks_long.loc[primary_id] = combined_row

    # We delete the excess row from the combination.
    RETOS_tasks_long = RETOS_tasks_long.drop(index=duplicate_id)


In [41]:
# Independent snapshot of the wide table (used later for the RETOS/BEBRASK merge)
RETOS_tasks_long_copy = RETOS_tasks_long.copy()

# NOTE(review): index=False leaves the DataFile.Basename index (the subject id) out of the
# exported file, so rows cannot be traced back to subjects — confirm this is intended.
RETOS_tasks_long.to_excel(
    '../Clustering_Predictive_Processing/RETOS_long.xlsx',
    index=False,
)


## BEBRASK & RETOS Long Merge


In [42]:
# Stack the two wide tables row-wise: RETOS subjects first, BEBRASK subjects after them
RETOS_BEBRASK_long = pd.concat(objs=[RETOS_tasks_long_copy, BEBRASK_tasks_long_copy])

# index=True keeps the row index (subject id) as the first column of the exported sheet
RETOS_BEBRASK_long.to_excel(
    '../Clustering_Predictive_Processing/RETOS_BEBRASK_long.xlsx',
    index=True,
)
RETOS_BEBRASK_long.head(5)

Unnamed: 0_level_0,Rating0.RESP_1.0,Rating0.RESP_2.0,Rating0.RESP_3.0,Rating0.RESP_4.0,Rating0.RESP_5.0,Rating0.RESP_6.0,Rating0.RESP_7.0,Rating0.RESP_8.0,Rating0.RESP_9.0,Rating0.RESP_10.0,...,Fulfilled_36.0,Fulfilled_37.0,Fulfilled_38.0,Fulfilled_39.0,Fulfilled_40.0,Fulfilled_41.0,Fulfilled_42.0,Fulfilled_43.0,Fulfilled_44.0,Fulfilled_45.0
DataFile.Basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PREDWELL_RETOS-1-1,1.0,3.0,4.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
PREDWELL_RETOS-10-1,4.0,2.0,2.0,4.0,3.0,4.0,3.0,1.0,1.0,3.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
PREDWELL_RETOS-101-1,2.0,3.0,3.0,4.0,4.0,1.0,1.0,4.0,3.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-102-1,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,4.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-103-1,3.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,2.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [43]:
# Display the merged wide table (RETOS rows followed by BEBRASK rows) as the cell output
RETOS_BEBRASK_long

Unnamed: 0_level_0,Rating0.RESP_1.0,Rating0.RESP_2.0,Rating0.RESP_3.0,Rating0.RESP_4.0,Rating0.RESP_5.0,Rating0.RESP_6.0,Rating0.RESP_7.0,Rating0.RESP_8.0,Rating0.RESP_9.0,Rating0.RESP_10.0,...,Fulfilled_36.0,Fulfilled_37.0,Fulfilled_38.0,Fulfilled_39.0,Fulfilled_40.0,Fulfilled_41.0,Fulfilled_42.0,Fulfilled_43.0,Fulfilled_44.0,Fulfilled_45.0
DataFile.Basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PREDWELL_RETOS-1-1,1.0,3.0,4.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
PREDWELL_RETOS-10-1,4.0,2.0,2.0,4.0,3.0,4.0,3.0,1.0,1.0,3.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
PREDWELL_RETOS-101-1,2.0,3.0,3.0,4.0,4.0,1.0,1.0,4.0,3.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-102-1,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,4.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-103-1,3.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,2.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PREDWELL_RETOS-1124-1,2.0,2.0,3.0,1.0,3.0,3.0,1.0,1.0,2.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
PREDWELL_RETOS-1125-1,1.0,,3.0,3.0,3.0,1.0,4.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
PREDWELL_RETOS-1126-1,2.0,3.0,4.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
PREDWELL_RETOS-1127-1,1.0,3.0,2.0,3.0,1.0,4.0,4.0,4.0,1.0,3.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


## Joining Scales RETOS & BEBRASK

In [44]:
# Load the scales spreadsheets of both studies (the two reads are independent)
RETOS_scales = pd.read_excel('../Datasets/RETOS_scales.xlsx')
BEBRASK_scales = pd.read_excel('../Datasets/BEBRASK_scales.xlsx')


In [45]:
# Largest number of NaN values a column may contain and still be kept
MAX_NANS_PER_COLUMN = 50

# Stack the RETOS and BEBRASK scales; ignore_index gives the combined table a fresh
# 0..n-1 row index right away instead of resetting it afterwards.
RETOS_BEBRASK_scales = pd.concat([RETOS_scales, BEBRASK_scales], ignore_index=True)

cols = RETOS_BEBRASK_scales.columns

# We convert all data to numeric, and change to NaN those values that can not be converted.
# The first three columns are left untouched (presumably identifier columns — confirm
# against the spreadsheets).
RETOS_BEBRASK_scales[cols[3:]] = RETOS_BEBRASK_scales[cols[3:]].apply(pd.to_numeric, errors='coerce')
nan_counts = RETOS_BEBRASK_scales.isna().sum()

# Identify columns where the count of NaN is less than or equal to the threshold
columns_with_fewer_nans = nan_counts[nan_counts <= MAX_NANS_PER_COLUMN].index.tolist()

# Filter the DataFrame to include only these columns. The explicit .copy() makes the result
# an independent frame, so adding a column below does not raise SettingWithCopyWarning.
RETOS_BEBRASK_scales = RETOS_BEBRASK_scales[columns_with_fewer_nans].copy()

# ASI_T is the sum of the three ASI sub-scores (NaN in any of them yields NaN)
RETOS_BEBRASK_scales["ASI_T"] = (
    RETOS_BEBRASK_scales["ASI_P"]
    + RETOS_BEBRASK_scales["ASI_C"]
    + RETOS_BEBRASK_scales["ASI_S"]
)


In [46]:

# Persist the combined scales table; index=True writes the row index as the first column
RETOS_BEBRASK_scales.to_excel(
    '../Clustering_Predictive_Processing/scales.xlsx',
    index=True,
)
