In [2]:
import pandas as pd 

In [3]:
# Read the three DailyDialog splits (train, test, validation) in one pass.
train_DailyDialog_df, test_DailyDialog_df, val_DailyDialog_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../DailyDialog_DATA/train.csv',
        '../DailyDialog_DATA/test.csv',
        '../DailyDialog_DATA/validation.csv',
    )
)

In [4]:
import re

def clean_and_segment_dialog(dialog):
    """
    Convert the string form of a conversation into a list of individual turns.

    The input is expected to look like a bracketed sequence of quoted strings,
    e.g. ``"['Hi . ' \\"It's me .\\"]"``. Each quoted segment (single- or
    double-quoted) becomes one turn.

    Backslash escapes inside a quoted segment are honoured: an escaped quote
    (``\\'`` or ``\\"``) does not terminate the segment, and ``\\'``, ``\\"``
    and ``\\\\`` are unescaped in the returned text. This matters for turns
    containing both quote characters, which Python's ``repr`` writes as
    ``'... \\' ...'``; without it such a turn is split in two and every later
    quote in the dialogue is mis-paired.
    NOTE(review): this assumes the column was serialised with Python/numpy
    ``repr`` quoting -- confirm against the raw CSV.

    :param dialog: String representation of one dialogue.
    :return: List of turn strings with surrounding whitespace removed.
    """
    # Collapse newlines so a dialogue that spans several lines is scanned as one.
    dialog = dialog.replace("\n", " ").strip()

    # Drop the enclosing list brackets when both are present.
    if dialog.startswith("[") and dialog.endswith("]"):
        dialog = dialog[1:-1]

    # One alternative per quote style. Inside a segment, either consume a
    # character that is neither the closing quote nor a backslash, or consume
    # a backslash together with the character it escapes.
    quoted_segments = re.findall(
        r"'((?:[^'\\]|\\.)+)'|\"((?:[^\"\\]|\\.)+)\"",
        dialog,
    )

    turns = []
    for single_quoted, double_quoted in quoted_segments:
        # Exactly one of the two groups is non-empty for any given match.
        raw_turn = single_quoted or double_quoted
        # Undo repr-style escaping of quotes and backslashes only.
        unescaped = re.sub(r"\\(['\"\\])", r"\1", raw_turn)
        turns.append(unescaped.strip())

    return turns


In [5]:
# Replace each split's raw 'dialog' string with the list of turns returned by
# clean_and_segment_dialog. The column is overwritten in place, so this cell is
# not re-runnable as-is: on a second run the values are already lists, and the
# function calls string methods (.replace / .startswith) on its input.
train_DailyDialog_df['dialog'] = train_DailyDialog_df['dialog'].apply(clean_and_segment_dialog)
test_DailyDialog_df['dialog'] = test_DailyDialog_df['dialog'].apply(clean_and_segment_dialog)
val_DailyDialog_df['dialog'] = val_DailyDialog_df['dialog'].apply(clean_and_segment_dialog)



In [6]:
# Preview the first rows of the training split after the 'dialog' column was segmented.
train_DailyDialog_df.head()

Unnamed: 0,dialog,act,emotion
0,"[Say , Jim , how about going for a few beers a...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4]
1,"[Can you do push-ups ?, Of course I can . It's...",[2 1 2 2 1 1],[0 0 6 0 0 0]
2,"[Can you study with the radio on ?, No , I lis...",[2 1 2 1 1],[0 0 0 0 0]
3,"[Are you all right ?, I will be all right soon...",[2 1 1 1],[0 0 0 0]
4,"[Hey John , nice skates . Are they new ?, Yeah...",[2 1 2 1 1 2 1 3 4],[0 0 0 0 0 6 0 6 0]


In [7]:
def split_dialog_into_turns(df):
    """
    Flatten a per-dialogue DataFrame into one row per turn.

    Each input row contributes ``len(row['dialog'])`` output rows. A dialogue
    whose number of turns differs from its number of emotion labels cannot be
    aligned and is skipped; processing then continues with the next dialogue.
    The number of skipped dialogues is printed.

    :param df: DataFrame with a 'dialog' column (a sequence of turn strings per
        row) and an 'emotion' column. An emotion value may be a sequence of
        labels or a bracketed, whitespace-separated string such as "[0 0 4]".
    :return: DataFrame with columns 'dialogue_id', 'turn' and 'emotion'.
        'dialogue_id' starts at 0 and increases by one per *kept* dialogue, so
        the ids are contiguous even when dialogues are skipped. The columns are
        present even when no dialogue is kept.
    """
    output_columns = ['dialogue_id', 'turn', 'emotion']
    new_data = []
    dialogue_id = 0  # Id assigned to the next dialogue that is kept
    num_mismatch = 0

    for turns, emotions in zip(df['dialog'], df['emotion']):
        # Parse the "[0 0 4]"-style string form into a list of ints.
        if isinstance(emotions, str):
            emotions = [int(e) for e in emotions.strip('[]').split()]

        # Skip only this dialogue when turns and labels cannot be paired up.
        # (Stopping the loop here would silently drop every later dialogue.)
        if len(turns) != len(emotions):
            num_mismatch += 1
            continue

        # One output row per (turn, emotion) pair.
        new_data.extend(
            {'dialogue_id': dialogue_id, 'turn': turn, 'emotion': emotion}
            for turn, emotion in zip(turns, emotions)
        )

        dialogue_id += 1

    print(num_mismatch)
    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(new_data, columns=output_columns)


In [8]:
# Expand every split from one row per dialogue to one row per turn.
# Each call prints its own mismatch count, in train / test / validation order.
train_DailyDialog_df, test_DailyDialog_df, val_DailyDialog_df = (
    split_dialog_into_turns(dialogue_frame)
    for dialogue_frame in (train_DailyDialog_df, test_DailyDialog_df, val_DailyDialog_df)
)

1
0
1


In [9]:
def append_alternating_speaker_ids(df, column_name='speaker_id'):
    """
    Tag every row with a speaker id that flips between 0 and 1.

    The pattern starts at 0 on the first row and runs across the whole frame
    by row position; it does not restart for any grouping of rows. The column
    is written onto ``df`` itself.

    :param df: DataFrame that receives the new column (modified in place).
    :param column_name: Label of the column to write (default is 'speaker_id').
    :return: The same DataFrame object, now carrying the new column.
    """
    row_count = len(df)
    # Repeat the (0, 1) pair often enough to cover every row, then trim.
    pattern = ([0, 1] * ((row_count + 1) // 2))[:row_count]
    df[column_name] = pattern
    return df

In [10]:
# Add the alternating 0/1 'speaker_id' column to each per-turn split.
train_DailyDialog_df, test_DailyDialog_df, val_DailyDialog_df = (
    append_alternating_speaker_ids(turn_frame)
    for turn_frame in (train_DailyDialog_df, test_DailyDialog_df, val_DailyDialog_df)
)

In [11]:
# Preview the per-turn training frame, now including the speaker_id column.
train_DailyDialog_df.head()

Unnamed: 0,dialogue_id,turn,emotion,speaker_id
0,0,"Say , Jim , how about going for a few beers af...",0,0
1,0,You know that is tempting but is really not go...,0,1
2,0,What do you mean ? It will help us to relax .,0,0
3,0,Do you really think so ? I don't . It will jus...,0,1
4,0,I guess you are right.But what shall we do ? I...,0,0


In [27]:
# Import the project-local embedding helpers, then force a reload so edits made
# to embedding_functions.py since the first import are picked up without
# restarting the kernel.
import importlib
import embedding_functions
importlib.reload(embedding_functions)

<module 'embedding_functions' from '/home/manuel.nunez/VANESSA/GroupCohesionPrediction/graphs/embedding_functions.py'>

In [28]:
def create_dialogue_graphs(df, embedding_dim=768, embedding_func=None):
    """
    Build, for every dialogue, a list of per-turn graphs.

    Rows are grouped by 'dialogue_id'. For each turn (row) one graph dict is
    produced with:

    * ``'X'``: a dict of nodes keyed ``f'{speaker_id}_{row_index}'``, one node
      per participant of the dialogue. The node of the turn's speaker holds the
      turn embedding and has edges to every other participant's node for the
      same row; every other node holds the null embedding and has no edges.
    * ``'Y'``: the turn's 'emotion' value.

    :param df: DataFrame with 'dialogue_id', 'speaker_id', 'turn' and 'emotion'
        columns. The row index label is used in the node keys.
    :param embedding_dim: Length of the all-zero placeholder embedding given to
        non-speaking participants (default 768).
        NOTE(review): should match the length returned by the embedding
        function -- confirm against embedding_functions.embedding_func.
    :param embedding_func: Callable mapping a turn's text to its embedding.
        Defaults to ``embedding_functions.embedding_func``.
    :return: List (one entry per dialogue) of lists (one entry per turn) of
        graph dicts as described above.
    """
    if embedding_func is None:
        embedding_func = embedding_functions.embedding_func

    # A single placeholder list is shared by every non-speaker node. Treat it
    # as read-only downstream: mutating one node's null embedding would affect
    # all of them.
    null_embedding = [0] * embedding_dim

    dialogue_graphs = []

    for _, dialogue in df.groupby('dialogue_id'):

        participants = dialogue['speaker_id'].unique()
        G = []

        for index, row in dialogue.iterrows():
            speaker = row['speaker_id']
            nodes = {}

            # One node per participant for this turn, in order of first appearance.
            for p in participants:
                if p == speaker:
                    speaker_embedding = embedding_func(row['turn'])
                    speaker_edges = [f'{node}_{index}' for node in participants if node != speaker]
                    nodes[f'{p}_{index}'] = {'embedding': speaker_embedding, 'edges': speaker_edges}
                else:
                    nodes[f'{p}_{index}'] = {'embedding': null_embedding, 'edges': []}

            G.append({'X': nodes, 'Y': row['emotion']})

        dialogue_graphs.append(G)

    return dialogue_graphs

In [29]:
# Build the per-turn graphs for each split (validation first, then train, then test).
(
    DailyDialog_val_dialogue_graphs,
    DailyDialog_train_dialogue_graphs,
    DailyDialog_test_dialogue_graphs,
) = (
    create_dialogue_graphs(turn_frame)
    for turn_frame in (val_DailyDialog_df, train_DailyDialog_df, test_DailyDialog_df)
)

In [31]:
import os
import pickle

# Make sure the output directory exists before writing, so that a fresh
# checkout does not fail with FileNotFoundError after the graphs were built.
os.makedirs('data/ERC/DailyDialog', exist_ok=True)

# Persist each split's dialogue graphs as a separate pickle file.
with open('data/ERC/DailyDialog/val_dialogue_graphs.pkl', 'wb') as f:
    pickle.dump(DailyDialog_val_dialogue_graphs, f)

with open('data/ERC/DailyDialog/train_dialogue_graphs.pkl', 'wb') as f:
    pickle.dump(DailyDialog_train_dialogue_graphs, f)

with open('data/ERC/DailyDialog/test_dialogue_graphs.pkl', 'wb') as f:
    pickle.dump(DailyDialog_test_dialogue_graphs, f)