In [1]:
import os
import pandas as pd
import numpy as np
import json

In [28]:
meta_df = pd.read_csv('MEISD/MEISD_text.csv')

In [29]:
meta_df.shape

(20017, 15)

In [30]:
meta_df.head(3)

Unnamed: 0,TV Series,Utterances,dialog_ids,uttr_ids,seasons,episodes,start_times,end_times,sentiment,emotion,intensity,emotion2,intensity2,emotion3,intensity3
0,GA,look around you,1,0,1,1,00:02:27:589,00:02:28:567,neutral,neutral,,,,,
1,GA,say hello to your competition,1,1,1,1,00:02:28:910,00:02:30:513,neutral,neutral,,,,,
2,GA,eight of you will switch to an easier specialty,1,2,1,1,00:02:31:387,00:02:34:060,neutral,neutral,,,,,


In [31]:
# For the multilabel classification we use:
columns = ['Utterances', 'sentiment', 'emotion', 'intensity', 'emotion2', 'intensity2', 'emotion3', 'intensity3']
meta_dfs = meta_df[columns].copy()

In [32]:
# Define emotion and sentiment mappings
emotion_map = {
    'neutral': 0,
    'acceptance': 1,
    'disgust': 2,
    'surprise': 3,
    'joy': 4,
    'sadness': 5,
    'anger': 6,
    'like': 7,
    'fear': 8
}

sentiment_map = {
    'positive': 1,
    'negative': 0,
    'neutral': 2
}

In [33]:
meta_dfs.loc[:, 'sentiment'] = pd.to_numeric(meta_dfs['sentiment'].map(sentiment_map).fillna(2).astype(int))  # Use 2 as a default for unknown
meta_dfs.loc[:, 'emotion'] = pd.to_numeric(meta_dfs['emotion'].map(emotion_map))
meta_dfs.loc[:, 'emotion2'] = pd.to_numeric(meta_dfs['emotion2'].map(emotion_map))
meta_dfs.loc[:, 'emotion3'] = pd.to_numeric(meta_dfs['emotion3'].map(emotion_map))

# Convert intensity columns to numeric
meta_dfs.loc[:, 'intensity'] = pd.to_numeric(meta_dfs['intensity'], errors='coerce')
meta_dfs.loc[:, 'intensity2'] = pd.to_numeric(meta_dfs['intensity2'], errors='coerce')
meta_dfs.loc[:, 'intensity3'] = pd.to_numeric(meta_dfs['intensity3'], errors='coerce')

In [36]:
# Create new columns
# 1. Combined sentiment column (0-2)
meta_dfs['combined_sentiment'] = meta_dfs['sentiment']

# 2. Combined emotions column (as a list of int)
meta_dfs['combined_emotions'] = meta_dfs[['emotion', 'emotion2', 'emotion3']].apply(lambda x: x.dropna().unique().astype(int).tolist(), axis=1)

# 3. Combined intensity column (as a list of int)
meta_dfs['combined_intensities'] = meta_dfs[['intensity', 'intensity2', 'intensity3']].apply(lambda x: x.dropna().unique().astype(int).tolist(), axis=1)

In [37]:
meta_dfs.head(50)

Unnamed: 0,Utterances,sentiment,emotion,intensity,emotion2,intensity2,emotion3,intensity3,combined_sentiment,combined_emotions,combined_intensities
0,look around you,2,0.0,,,,,,2,[0],[]
1,say hello to your competition,2,0.0,,,,,,2,[0],[]
2,eight of you will switch to an easier specialty,2,0.0,,,,,,2,[0],[]
3,five of you will crack under the pressure,2,0.0,,,,,,2,[0],[]
4,two of you will be asked to leave,2,0.0,,,,,,2,[0],[]
5,this is your starting line,2,0.0,,,,,,2,[0],[]
6,this is your arena,2,0.0,,,,,,2,[0],[]
7,how well you play,2,0.0,,,,,,2,[0],[]
8,that's up to you,2,0.0,,,,,,2,[0],[]
9,like i said,2,1.0,1.0,2.0,1.0,,,2,"[1, 2]",[1]


In [41]:
meta_dfs['intensity3'].unique()

array([nan,  2.,  1.,  3.])

In [56]:
# Create separate binary vectors for sentiment, emotions, and intensity
def create_binary_vectors(row):
    # Initialize binary vector for sentiment (3 values)
    sentiment_vector = [0, 0, 0]  # For sentiment values 0, 1, and 2
    if pd.notna(row['sentiment']) and row['sentiment'] < 3:  # Check if sentiment is not NaN and within range
        sentiment_vector[int(row['sentiment'])] = 1

    # Initialize binary vector for emotions (9 values)
    emotion_vector = [0] * 9  # For 9 emotion values
    for emotion in [row['emotion'], row['emotion2'], row['emotion3']]:
        if pd.notna(emotion) and 0 <= emotion < len(emotion_vector):  # Check if emotion is not NaN and within range
            emotion_vector[int(emotion)] = 1

    # Initialize binary vector for intensity (adjust size based on your specific range)
    intensity_vector = [0] * 4  # Assuming intensity values are 0 to 3; adjust as necessary
    for intensity in [row['intensity'], row['intensity2'], row['intensity3']]:
        if pd.notna(intensity) and 0 <= intensity < len(intensity_vector):  # Check if intensity is not NaN and within range
            intensity_vector[int(intensity)] = 1

    # Return separate vectors
    return {
        'sentiment_vector': sentiment_vector,
        'emotion_vector': emotion_vector,
        'intensity_vector': intensity_vector
    }

# Apply the function and create new columns for the vectors
binary_vectors = meta_df.apply(create_binary_vectors, axis=1)

# Expand the dictionary into separate DataFrame columns
meta_dfs['sentiment_vector'] = binary_vectors.apply(lambda x: x['sentiment_vector'])
meta_dfs['emotion_vector'] = binary_vectors.apply(lambda x: x['emotion_vector'])
meta_dfs['intensity_vector'] = binary_vectors.apply(lambda x: x['intensity_vector'])


TypeError: '<' not supported between instances of 'str' and 'int'

In [45]:
# Apply the function to create a new binary vector column
meta_dfs['binary_vector'] = meta_dfs.apply(create_binary_vector, axis=1)

In [48]:
meta_dfs.head(20)

Unnamed: 0,Utterances,sentiment,emotion,intensity,emotion2,intensity2,emotion3,intensity3,combined_sentiment,combined_emotions,combined_intensities,binary_vector
0,look around you,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,say hello to your competition,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,eight of you will switch to an easier specialty,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,five of you will crack under the pressure,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,two of you will be asked to leave,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,this is your starting line,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,this is your arena,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,how well you play,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,that's up to you,2,0.0,,,,,,2,[0],[],"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,like i said,2,1.0,1.0,2.0,1.0,,,2,"[1, 2]",[1],"[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [49]:
columns = ['Utterances', 'sentiment', 'combined_emotion', 'combined_intensity', 'binary_vector']
meta_dfs = meta_df[columns]

KeyError: "['combined_emotion', 'combined_intensity', 'binary_vector'] not in index"

In [55]:
meta_dfs.shape

(20017, 12)