In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from transformers import BertTokenizerFast


In [2]:
# Raw tagged emotion/cause text file that load_dataset() parses further down.
file_path = "/kaggle/input/emotion-cause-dataset/Emotion_Cause_Dataset.txt"

In [3]:
# Shared tokenizer instance for the 'bert-large-uncased' checkpoint; file_add() uses it
# to turn questions, contexts and answers into token ids.
# NOTE(review): return_token_type_ids is normally an encode-time option — confirm it has
# any effect when passed to from_pretrained.
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased', return_token_type_ids=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



In [4]:
import re

# Load the dataset
def load_dataset(file_path):
    r"""Parse a tagged emotion/cause text file into question-answering records.

    Every line is scanned for an emotion tag (for example ``<happy>``) and an
    optional cause span wrapped in ``<cause>`` ... ``<\cause>``. A conventional
    ``</cause>`` closer is accepted as well, so a typo in the closing tag does
    not silently drop the answer.

    Parameters:
    - file_path (str): Path of the UTF-8 text file to read.

    Returns:
    - list[dict]: One dict per line that carries an emotion tag, with keys
      'context' (the line with every tag removed), 'label' (the emotion tag
      name), 'question' (a fixed template mentioning the label) and 'answer'
      (the stripped cause span, or None when the line has no cause span).
      Lines without an emotion tag, including blank lines, are skipped.
    """
    tag_pattern = re.compile(r"<(.*?)>")
    # Accept both the dataset's `<\cause>` closer and a regular `</cause>`.
    cause_pattern = re.compile(r"<cause>(.*?)<[\\/]cause>", re.DOTALL)

    data = []
    # Explicit encoding so parsing does not depend on the platform default;
    # iterate the handle directly instead of materialising every line first.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Extract emotion: the first tag that is neither a closing tag nor
            # the cause marker, so a line that starts with <cause> is not
            # mislabelled as "cause".
            emotion = None
            for tag in tag_pattern.finditer(line):
                name = tag.group(1)
                probe = name.strip()
                if probe and probe[0] not in "\\/" and probe != "cause":
                    emotion = name
                    break
            if not emotion:
                continue

            question = f"Why does this sentence express {emotion}?"

            # Extract cause (None when the line has no cause span)
            cause_match = cause_pattern.search(line)
            cause = cause_match.group(1).strip() if cause_match else None

            # Remove tags for clean text
            context = re.sub(r"<.*?>", "", line).strip()

            data.append({"context": context, "label": emotion, "question": question, "answer": cause})

    return data


In [5]:
# Parse the raw file into a list of record dicts (context, label, question, answer).
data = load_dataset(file_path)

In [6]:
# Peek at the first five parsed records to sanity-check the tag extraction.
data[:5]

[{'context': "I suppose I am happy  being so ` tiny' ; it means I am able to surprise people with what is generally seen as my confident and outgoing personality",
  'label': 'happy',
  'question': 'Why does this sentence express happy?',
  'answer': "being so ` tiny'"},
 {'context': 'Lennox has always truly wanted to fight for the world title and was happy  taking the tough route',
  'label': 'happy',
  'question': 'Why does this sentence express happy?',
  'answer': 'taking the tough route'},
 {'context': 'He was a professional musician now , still sensitive and happy  doing something he loved',
  'label': 'happy',
  'question': 'Why does this sentence express happy?',
  'answer': 'doing something he loved'},
 {'context': 'Holmes is happy  having the freedom of the house when we are out',
  'label': 'happy',
  'question': 'Why does this sentence express happy?',
  'answer': 'having the freedom of the house when we are out'},
 {'context': 'I had problems with tutors trying to encourag

In [7]:
# One row per parsed record; columns follow the record keys.
df = pd.DataFrame(data)

# Display the new format. The fixed seed keeps this preview identical on every
# Restart & Run All instead of showing a different random sample each time.
print(df.sample(n=5, random_state=1))

                                                context  label  \
116   He was exhilarated  when he heard Ornette Cole...  happy   
123   On reaching the Fish I let out a cry of triump...  happy   
422       I felt happy  to go on a weekend camping trip  happy   
2374  He felt the weight of his mistake when he saw ...  shame   
26    Do you have a delight  in animals and skill in...  happy   

                                   question  \
116   Why does this sentence express happy?   
123   Why does this sentence express happy?   
422   Why does this sentence express happy?   
2374  Why does this sentence express shame?   
26    Why does this sentence express happy?   

                                                 answer  
116   when he heard Ornette Coleman improvising free...  
123   to have got there while conveniently forgettin...  
422                     to go on a weekend camping trip  
2374  when he saw how his actions had affected his r...  
26                            

In [8]:
def find_idx(big_index, small_index):
    """
    Find the first occurrence of the sequence 'small_index' inside 'big_index'.
    Parameters:
    - big_index (list): The larger sequence of indices to search in.
    - small_index (list): The smaller sequence of indices to be found within 'big_index'.
    Returns:
    - list | None: The consecutive positions in 'big_index' covered by the first
      occurrence of 'small_index' (one position per element of 'small_index'),
      or None when 'small_index' is empty or does not occur in 'big_index'.
    """
    needle = list(small_index)
    width = len(needle)

    # An empty pattern has no meaningful position; report "not found".
    if width == 0:
        return None

    # Only start positions that leave room for the whole pattern are tried, so
    # a partial match at the tail can never index past the end of 'big_index'.
    for start in range(len(big_index) - width + 1):
        if list(big_index[start:start + width]) == needle:
            return list(range(start, start + width))

    return None
                

In [9]:
def file_add(x):
    """
    Tokenize the input question and context using BERT tokenizer and find the token indices corresponding to the answer within the tokenized sequence.
    Parameters:
    - x (dict): Input dictionary containing 'question', 'context', and 'answer' keys.
    Returns:
    - tuple: A tuple containing the starting and ending token indices of the answer within the tokenized sequence.
             Positions index into the combined question + context encoding, but the
             answer is only searched for inside the context segment.
             If the answer is missing, empty or not found, it returns (-1, -1).
    """
    # Tokenize the question and context using BERT tokenizer
    qst_contxt = bert_tokenizer.encode(x['question'], x['context'])

    # Missing answers (None, or NaN after a CSV round-trip) and blank strings
    # have no span; check explicitly instead of relying on the tokenizer to
    # raise a particular exception type.
    answer = x['answer']
    if not isinstance(answer, str) or not answer.strip():
        return -1, -1

    # Tokenize the answer and drop the leading/trailing special tokens
    answr = bert_tokenizer.encode(answer)[1:-1]
    if not answr:
        return -1, -1

    # The pair encoding is laid out as [CLS] question [SEP] context [SEP].
    # Start the search after the first [SEP] so a word that also appears in the
    # question cannot be reported as the answer span.
    sep_id = bert_tokenizer.sep_token_id
    ctx_start = qst_contxt.index(sep_id) + 1 if sep_id in qst_contxt else 0

    # Find the indices of the answer within the tokenized context
    answr_idx = find_idx(qst_contxt[ctx_start:], answr)
    if not answr_idx:
        return -1, -1

    # Shift back to positions in the full sequence; first and last index of the
    # match are the span boundaries (identical for a single-token answer).
    return ctx_start + answr_idx[0], ctx_start + answr_idx[-1]

In [10]:
# Locate every answer's token span; each row yields a (start, end) pair.
tmp = df.apply(file_add, axis=1)

# Unpack the pairs into two aligned columns.
df['start_positions'] = [span[0] for span in tmp]
df['end_positions'] = [span[1] for span in tmp]

# Reorder so the span columns sit just before the answer text.
df = df[[
    'context',
    'label',
    'question',
    'start_positions',
    'end_positions',
    'answer',
]]

df[:4]

Unnamed: 0,context,label,question,start_positions,end_positions,answer
0,I suppose I am happy being so ` tiny' ; it me...,happy,Why does this sentence express happy?,14,18,being so ` tiny'
1,Lennox has always truly wanted to fight for th...,happy,Why does this sentence express happy?,23,26,taking the tough route
2,"He was a professional musician now , still sen...",happy,Why does this sentence express happy?,20,23,doing something he loved
3,Holmes is happy having the freedom of the hou...,happy,Why does this sentence express happy?,12,21,having the freedom of the house when we are out


In [11]:
# Number of rows per emotion label (value_counts sorts by frequency, highest first).
label_counts= df['label'].value_counts()

In [12]:
# Show the class distribution to check how balanced the labels are.
label_counts

label
happy       443
sad         393
shame       348
anger       339
fear        312
surprise    307
disgust     306
Name: count, dtype: int64

In [13]:
# Column dtypes and non-null counts; a lower count for 'answer' means some rows
# had no cause span extracted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2448 entries, 0 to 2447
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   context          2448 non-null   object
 1   label            2448 non-null   object
 2   question         2448 non-null   object
 3   start_positions  2448 non-null   int64 
 4   end_positions    2448 non-null   int64 
 5   answer           2446 non-null   object
dtypes: int64(2), object(4)
memory usage: 114.9+ KB


In [14]:
# Encode labels: replace the string emotion names in df['label'] with integer codes.
# NOTE: this overwrites the column in place, so re-running the cell would fit the
# encoder on the integer codes instead of the original names.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
# Lookup from class name to the code the fitted encoder assigns to it.
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [15]:
# Show the class-name -> integer-code lookup built from the fitted encoder.
label_mapping 

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'happy': 3,
 'sad': 4,
 'shame': 5,
 'surprise': 6}

In [16]:
from sklearn.model_selection import train_test_split

# Split into train and temp (validation + test).
# random_state is fixed so the partition is reproducible; no `stratify=` argument
# is passed, so label proportions are not forced to match across splits.
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=1)

# Split temp into validation and test (half of the held-out 20% each)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=1)

# Result: train_data (80%), val_data (10%), test_data (10%)


In [17]:
# (rows, columns) of the train, validation and test splits, one per line.
print(train_data.shape, val_data.shape, test_data.shape, sep="\n")

(1958, 6)
(245, 6)
(245, 6)


In [18]:
# Number of distinct label values present in the training split
# (should equal the number of emotion classes).
unique_count_train = train_data['label'].nunique()

print(f"Number of unique counts: {unique_count_train}")

Number of unique counts: 7


In [19]:
# Number of distinct label values present in the validation split
# (checks that no class is missing after the random split).
unique_count_val = val_data['label'].nunique()

print(f"Number of unique counts: {unique_count_val}")

Number of unique counts: 7


In [20]:
# Number of distinct label values present in the test split
# (checks that no class is missing after the random split).
unique_count_test = test_data['label'].nunique()

print(f"Number of unique counts: {unique_count_test}")

Number of unique counts: 7


In [21]:
# Column dtypes and non-null counts for the test split.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, 1514 to 985
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   context          245 non-null    object
 1   label            245 non-null    int64 
 2   question         245 non-null    object
 3   start_positions  245 non-null    int64 
 4   end_positions    245 non-null    int64 
 5   answer           245 non-null    object
dtypes: int64(3), object(3)
memory usage: 13.4+ KB


In [22]:
# Save each split to its own CSV file in the current working directory.
# index=False leaves the DataFrame index out of the files.
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)
print("dataset saved")

dataset saved
