<a href="https://colab.research.google.com/github/MortezaMahdaviMortazavi/EmotionRecognition/blob/master/EmoPars_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
import torch.nn as nn

In [None]:
batch_size = 64
learning_rate = 1e-5
num_epochs = 3

In [None]:
# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
test = pd.read_csv('test_emoPars.csv', index_col=0)

In [None]:
train = pd.read_csv('train_emoPars.csv', index_col=0)

In [None]:
train.head()

Unnamed: 0,text,Anger,Fear,Happiness,Hatred,Sadness,Wonder,primary_emotion
0,کرونا رو شکست میدهیم؟\nمرحله بعد چه گوهی میخوا...,4,3,1,3,3,4,0
2,یکی از پدرسوختگی های #برانداز اینه که ظاهرا ژس...,5,3,2,4,5,2,0
3,یکی از دوستای دبستانم,1,0,0,1,0,0,6
4,@username اینقدر گرفتار مسایل میشی که تخصص از ...,2,1,0,1,0,4,5
5,شماهایی که توییتها رو میبینید و سکوت میکنید ش...,2,0,0,3,3,3,3


'Anger':0, 'Fear':1, 'Happiness':2, 'Hatred':3, 'Sadness':4, 'Wonder':5, 'Other':6

In [None]:
# test = test.drop(['text', 'Unnamed: 0'], axis=1)
test.head()

Unnamed: 0,text,Anger,Fear,Happiness,Hatred,Sadness,Wonder,primary_emotion
17173,من خیلی خودسانسوری می‌کنم تو اینستا. هر چی فال...,1,1,2,1,0,2,2
28360,بعد اتمام جلسه مجلس روند بازار برگشت #بورس,3,0,1,2,1,1,0
18990,کاربران توییتر در جریان طوفان توییتری اعتراض ...,4,0,0,2,2,0,0
1948,وحشی شدن معده بعد از رسیدن به ایران اجتناب ناپ...,1,0,1,0,0,1,6
10283,( سحام نیوز ): بیانیه مشترک عربستان و امارات: ...,0,0,0,1,1,0,6


In [None]:
targets = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

Preprocess

In [None]:
def preprocess(df):
    # Normalize emotion labels
    emotion_columns = ['Anger', 'Fear', 'Happiness', 'Hatred', 'Sadness', 'Wonder']

    for col in emotion_columns:
        df[col] = df[col] / df[col].max()  # Normalize to the range [0, 1]

    # Apply threshold for binary labels
    threshold = 0.35
    for col in emotion_columns:
        df[col] = df[col].apply(lambda x: 1 if x >= threshold else 0)

    return df

In [None]:
train = preprocess(train)
test = preprocess(test)

In [None]:
X_train, y_train = train['text'].values.tolist(), train[targets].values.tolist()
X_test, y_test = test['text'].values.tolist(), test[targets].values.tolist()

In [None]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, texts, labels, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        inputs = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.float32)  # Use the provided numeric label directly
        }

        return inputs


In [None]:
train_dataset = TextDataset(tokenizer,X_train,y_train,max_length=128)
test_dataset = TextDataset(tokenizer,X_test,y_test,max_length=128)

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# BertForSequenceClassification
from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    # Specify number of classes
    num_labels = 6
)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch.nn as nn

# Define the loss function
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = model.to(device)

In [None]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

    print("Epoch:", epoch, "loss:", loss.item())




Epoch: 0 loss: 0.45309561491012573
Epoch: 1 loss: 0.5078866481781006
Epoch: 2 loss: 0.4640763998031616


In [None]:
model.save_pretrained("emotion_detection_model")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

model.eval()
with torch.no_grad():
    all_true_labels = []
    all_predicted_labels = []

    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask).logits

        # Apply sigmoid activation to logits to get predicted probabilities
        predicted_probs = torch.sigmoid(logits)

        # Apply a threshold (e.g., 0.5) to convert probabilities to binary predictions
        threshold = 0.5
        predicted_labels = (predicted_probs > threshold).to(torch.float32)

        # Convert labels and predictions to numpy arrays
        true_labels = labels.cpu().numpy()
        predicted_labels = predicted_labels.cpu().numpy()

        # Append the true and predicted labels to the lists
        all_true_labels.extend(true_labels)
        all_predicted_labels.extend(predicted_labels)


In [None]:
true_labels = all_true_labels
predicted_labels = all_predicted_labels

In [None]:
import numpy as np

In [None]:
# Calculate accuracy for each label
label_accuracies = []
for i in range(len(true_labels)):
    label_accuracy = accuracy_score(true_labels[i], predicted_labels[i])
    label_accuracies.append(label_accuracy)

# Calculate macro F1 score for each label and take the average
label_macro_f1_scores = []
for i in range(len(true_labels)):
    label_macro_f1 = f1_score(true_labels[i], predicted_labels[i], average='macro')
    label_macro_f1_scores.append(label_macro_f1)

# Calculate the average accuracy and macro F1 score
average_accuracy = np.mean(label_accuracies)
average_macro_f1 = np.mean(label_macro_f1_scores)

print("Average Accuracy:", average_accuracy)
print("Average Macro F1 Score:", average_macro_f1)

Average Accuracy: 0.8223333333333334
Average Macro F1 Score: 0.6199137085137085


In [None]:
test_data = pd.read_csv('3. data_emotion_without_label.csv')

In [None]:
test_data.head()

Unnamed: 0,local_id,tweet
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...


In [None]:
model = model.to('cpu')

In [None]:
# Tokenize and predict emotions for each tweet
results = []

for j in range(len(test_data)):
    data_point = test_data.iloc[j]
    text = data_point["tweet"]
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.sigmoid(outputs.logits)

    # Find the emotion with the highest predicted value
    primary_emotion_index = torch.argmax(probabilities)
    primary_emotion = targets[primary_emotion_index]

    # Normalize and apply threshold
    threshold = 0.15
    predictions = (probabilities > threshold).cpu().numpy().tolist()[0]

    # Create a dictionary with the required information
    result = {
        "local_id": data_point["local_id"],
        "tweet": text,
        "primary_emotion": primary_emotion,
    }

    # Add emotion predictions to the dictionary
    for i, emotion in enumerate(targets):
        result[emotion] = predictions[i]

    results.append(result)

# Create a DataFrame
result_df = pd.DataFrame(results)


In [None]:
targets = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

In [None]:
new_targets = ['anger'="Anger", 'sadness'="Sadness", 'surprise'="Wonder", 'happiness'="Happiness", 'fear'="Fear", 'disgust'="Hatred", 'other']

In [None]:
result_df.head()

Unnamed: 0,local_id,tweet,primary_emotion,Anger,Fear,Happiness,Hatred,Sadness,Wonder
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...,Happiness,False,False,False,False,False,False
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...,Happiness,False,False,False,False,False,False
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...,Happiness,False,False,False,False,False,False
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...,Sadness,True,False,False,False,True,False
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...,Happiness,False,False,False,False,False,False


In [None]:
result_df.to_csv('resulr.csv')

In [None]:
# Mapping from primary emotions
primary_emotion_mapping = {"Anger":'anger', "Sadness":'sadness', "Wonder":'surprise', "Happiness":'happiness', "Fear":'fear', "Hatred":'disgust', "Other":'other'}

# Tokenize and predict emotions for each tweet
results = []

for j in range(len(test_data)):
    data_point = test_data.iloc[j]
    text = data_point["tweet"]
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.sigmoid(outputs.logits)

    # Find the emotion with the highest predicted value
    primary_emotion_index = torch.argmax(probabilities)
    primary_emotion = primary_emotion_mapping[targets[primary_emotion_index]]

    # Normalize and apply threshold
    threshold = 0.15
    predictions = (probabilities > threshold).cpu().numpy().tolist()[0]

    # Create a dictionary with the required information
    result = {
        "local_id": data_point["local_id"],
        "tweet": text,
        "primary_emotion": primary_emotion,
    }

    # Add emotion predictions to the dictionary
    for i, emotion in enumerate(targets):
        # Convert True/False to 1/0
        result[emotion] = int(predictions[i])

    # If all predicted scores are 0, set 'other' as primary emotion
    if all(score == 0 for score in predictions):
        result["primary_emotion"] = "Other"

    results.append(result)

# Create a DataFrame
result_df = pd.DataFrame(results)

# Print the DataFrame
print(result_df)


                local_id                                              tweet  \
0     973588328225411072  لنگ پولیس  ۱      برق شیراز دوبی ۰\nخداییش تیم...   
1     983807604982996995  دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...   
2    1378698455305060353  والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...   
3    1380360990793953282  بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...   
4    1382562339258441728  تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...   
..                   ...                                                ...   
495  1644747933890539521  <@USERNAME> با توجه به علاقه شدید جامعه سلبرید...   
496  1644749245377904640                    یک همدم باوفا ندیدم جز درد ...    
497  1644758722047266817  جناب،#رضاکیانیان اگربدنبال قاتل احتمالی مرحوم،...   
498  1644768661633396736         مرا به میکده بَر،\n در خُمِ شراب انداز...    
499  1644770853035339780  خواجه چقدر غم‌انگیز فرمود:\n\nنسیمِ زلفِ تو گر...   

    primary_emotion  Anger  Fear  Happiness  Hatred

In [None]:
result_df.head()

Unnamed: 0,local_id,tweet,primary_emotion,Anger,Fear,Happiness,Hatred,Sadness,Wonder
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...,Other,0,0,0,0,0,0
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...,Other,0,0,0,0,0,0
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...,Other,0,0,0,0,0,0
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...,sadness,1,0,0,0,1,0
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...,Other,0,0,0,0,0,0


In [None]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   local_id         500 non-null    int64 
 1   tweet            500 non-null    object
 2   primary_emotion  500 non-null    object
 3   Anger            500 non-null    int64 
 4   Fear             500 non-null    int64 
 5   Happiness        500 non-null    int64 
 6   Hatred           500 non-null    int64 
 7   Sadness          500 non-null    int64 
 8   Wonder           500 non-null    int64 
dtypes: int64(7), object(2)
memory usage: 35.3+ KB


In [None]:
# Replace 'Other' with 'other' in the 'primary_emotion' column
result_df['primary_emotion'] = result_df['primary_emotion'].replace({'Other': 'other'})

result_df.head()

Unnamed: 0,local_id,tweet,primary_emotion,Anger,Fear,Happiness,Hatred,Sadness,Wonder
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...,happiness,0,0,1,0,0,0
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...,happiness,0,0,1,0,1,0
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...,happiness,0,0,1,0,1,0
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...,sadness,1,0,0,1,1,0
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...,happiness,0,0,1,0,1,0


In [None]:
# Rename columns
result_df = result_df.rename(columns={"Anger":'anger', "Sadness":'sadness', "Wonder":'surprise', "Happiness":'happiness', "Fear":'fear', "Hatred":'disgust'})

# Print the reordered DataFrame
result_df.head()


Unnamed: 0,local_id,tweet,primary_emotion,anger,fear,happiness,disgust,sadness,surprise
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...,happiness,0,0,1,0,0,0
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...,happiness,0,0,1,0,1,0
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...,happiness,0,0,1,0,1,0
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...,sadness,1,0,0,1,1,0
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...,happiness,0,0,1,0,1,0


In [None]:
cols = ['local_id', 'tweet', 'primary_emotion', 'anger', 'disgust', 'fear', 'sadness', 'happiness', 'surprise']

In [None]:
# Reorder the columns
result_df1 = result_df[cols]

result_df1.head()

Unnamed: 0,local_id,tweet,primary_emotion,anger,disgust,fear,sadness,happiness,surprise
0,973588328225411072,لنگ پولیس ۱ برق شیراز دوبی ۰\nخداییش تیم...,happiness,0,0,0,0,1,0
1,983807604982996995,دوستان بارسایی. \nحالا بشینید و فوتبال خوب تما...,happiness,0,0,0,1,1,0
2,1378698455305060353,والا ۱۳ روز #عید که هیچی با این شرایط یک سال ا...,happiness,0,0,0,1,1,0
3,1380360990793953282,بدبختی ماگناه #بیگانه نبود\nپیوندمن وشما #صمیم...,sadness,1,1,0,1,0,0
4,1382562339258441728,تصور کن انقدر #عشق بدی تاهمه چیز اطرافت شاخ و ...,happiness,0,0,0,1,1,0


In [None]:
result_df1.to_csv('result_final1.csv')