In [1]:
import pandas as pd
import numpy as np

files = ['goemotions_1.csv', 'goemotions_2.csv', 'goemotions_3.csv']

dfs = [pd.read_csv(file) for file in files]

df_text = pd.concat(dfs, ignore_index=True)
df_words = pd.read_csv('emotion_words.csv')

emotion_columns = df_text.columns[df_text.columns.get_loc("admiration"):].tolist()

df_text["dominant_emotion"] = df_text[emotion_columns].idxmax(axis=1)

df_oryginal = df_text[['text', 'dominant_emotion']]

In [2]:
df_oryginal

Unnamed: 0,text,dominant_emotion
0,That game hurt.,sadness
1,>sexuality shouldn’t be a grouping category I...,admiration
2,"You do right, if you don't care then fuck 'em!",neutral
3,Man I love reddit.,love
4,"[NAME] was nowhere near them, he was by the Fa...",neutral
...,...,...
211220,Everyone likes [NAME].,love
211221,Well when you’ve imported about a gazillion of...,caring
211222,That looks amazing,admiration
211223,The FDA has plenty to criticize. But like here...,anger


In [3]:
import re
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    return " ".join(words)

df_oryginal['text'] = df_oryginal['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ziolek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziolek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oryginal['text'] = df_oryginal['text'].apply(preprocess_text)


In [4]:
df_oryginal

Unnamed: 0,text,dominant_emotion
0,game hurt,sadness
1,sexuality grouping category make different oth...,admiration
2,right care fuck,neutral
3,man love reddit,love
4,name nowhere near falcon,neutral
...,...,...
211220,everyone like name,love
211221,well imported gazillion country get serious,caring
211222,look amazing,admiration
211223,fda plenty criticize like usually criticized h...,anger


In [5]:
df = df_oryginal.copy()
df = df[:100] # tylko 100 rekordów dla szybkości trenowania, oversampling i tak sztucznie zwiększy zbiór

In [6]:
df['dominant_emotion'].value_counts()

dominant_emotion
neutral           21
admiration        11
disapproval        7
annoyance          6
confusion          6
curiosity          6
gratitude          5
caring             4
joy                4
disappointment     4
amusement          4
love               4
anger              3
remorse            3
approval           2
embarrassment      2
optimism           2
surprise           1
sadness            1
disgust            1
excitement         1
realization        1
grief              1
Name: count, dtype: int64

In [7]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(df[['text']], df['dominant_emotion'])
df_balanced = pd.DataFrame({'text': X_resampled.squeeze(), 'dominant_emotion': y_resampled})

In [8]:
df_balanced['dominant_emotion'].value_counts()

dominant_emotion
sadness           21
curiosity         21
anger             21
surprise          21
embarrassment     21
approval          21
joy               21
remorse           21
disgust           21
caring            21
excitement        21
optimism          21
admiration        21
confusion         21
annoyance         21
realization       21
disappointment    21
amusement         21
disapproval       21
gratitude         21
love              21
neutral           21
grief             21
Name: count, dtype: int64

In [9]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df_train = df_balanced.copy()
le = LabelEncoder()
df_train['label'] = le.fit_transform(df_balanced['dominant_emotion'])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_texts, test_texts, train_labels, test_labels, indices_train, indices_test = train_test_split(
    df_train['text'], df_train['label'], df_train['text'].index, test_size=0.2, random_state=42
)
train_dataset = EmotionDataset(train_texts.tolist(), train_labels.tolist())
test_dataset = EmotionDataset(test_texts.tolist(), test_labels.tolist())

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

results = trainer.evaluate()
print(results)

  from .autonotebook import tqdm as notebook_tqdm





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.601949
2,No log,2.112718
3,No log,1.938859


{'eval_loss': 1.938858985900879, 'eval_runtime': 2.9562, 'eval_samples_per_second': 32.813, 'eval_steps_per_second': 2.368, 'epoch': 3.0}


In [10]:
from sklearn.metrics import accuracy_score

predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)
y_true = test_labels.tolist()

accuracy = accuracy_score(y_true, y_pred)
print(f"Dokładność modelu: {accuracy:.4f}")

Dokładność modelu: 0.8041


In [24]:
labels_dict = labels.set_index('label')['dominant_emotion'].to_dict()

results_df = pd.DataFrame({
    'actual' : y_true,
    'predict': y_pred
})

results_df['actual_emotion'] = results_df['actual'].map(labels_dict)
results_df['predict_emotion'] = results_df['predict'].map(labels_dict)

In [26]:
results_df

Unnamed: 0,actual,predict,actual_emotion,predict_emotion
0,19,19,realization,realization
1,22,22,surprise,surprise
2,5,5,caring,caring
3,21,21,sadness,sadness
4,12,12,excitement,excitement
...,...,...,...,...
92,7,7,curiosity,curiosity
93,18,18,optimism,optimism
94,13,13,gratitude,gratitude
95,14,14,grief,grief
