In [192]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import nltk

from datasets import load_dataset
emotions = load_dataset ("go_emotions", "simplified")
depression = load_dataset("ShreyaR/DepressionDetection")
emotions.set_format(type="pandas")
depression.set_format(type="pandas")
df1 = emotions["train"][:]
df2 = depression["train"][:]
df1.drop(columns=['id'], inplace=True) #dropping irrelevant coloums, and also some emotions as this dataset has far too many
df1['labels'] = df1['labels'].apply(lambda x: x[0])
df2.rename(columns={'is_depression': 'depression', "clean_text": "text"}, inplace=True)
df2 = df2[df2['depression'] != 0] #removing all rows where depression is 0 as all other emotions data is missing
merged_df = pd.concat([df1, df2], ignore_index=True)
merged_df = merged_df.fillna(0)
merged_df.loc[merged_df['depression'] == 1, 'labels'] = 28
merged_df.drop(columns=['depression'], inplace=True) #dropping depression column as its now duplicate data
merged_df.rename(columns={'labels': 'label'}, inplace=True)
merged_df['label'] = merged_df['label'].astype('int64')
merged_df.to_csv('combined_data.csv', index=False)
merged_df.head()

In [None]:
debug=load_dataset("emotion")
debug.set_format(type="pandas")
dfd= debug["train"][:]
dfd.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [None]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [None]:
merged_df.head()

Unnamed: 0,text,label
0,My favourite food is anything I didn't have to...,27
1,"Now if he does off himself, everyone will thin...",27
2,WHY THE FUCK IS BAYLESS ISOING,2
3,To make her feel threatened,14
4,Dirty Southern Wankers,3


In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47241 entries, 0 to 47240
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    47241 non-null  object
 1   label   47241 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 738.3+ KB


In [None]:
from sklearn.model_selection import train_test_split

train_df, test_valid_df = train_test_split(merged_df, test_size=0.2, random_state=42)
test_df, valid_df = train_test_split(test_valid_df, test_size=0.5, random_state=42)

from datasets import DatasetDict
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)
ds_splits = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})
ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 37792
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4725
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4724
    })
})

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenizer_function(batch):
    return tokenizer(batch["text"], padding= True, truncation= True)

In [None]:
tokenized_data= ds_splits.map(tokenizer_function,
                             batched= True,
                             batch_size= None)


  0%|          | 0/591 [04:18<?, ?it/s]

Map: 100%|██████████| 37792/37792 [00:05<00:00, 6320.77 examples/s]
Map: 100%|██████████| 4725/4725 [00:00<00:00, 7829.42 examples/s]
Map: 100%|██████████| 4724/4724 [00:00<00:00, 7723.48 examples/s]


In [None]:
print(tokenized_data.column_names)

{'train': ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'], 'validation': ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'], 'test': ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask']}


In [None]:
from transformers import AutoModelForSequenceClassification
import torch
checkpoint = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = (AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 29).to(device))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds=pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc =accuracy_score(labels, preds)
    return {"accureacy": acc, "f1": f1}

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir= "finetuning-emotion-model",
    num_train_epochs= 1,
    learning_rate= 2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    push_to_hub=False
)

from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics= compute_metrics,
    train_dataset= tokenized_data["train"],
    eval_dataset= tokenized_data["validation"],
    tokenizer= tokenizer
)

trainer.train()

  0%|          | 0/591 [03:05<?, ?it/s]


  0%|          | 0/591 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  7%|▋         | 41/591 [13:07<2:52:13, 18.79s/it]

KeyboardInterrupt: 