In [1]:
import os 
import shutil
import requests
from utils import load_and_preprocess_data

# The dataset lives at an external link (raw CSV in a public GitHub repository).
url = "https://raw.githubusercontent.com/PoorvaRane/Emotion-Detector/master/ISEAR.csv"
output_file = "ISEAR.csv"

destination_folder = "data"
# Create the destination folder when it is missing; exist_ok makes re-running the cell safe.
os.makedirs(destination_folder, exist_ok=True)
destination_path = os.path.join(destination_folder, output_file)

# Fail fast on network problems: a timeout prevents the cell from hanging forever, and
# raise_for_status() stops an HTTP error page from being saved as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Write straight into the destination folder (no temporary copy in the working directory).
with open(destination_path, 'wb') as f:
    f.write(response.content)

# Load and preprocess the dataset
df = load_and_preprocess_data(destination_path)
# The raw file spells one of the labels 'guit'; normalise it to 'guilt'.
df['Emotion'] = df['Emotion'].replace('guit', 'guilt')
df




Unnamed: 0,Emotion,Text,Text_processed
0,joy,On days when I feel close to my partner and ot...,on days when i feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...,every time i imagine that someone i love or i ...
2,anger,When I had been obviously unjustly treated and...,when i had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...,when i think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...,at a gathering i found myself involuntarily si...
...,...,...,...
7511,shame,Two years back someone invited me to be the tu...,two years back someone invited me to be the tu...
7512,shame,I had taken the responsibility to do something...,i had taken the responsibility to do something...
7513,fear,I was at home and I heard a loud sound of spit...,i was at home and i heard a loud sound of spit...
7514,guilt,I did not do the homework that the teacher had...,i did not do the homework that the teacher had...


In [2]:
# Build the id <-> label lookup tables from df['Emotion'], numbering the emotions
# in order of first appearance in the frame.
id2label = dict(enumerate(df['Emotion'].unique()))
label2id = {label: idx for idx, label in id2label.items()}
print(id2label)
print(label2id)
# Encode the emotion names as integer ids. Series.map is the direct dictionary lookup
# (Series.replace with a dict is slower and relies on implicit dtype downcasting).
df['Emotion'] = df['Emotion'].map(label2id)
# Only the preprocessed text is used from here on; rebind instead of mutating in place.
df = df.drop(columns=['Text'])

{0: 'joy', 1: 'fear', 2: 'anger', 3: 'sadness', 4: 'disgust', 5: 'shame', 6: 'guilt'}
{'joy': 0, 'fear': 1, 'anger': 2, 'sadness': 3, 'disgust': 4, 'shame': 5, 'guilt': 6}


In [3]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

# Hold out 20% of the rows as the test set, then take 20% of the remaining rows
# as the validation set. Both splits use a fixed random_state for repeatability.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Wrap each pandas frame in a Dataset and collect the three splits in a DatasetDict.
split_frames = {'train': train_df, 'validation': val_df, 'test': test_df}
dataset_dict = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 4809
    })
    validation: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 1203
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 1504
    })
})

In [4]:
from transformers import AutoTokenizer

# Hub checkpoint that provides both the tokenizer and the model weights used below.
model_ckpt = "bhadresh-savani/bert-base-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [5]:
def tokenize_text(examples):
    """Tokenize the ``Text_processed`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"Text_processed"`` entry holding the text(s)
            to tokenize (a batch when used with ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, padded to
        the tokenizer's maximum length and truncated to it as well.
    """
    # truncation=True keeps over-long texts within the model's maximum sequence length;
    # padding alone would leave them longer than the model can accept.
    return tokenizer(examples["Text_processed"], padding="max_length", truncation=True)

In [6]:
# Discard the '__index_level_0__' column, then run tokenize_text over every split
# in batches so the tokenizer outputs are added as new columns.
dataset_dict = (
    dataset_dict
    .remove_columns('__index_level_0__')
    .map(tokenize_text, batched=True)
)
dataset_dict

Map:   0%|          | 0/4809 [00:00<?, ? examples/s]

Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 4809
    })
    validation: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 1203
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 1504
    })
})

In [7]:
# Rename the target column from 'Emotion' to 'labels' in every split.
dataset_dict = dataset_dict.rename_column(
    original_column_name='Emotion',
    new_column_name='labels',
)

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized for our emotion classes.
# ignore_mismatched_sizes=True lets the head be re-initialised when its shape differs
# from the one stored in the checkpoint. Passing id2label/label2id stores the emotion
# names in the model config, so saved/pushed models report real label names instead of
# generic LABEL_<i> placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/bert-base-go-emotion and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores whose argmax over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [10]:
batch_size = 16
output_dir = './results'
# Number of full batches in the training split, used as the logging interval in steps.
logging_steps = len(dataset_dict['train']) // batch_size

# Collect the training configuration in one mapping, then hand it to TrainingArguments.
training_kwargs = dict(
    output_dir=output_dir,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=logging_steps,
    fp16=True,
    push_to_hub=True,
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_kwargs)

In [11]:
# Assemble the Trainer: the model and its arguments, the train/validation splits,
# the metric callback, and the tokenizer.
trainer_kwargs = {
    'model': model,
    'args': args,
    'train_dataset': dataset_dict['train'],
    'eval_dataset': dataset_dict['validation'],
    'compute_metrics': compute_metrics,
    'tokenizer': tokenizer,
}
trainer = Trainer(**trainer_kwargs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/home/mriciba/Projects/dipsy/BERTS/code/BERT/./results is already a clone of https://huggingface.co/RikoteMaster/results. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [12]:
# Start fine-tuning with the Trainer configured above; the cell displays the value
# returned by train() once it finishes.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5391,1.343986,0.497922,0.504783
2,1.1815,1.272124,0.543641,0.54315


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Shell escape: print the current GPU status as reported by nvidia-smi.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Jul 11 08:40:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN V      Off  | 00000000:01:00.0 Off |                  N/A |
| 32%   48C    P2    42W / 250W |  11966MiB / 12288MiB |      0%      Default |
|                               |            