# DESCARGA DE DATOS

In [2]:
import os 
import shutil
import requests
from utils import load_and_preprocess_data

import os
# Set TOKENIZERS_PARALLELISM to "false" before any tokenizer is created
# (presumably to silence the tokenizers parallelism/fork warning - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The dataset file is hosted at an external URL
url = "https://raw.githubusercontent.com/PoorvaRane/Emotion-Detector/master/ISEAR.csv"
output_file = "ISEAR.csv"

destination_folder = "data"
# Create the target folder if it does not exist yet; exist_ok makes the call
# safe to repeat and removes the check-then-create race.
os.makedirs(destination_folder, exist_ok=True)
csv_path = os.path.join(destination_folder, output_file)

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of saving an error page as the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Write straight into the destination folder (no temporary copy in the
# working directory that then has to be moved).
with open(csv_path, 'wb') as f:
    f.write(response.content)

# Load and preprocess the dataset
df = load_and_preprocess_data(csv_path)
# Normalise the misspelled label 'guit' to 'guilt'
df['Emotion'] = df['Emotion'].replace('guit', 'guilt')
df


Unnamed: 0,Emotion,Text,Text_processed
0,joy,On days when I feel close to my partner and ot...,on days when i feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...,every time i imagine that someone i love or i ...
2,anger,When I had been obviously unjustly treated and...,when i had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...,when i think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...,at a gathering i found myself involuntarily si...
...,...,...,...
7511,shame,Two years back someone invited me to be the tu...,two years back someone invited me to be the tu...
7512,shame,I had taken the responsibility to do something...,i had taken the responsibility to do something...
7513,fear,I was at home and I heard a loud sound of spit...,i was at home and i heard a loud sound of spit...
7514,guilt,I did not do the homework that the teacher had...,i did not do the homework that the teacher had...


### Generación de DatasetDict y labels

In [3]:
# Build the id <-> label dictionaries from the distinct values of
# df['Emotion'], numbered in order of first appearance.
emotion_names = df['Emotion'].unique()
id2label = dict(enumerate(emotion_names))
label2id = {name: idx for idx, name in id2label.items()}
print(id2label)
print(label2id)

# Encode the labels with an explicit lookup (`map`) rather than `replace`:
# `replace` with a str -> int dict relies on implicit dtype downcasting, which
# recent pandas versions deprecate. The raw 'Text' column is dropped without
# `inplace=True`, so `df` is rebound to a new frame instead of being mutated.
df = df.assign(Emotion=df['Emotion'].map(label2id)).drop(columns=['Text'])

{0: 'joy', 1: 'fear', 2: 'anger', 3: 'sadness', 4: 'disgust', 5: 'shame', 6: 'guilt'}
{'joy': 0, 'fear': 1, 'anger': 2, 'sadness': 3, 'disgust': 4, 'shame': 5, 'guilt': 6}


### División de los datos en train, test y validación

In [4]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

# Split the dataset into train, validation and test:
# first hold out 20% as the test split, then take 20% of the remainder
# as the validation split (same fixed seed for both cuts).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Wrap each pandas split into a Dataset and collect them in a DatasetDict
splits = {'train': train_df, 'validation': val_df, 'test': test_df}
dataset_dict = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in splits.items()}
)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 4809
    })
    validation: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 1203
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed', '__index_level_0__'],
        num_rows: 1504
    })
})

### Carga del tokenizador

In [5]:
from transformers import AutoTokenizer

# Hub checkpoint name; the same identifier is reused later in the notebook
# when the classification model is loaded.
model_ckpt = "bhadresh-savani/bert-base-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [6]:
def tokenize_text(examples):
    """Tokenize the 'Text_processed' field of a batch of examples.

    Args:
        examples: Mapping with a "Text_processed" entry holding the text(s)
            to tokenize (a batch when used with `map(..., batched=True)`).

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    # padding="max_length" only pads short inputs; truncation=True is needed
    # as well so that over-long inputs are cut to the model's maximum length
    # and every encoded example ends up with the same size.
    return tokenizer(examples["Text_processed"], padding="max_length", truncation=True)

In [7]:
# Drop the '__index_level_0__' column from every split, then run
# tokenize_text over each split in batches.
dataset_dict = (
    dataset_dict
    .remove_columns('__index_level_0__')
    .map(tokenize_text, batched=True)
)
dataset_dict

Map:   0%|          | 0/4809 [00:00<?, ? examples/s]

Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 4809
    })
    validation: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 1203
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed', 'input_ids', 'attention_mask'],
        num_rows: 1504
    })
})

In [8]:
# Rename the target column from 'Emotion' to 'labels' in every split
dataset_dict = dataset_dict.rename_column(
    original_column_name='Emotion',
    new_column_name='labels',
)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with the keys 'eval_accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    # Predicted class = index of the highest score along the last axis
    y_pred = pred.predictions.argmax(-1)
    return {
        'eval_accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

def optuna_hp_space(trial):
    """Define the hyperparameter search space for one trial.

    Args:
        trial: Object exposing `suggest_float(name, low, high, log=...)`.

    Returns:
        Dict with a sampled "learning_rate" (1e-6 to 1e-4) and
        "weight_decay" (1e-6 to 1e-1), both requested with log=True.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True)
    return {"learning_rate": learning_rate, "weight_decay": weight_decay}

def model_init(trial):
    """Build a fresh classification model from `model_ckpt`.

    Args:
        trial: Accepted for the caller's benefit; not used here.

    Returns:
        The model returned by AutoModelForSequenceClassification.from_pretrained,
        requested with one output label per entry of `id2label`.
    """
    n_labels = len(id2label)
    # ignore_mismatched_sizes lets the load proceed when checkpoint weights
    # have a different shape from the requested model.
    return AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=n_labels,
        ignore_mismatched_sizes=True,
    )

def compute_objective(metrics):
    """Score one hyperparameter-search trial as accuracy + F1 (higher is better).

    Args:
        metrics: Dict of evaluation metrics. Must contain 'eval_accuracy' and
            either 'eval_f1' (the Trainer prefixes reported metric names with
            'eval_') or the unprefixed 'f1'.

    Returns:
        The sum of the accuracy and F1 values.

    Raises:
        KeyError: If a required metric is missing.
    """
    # Local imports: neither module is imported at the top of the notebook.
    import gc
    import torch

    # The earlier `del model` made `model` a local name and therefore raised
    # UnboundLocalError on every call. Free what can be freed between trials
    # instead: collect garbage, then release cached CUDA memory if present.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    f1 = metrics['eval_f1'] if 'eval_f1' in metrics else metrics['f1']
    return metrics['eval_accuracy'] + f1


# Settings for the hyperparameter-search runs
batch_size = 16
epochs = 10
output_dir = './results_freezed'
# Number of full batches in the training split
logging_steps = len(dataset_dict['train']) // batch_size

# Training configuration shared by every trial: evaluate and checkpoint once
# per epoch and reload the best checkpoint (by 'accuracy') at the end.
search_arg_values = dict(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=logging_steps,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)
args = TrainingArguments(**search_arg_values)

# No fixed model: model_init supplies a fresh one to the Trainer
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 20 Optuna trials over optuna_hp_space, maximizing the trial objective
best_trial = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    backend="optuna",
    direction="maximize",
    n_trials=20,
)

#create a compute_objective function




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/bert-base-go-emotion and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-07-11 13:52:12,706] A new study created in memory with name: no-name-f469c1b2-506f-4d90-8198-8aeda3b4aca1
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/bert-base-go-emotion and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 7

Epoch,Training Loss,Validation Loss


In [None]:
# Load the checkpoint with one output per emotion. id2label/label2id are
# passed as well, so the model config (and anything saved or pushed from it)
# carries the emotion names instead of generic LABEL_n placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
# Freeze every parameter of the base model; only the parameters outside
# `base_model` (the classification head) remain trainable.
for param in model.base_model.parameters():
    param.requires_grad = False

In [None]:
from transformers import EarlyStoppingCallback

# Settings for the training run of the frozen-base model
batch_size = 32
epochs = 30

output_dir = './results_freezed'
# Number of full batches in the training split
logging_steps = len(dataset_dict['train']) // batch_size
# Evaluate and checkpoint once per epoch; the best checkpoint according to
# 'accuracy' is reloaded when training ends.
args = TrainingArguments( output_dir=output_dir,
                        num_train_epochs=epochs,
                        learning_rate=2e-3,
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        weight_decay=0.01,
                        evaluation_strategy='epoch',
                        save_strategy='epoch',
                        logging_steps=logging_steps,
                        fp16=True,
                        # Was `false`, which is an undefined name in Python (NameError)
                        push_to_hub=False,
                        load_best_model_at_end=True,
                        metric_for_best_model='accuracy')

In [None]:
# Early-stopping patience is set to 20% of the planned number of epochs
patience = int(0.2*epochs)
early_stopping = EarlyStoppingCallback(early_stopping_patience=patience)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run training with the Trainer built in the previous cell
trainer.train()

In [None]:
# Upload the trainer's model to the Hugging Face Hub
trainer.push_to_hub()

In [None]:
# Run the model on the test split and take the highest-scoring class per example
test_output = trainer.predict(dataset_dict['test'])
preds = test_output.predictions.argmax(-1)

# Accuracy against the encoded labels of the test frame
acc = accuracy_score(test_df['Emotion'], preds)
print(acc)

In [None]:
# Unfreeze the last two encoder layers for the next training run. The whole
# base model was frozen earlier, so setting requires_grad to False here (as
# before) changed nothing; True is what actually makes these layers trainable.
# NOTE(review): the existing `trainer` may still hold the optimizer created
# during the first run - confirm the unfrozen parameters are really updated
# (building a new Trainer before training again is the safe option).
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True


In [None]:
# Second training run, after the requires_grad change made in the previous cell
trainer.train()

In [None]:
# Upload the model again after the second training run
trainer.push_to_hub()