# DESCARGA DE DATOS

In [1]:
# Fetch the augmented ISEAR dataset from the Hugging Face Hub and drop the
# 'Augmented' column from every split in a single chained expression.
from datasets import DatasetDict, Dataset, load_dataset

dataset_path = 'RikoteMaster/isear_augmented'
dataset_dict = load_dataset(dataset_path).remove_columns('Augmented')

# Bare last expression: display the resulting splits and their columns.
dataset_dict


Found cached dataset parquet (/root/.cache/huggingface/datasets/RikoteMaster___parquet/RikoteMaster--isear_augmented-b6d7bc560c3e10d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['Emotion', 'Text_processed'],
        num_rows: 752
    })
    test: Dataset({
        features: ['Emotion', 'Text_processed'],
        num_rows: 752
    })
    train: Dataset({
        features: ['Emotion', 'Text_processed'],
        num_rows: 10751
    })
})

In [2]:
# Materialise the training split as a pandas DataFrame so the distinct
# emotion names can be collected.
df = dataset_dict['train'].to_pandas()

# Number the emotions in the order they first appear in the training split,
# then invert that table to get the reverse lookup.
id2label = dict(enumerate(df['Emotion'].unique()))
label2id = {name: idx for idx, name in id2label.items()}

# Swap the string 'Emotion' column for an integer 'labels' column in every split.
dataset_dict = dataset_dict.map(
    lambda row: {'labels': label2id[row['Emotion']]},
    remove_columns=['Emotion'],
)



Loading cached processed dataset at /root/.cache/huggingface/datasets/RikoteMaster___parquet/RikoteMaster--isear_augmented-b6d7bc560c3e10d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-67b0a7624c7a5d2f.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/RikoteMaster___parquet/RikoteMaster--isear_augmented-b6d7bc560c3e10d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-161de27b49f6fdea.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/RikoteMaster___parquet/RikoteMaster--isear_augmented-b6d7bc560c3e10d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-03e305454b1bb339.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/RikoteMaster___parquet/RikoteMaster--isear_augmented-b6d7bc560c3e10d4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-161de27b49f6fdea.arrow
Loading cached processed dataset at /roo

In [3]:
# Display the splits to confirm 'Emotion' has been replaced by 'labels'.
dataset_dict

DatasetDict({
    validation: Dataset({
        features: ['Text_processed', 'labels'],
        num_rows: 752
    })
    test: Dataset({
        features: ['Text_processed', 'labels'],
        num_rows: 752
    })
    train: Dataset({
        features: ['Text_processed', 'labels'],
        num_rows: 10751
    })
})

### Carga del tokenizador

In [5]:
from transformers import AutoTokenizer
# Checkpoint identifier; the same name is reused further down when the
# sequence-classification model is instantiated.
model_ckpt = "bert-base-uncased"
# Load the tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [6]:
def tokenize_text(examples):
    """Tokenize the 'Text_processed' field of a batch of examples.

    Args:
        examples: A mapping with a "Text_processed" entry holding the text(s)
            to encode (this is what `Dataset.map(..., batched=True)` passes).

    Returns:
        The tokenizer output for those texts, padded to the tokenizer's
        maximum length and truncated to that same length.
    """
    # truncation=True is required alongside padding="max_length": without it
    # a text longer than the model's maximum input length is encoded at its
    # full length and overflows the model's position embeddings.
    return tokenizer(
        examples["Text_processed"],
        padding="max_length",
        truncation=True,
    )

In [7]:
# Run tokenize_text over every split in batches; the tokenizer outputs are
# added as new columns next to the existing ones.
dataset_dict = dataset_dict.map(tokenize_text, batched=True)
# Display the splits to check the new columns.
dataset_dict

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/10751 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['Text_processed', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 752
    })
    test: Dataset({
        features: ['Text_processed', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 752
    })
    train: Dataset({
        features: ['Text_processed', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10751
    })
})

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing `label_ids` (reference labels) and
            `predictions` (per-class scores; the class with the highest
            score along the last axis is taken as the prediction).

    Returns:
        dict with keys 'eval_accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    return {
        'eval_accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Object exposing `suggest_float(name, low, high, log=...)`.

    Returns:
        dict mapping "learning_rate" and "weight_decay" to the sampled values.
    """
    # (name, lower bound, upper bound); every entry is sampled on a log scale.
    search_bounds = (
        ("learning_rate", 1e-6, 1e-4),
        ("weight_decay", 1e-6, 1e-1),
    )
    return {
        name: trial.suggest_float(name, low, high, log=True)
        for name, low, high in search_bounds
    }

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def model_init(trial):
    """Build a fresh classifier with a frozen base model.

    Args:
        trial: The current hyperparameter-search trial; accepted for the
            Trainer `model_init` callback signature but not used here.

    Returns:
        A sequence-classification model loaded from `model_ckpt`, moved to
        `device`, with one output per entry in `id2label` and every
        base-model parameter frozen (only the classification head trains).
    """
    import gc  # local import keeps this cell self-contained

    # Release memory still held by models from earlier calls. A check such as
    # `'model' in locals()` can never succeed at function entry (the local is
    # not bound yet), so collect garbage and clear the CUDA cache directly.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=len(id2label),
        ignore_mismatched_sizes=True,
    ).to(device)

    # Freeze the base model so that only the classification head is updated.
    for param in model.base_model.parameters():
        param.requires_grad = False

    return model


def compute_objective(metrics):
    """Collapse evaluation metrics into one score to maximise.

    Args:
        metrics: dict of evaluation metrics. Trainer prefixes every metric
            name with 'eval_', so the F1 score normally arrives as 'eval_f1';
            a bare 'f1' key is accepted as well for dicts taken directly from
            `compute_metrics`.

    Returns:
        Accuracy plus F1.

    Raises:
        KeyError: if accuracy or F1 is missing from `metrics`.

    Note:
        To take effect this function must be passed as `compute_objective=`
        to `Trainer.hyperparameter_search`.
    """
    # Prefer the prefixed key that Trainer produces; fall back to the raw one.
    f1 = metrics['eval_f1'] if 'eval_f1' in metrics else metrics['f1']
    return metrics['eval_accuracy'] + f1


# Settings for the hyperparameter search runs.
batch_size = 16
epochs = 5

output_dir = './results_searching_hyperparameters'
# Number of whole batches in the training split, so training loss is logged
# about once per pass over the data.
logging_steps = len(dataset_dict['train']) // batch_size

args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    logging_steps=logging_steps,
    # Mixed precision needs a CUDA device; disable it on the CPU fallback
    # instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    # No checkpoints are wanted during the search.
    save_strategy="no",
)

# model=None together with model_init lets the Trainer build a fresh model
# for each trial.
trainer = Trainer(
    model=None,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model_init=model_init,
)

# No compute_objective is passed, so hyperparameter_search uses the
# library's default objective.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20,
)

# Persist the winning trial (id, objective value, hyperparameters) as
# pretty-printed JSON in a text file.
import json

best_trial_dict = dict(
    run_id=best_trial.run_id,
    objective=best_trial.objective,
    hyperparameters=best_trial.hyperparameters,
)

with open('best_trial_frozen.txt', 'w') as file:
    # Serialise straight into the file handle; indent=4 keeps it readable.
    json.dump(best_trial_dict, file, indent=4)




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-07-17 06:43:52,948] A new study created in memory with name: no-name-6ddbb2ed-b749-4d19-82ce-f10f0f1e1c41
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Shell escape: print the current GPU status (memory usage, utilisation).
!nvidia-smi

Thu Jul 13 11:29:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN V      Off  | 00000000:01:00.0 Off |                  N/A |
| 34%   45C    P8    26W / 250W |  10004MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Instantiate the classifier with one output per known label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    ignore_mismatched_sizes=True,
)
# Freeze every parameter of the base model so that only the classification
# head on top of it remains trainable.
model.base_model.requires_grad_(False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/bert-base-go-emotion and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import EarlyStoppingCallback

# Settings for the final training run of the frozen-base model.
batch_size = 16
epochs = 30

output_dir = './results_freezed'
# Number of whole batches in the training split, so training loss is logged
# about once per pass over the data.
logging_steps = len(dataset_dict['train']) // batch_size

args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    # NOTE(review): this learning rate lies above the 1e-4 upper bound that
    # optuna_hp_space samples from; confirm which search produced it.
    learning_rate=3.562121201880562e-04,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.0007924379520012866,
    # Evaluate and checkpoint once per epoch; both must use the same strategy
    # for load_best_model_at_end to work.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=logging_steps,
    # Mixed precision needs a CUDA device; disable it when none is available
    # instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    # Restore the checkpoint with the best validation accuracy after training.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [21]:
# Trainer for the final run: trains on the train split, evaluates on the
# validation split, and stops early once the tracked metric has not improved
# for 20% of the epoch budget.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=int(0.2 * epochs)),
    ],
)

In [22]:
# Run training with the trainer configured above.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5951,1.531633,0.442819,0.430315
2,1.5875,1.514691,0.432181,0.420767
3,1.5811,1.524175,0.445479,0.441161
4,1.5743,1.530792,0.416223,0.416816
5,1.5738,1.51581,0.445479,0.433519
6,1.5655,1.509822,0.448138,0.438635
7,1.5631,1.502079,0.465426,0.452725
8,1.558,1.511683,0.464096,0.454242
9,1.5462,1.491038,0.462766,0.455032
10,1.5519,1.495883,0.458777,0.445807


TrainOutput(global_step=12096, training_loss=1.5571194822510714, metrics={'train_runtime': 1307.1586, 'train_samples_per_second': 246.741, 'train_steps_per_second': 15.423, 'total_flos': 5.09190110148096e+16, 'train_loss': 1.5571194822510714, 'epoch': 18.0})

In [23]:
# Upload the trained model and its training artefacts to the Hugging Face Hub.
trainer.push_to_hub()

/home/mriciba/Projects/dipsy/BERTS/code/BERT/./results_freezed is already a clone of https://huggingface.co/RikoteMaster/results_freezed. Make sure you pull the latest changes with `repo.git_pull()`.


Upload file pytorch_model.bin:   0%|          | 1.00/418M [00:00<?, ?B/s]

Upload file runs/Jul13_12-41-57_2bfb4451df90/events.out.tfevents.1689252118.2bfb4451df90.283443.21:   0%|     …

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

To https://huggingface.co/RikoteMaster/results_freezed
   f0a7fe0..3799876  main -> main

To https://huggingface.co/RikoteMaster/results_freezed
   3799876..2563c18  main -> main



'https://huggingface.co/RikoteMaster/results_freezed/commit/37998761a70e46bb59eb1487e7155648e5274438'

In [None]:
# Predict on the held-out test split.
test_output = trainer.predict(dataset_dict['test'])
preds = test_output.predictions.argmax(-1)

# Score against the integer label ids returned with the predictions. The
# string 'Emotion' column was replaced by 'labels' earlier, and no `test_df`
# is ever defined in this notebook.
acc = accuracy_score(test_output.label_ids, preds)

print(acc)

In [None]:
# Unfreeze the last two encoder layers for a second fine-tuning stage. The
# whole base model was frozen when the model was created, so setting
# requires_grad to False here would change nothing.
# NOTE(review): a Trainer that has already trained may reuse its existing
# optimizer; confirm the newly unfrozen parameters are picked up, or build a
# fresh Trainer before training again.
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True


In [None]:
# Train again after the requires_grad change made in the previous cell.
trainer.train()

In [None]:
# Upload the result of the second training run to the Hugging Face Hub.
trainer.push_to_hub()