In [1]:
!pip install transformers[torch]
!pip install evaluate
!pip install optuna
!pip install accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.5 MB/s

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import pandas as pd
import dask.dataframe as dd
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import pyarrow as pa
import numpy as np
import evaluate
import optuna

In [37]:
# Use 1/15 of training data for hyperparameter tuning
dev_set = pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_0.parquet')


In [38]:
y_train = pd.read_csv('/content/drive/MyDrive/BERT Sentiment/CSVs/y_train_full.csv')

In [39]:
# Double check that the two datasets match before combining them
dev_set.head()

Unnamed: 0,index,input_ids,attention_mask
0,2522958,"[101, 28844, 2100, 7570, 12868, 8579, 12910, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1160125,"[101, 2307, 4031, 1024, 3819, 4031, 2005, 1280...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,861121,"[101, 12476, 12241, 5017, 1010, 10223, 6508, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,300957,"[101, 2821, 1010, 1056, 1012, 1045, 1012, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1610389,"[101, 2023, 2003, 2028, 1997, 2026, 5440, 5691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [40]:
y_train.head()

Unnamed: 0.1,Unnamed: 0,sentiment
0,2522958,1
1,1160125,2
2,861121,1
3,300957,1
4,1610389,2


In [41]:
len(dev_set)

278857

In [42]:
len(y_train)

4182850

In [43]:
dev_set.tail()

Unnamed: 0,index,input_ids,attention_mask
278852,2164167,"[101, 2088, 1005, 1055, 5409, 10430, 24795, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278853,2624737,"[101, 17634, 2022, 8059, 1024, 2023, 2793, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278854,1471486,"[101, 2502, 10520, 1024, 1045, 2031, 2035, 942...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278855,923048,"[101, 16334, 4301, 1024, 3819, 2338, 2005, 221...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278856,2732294,"[101, 4074, 2012, 2014, 2190, 1024, 3752, 2014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [44]:
y_train.iloc[len(dev_set)-5:len(dev_set)]

Unnamed: 0.1,Unnamed: 0,sentiment
278852,2164167,1
278853,2624737,1
278854,1471486,1
278855,923048,2
278856,2732294,2


In [45]:
dev_set = dev_set.join(y_train, how='inner')
len(dev_set)

278857

In [46]:
# Inspect combined dataset
dev_set.tail()

Unnamed: 0.1,index,input_ids,attention_mask,Unnamed: 0,sentiment
278852,2164167,"[101, 2088, 1005, 1055, 5409, 10430, 24795, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2164167,1
278853,2624737,"[101, 17634, 2022, 8059, 1024, 2023, 2793, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2624737,1
278854,1471486,"[101, 2502, 10520, 1024, 1045, 2031, 2035, 942...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1471486,1
278855,923048,"[101, 16334, 4301, 1024, 3819, 2338, 2005, 221...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",923048,2
278856,2732294,"[101, 4074, 2012, 2014, 2190, 1024, 3752, 2014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2732294,2


In [47]:
# Drop duplicate column and set index
dev_set = dev_set.drop(columns=['Unnamed: 0']).set_index('index')


In [48]:
dev_set.columns = ['input_ids','attention_mask','labels']
dev_set.head()

Unnamed: 0_level_0,input_ids,attention_mask,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2522958,"[101, 28844, 2100, 7570, 12868, 8579, 12910, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1160125,"[101, 2307, 4031, 1024, 3819, 4031, 2005, 1280...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
861121,"[101, 12476, 12241, 5017, 1010, 10223, 6508, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
300957,"[101, 2821, 1010, 1056, 1012, 1045, 1012, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1610389,"[101, 2023, 2003, 2028, 1997, 2026, 5440, 5691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [49]:
eval_set = dev_set.iloc[:len(dev_set)//10].reset_index(drop=True)
train_set = dev_set.iloc[len(dev_set)//10:].reset_index(drop=True)
train_set.head()



Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 1037, 3803, 17070, 1012, 1024, 2296, 231...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1,"[101, 2307, 2326, 1998, 3835, 2111, 1012, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,"[101, 1996, 2326, 2001, 2307, 1012, 1996, 2833...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 2077, 1045, 2288, 2026, 2047, 25983, 146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[101, 3083, 3319, 2025, 2013, 1037, 2155, 2266...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [17]:
def model_init(trial):
      # Define hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 1, 8)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 4, 16)
    evaluation_strategy = trial.suggest_categorical("evaluation_strategy", ['steps', 'epoch'])
    per_device_eval_batch_size = trial.suggest_int("per_device_eval_batch_size", 4, 16)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)

    return model




In [18]:
def objective(trial):


    # Define training arguments
    training_args = TrainingArguments(
        output_dir='drive/MyDrive/BERT Sentiment/output',
        seed=42,
        logging_dir='drive/MyDrive/BERT Sentiment/output/logs',
        logging_steps=1000
    )
    print("Defined the training arguments")


    model = model_init(trial)
    print("Initialized the model")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=eval_set)

    print("Created the trainer")

    trainer.train()
    print("Trained the model")

    results = trainer.hyperparameter_search(model=None, direction='maximize',args=training_args,model_init=model_init)
    print(results.metrics['f1'])









In [32]:
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=1)
best_hyperparameters = study.best_params

print("Best hyperparameters" + str(best_hyperparameters))








[I 2023-10-28 05:20:15,502] A new study created in memory with name: no-name-681288ba-c1a7-4d00-bd14-ccb4fad8cdac


Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


[W 2023-10-28 05:20:16,477] Trial 0 failed with parameters: {'learning_rate': 3.948249070738038e-05, 'num_train_epochs': 3, 'gradient_accumulation_steps': 2, 'per_device_train_batch_size': 7, 'evaluation_strategy': 'steps', 'per_device_eval_batch_size': 6, 'warmup_steps': 333, 'weight_decay': 0.048816647569152063} because of the following error: KeyError(142224).
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 142224

The above exception was the direct caus

KeyError: ignored