In [1]:
!pip install transformers[torch]
!pip install optuna



In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import optuna

In [4]:
test_set = pd.read_csv('drive/MyDrive/BERT Sentiment/CSVs/yelp_test.csv',header=None,names=['sentiment','review'])


In [5]:
test_set.head()

Unnamed: 0,sentiment,review
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [6]:
len(test_set)

38000

In [7]:
test_set = test_set.iloc[:200]

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
test_inputs = tokenizer(list(test_set['review']),truncation=True,padding=True)

In [11]:
test_inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
df = pd.DataFrame({'input_ids':test_inputs['input_ids'],'token_type_ids':test_inputs['token_type_ids'],'attention_mask':test_inputs['attention_mask'],'labels':test_set['sentiment']})


In [13]:
df.head()

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels
0,"[101, 10043, 2000, 2060, 4391, 1010, 1045, 203...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1,"[101, 2197, 2621, 1045, 2018, 2019, 6098, 2000...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 5379, 3095, 1010, 2168, 29500, 4189, 201...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 1996, 2833, 2003, 2204, 1012, 6854, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[101, 2130, 2043, 2057, 2134, 1005, 1056, 2031...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [15]:
X_train, X_test, y_train, y_test = train_test_split(df[['input_ids','token_type_ids','attention_mask']], df['labels'], test_size=0.2, random_state=42)


In [16]:
X_train.head()

Unnamed: 0,input_ids,token_type_ids,attention_mask
79,"[101, 2023, 2173, 2003, 3100, 1012, 1996, 1561...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
197,"[101, 1045, 2428, 2123, 1005, 1056, 3305, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
38,"[101, 2058, 18098, 6610, 2094, 1010, 23592, 19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
24,"[101, 1045, 2253, 2045, 2651, 999, 1996, 3013,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
122,"[101, 1045, 2109, 2000, 4965, 2474, 21111, 968...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [20]:
train_set = X_train.join(y_train)

In [23]:
eval_set = X_test.join(y_test)


In [24]:
def model_init(trial):
      # Define hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 1, 8)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 4, 16)
    evaluation_strategy = trial.suggest_categorical("evaluation_strategy", ['steps', 'epoch'])
    per_device_eval_batch_size = trial.suggest_int("per_device_eval_batch_size", 4, 16)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)

    return model

In [31]:
def objective(trial):


    # Define training arguments
    training_args = TrainingArguments(
        output_dir='drive/MyDrive/BERT Sentiment/output',
        seed=42,
        logging_dir='drive/MyDrive/BERT Sentiment/output/logs',
        logging_steps=1000
    )
    print("Defined the training arguments")


    model = model_init(trial)
    print("Initialized the model")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=eval_set)

    print("Created the trainer")

    trainer.train()
    print("Trained the model")

    results = trainer.hyperparameter_search(model=None, direction='maximize',args=training_args,model_init=model_init)
    print(results.metrics['f1'])

In [32]:
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=1)


[I 2023-10-30 18:29:25,473] A new study created in memory with name: no-name-7533df30-1dbb-4a66-bfcf-253e4ceb9b82


Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


[W 2023-10-30 18:29:29,569] Trial 0 failed with parameters: {'learning_rate': 2.751838820336916e-05, 'num_train_epochs': 1, 'gradient_accumulation_steps': 1, 'per_device_train_batch_size': 10, 'evaluation_strategy': 'steps', 'per_device_eval_batch_size': 9, 'warmup_steps': 165, 'weight_decay': 0.01365394815718346} because of the following error: KeyError(128).
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 128

The above exception was the direct cause of t

KeyError: ignored