In [1]:
!pip install transformers[torch]
!pip install optuna

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [24]:
import pandas as pd
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import optuna
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


In [4]:
test_set = pd.read_csv('drive/MyDrive/BERT Sentiment/CSVs/yelp_test.csv',header=None,names=['sentiment','review'])


In [5]:
test_set.head()

Unnamed: 0,sentiment,review
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [6]:
len(test_set)

38000

In [7]:
test_set = test_set.iloc[:200]

In [8]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
test_inputs = tokenizer(list(test_set['review']),truncation=True,padding=True)

In [10]:
test_inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [11]:
df = pd.DataFrame({'input_ids':test_inputs['input_ids'],'attention_mask':test_inputs['attention_mask'],'labels':test_set['sentiment']-1})


In [12]:
df.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 10043, 2000, 2060, 4391, 1010, 1045, 203...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 2197, 2621, 1045, 2018, 2019, 6098, 2000...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,"[101, 5379, 3095, 1010, 2168, 29500, 4189, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[101, 1996, 2833, 2003, 2204, 1012, 6854, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,"[101, 2130, 2043, 2057, 2134, 1005, 1056, 2031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [13]:
X_train, X_test, y_train, y_test = train_test_split(df[['input_ids','attention_mask']], df['labels'], test_size=0.2, random_state=42)


In [14]:
X_train.head()

Unnamed: 0,input_ids,attention_mask
79,"[101, 2023, 2173, 2003, 3100, 1012, 1996, 1561...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
197,"[101, 1045, 2428, 2123, 1005, 1056, 3305, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
38,"[101, 2058, 18098, 6610, 2094, 1010, 23592, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
24,"[101, 1045, 2253, 2045, 2651, 999, 1996, 3013,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
122,"[101, 1045, 2109, 2000, 4965, 2474, 21111, 968...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [15]:
X_train = X_train.iloc[:100]

In [16]:
y_train = y_train.iloc[:100]


In [17]:
train_set = X_train.join(y_train).reset_index()

In [18]:
eval_set = X_test.join(y_test).reset_index()


In [19]:
def model_init(trial):
      # Define hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 1, 8)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 4, 16)
    evaluation_strategy = trial.suggest_categorical("evaluation_strategy", ['steps', 'epoch'])
    per_device_eval_batch_size = trial.suggest_int("per_device_eval_batch_size", 4, 16)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)

    return model

In [20]:
class SentimentDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_dataset = SentimentDataset(train_set, train_set['labels'])
eval_dataset = SentimentDataset(eval_set, eval_set['labels'])


In [33]:
def objective(trial):


    # Define training arguments
    training_args = TrainingArguments(
        output_dir='drive/MyDrive/BERT Sentiment/output',
        seed=42,
        logging_dir='drive/MyDrive/BERT Sentiment/output/logs',
        logging_steps=1000
    )
    print("Defined the training arguments")


    model = model_init(trial)
    print("Initialized the model")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset)

    print("Created the trainer")

    results = trainer.train()
    print("Trained the model")

    predictions = trainer.predict(eval_dataset)
    true_labels = eval_dataset.labels
    predicted_labels = predictions.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1: " + str(f1))

    return f1

In [36]:
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=5)


[I 2023-10-30 20:28:31,833] A new study created in memory with name: no-name-8f9bdd3b-2157-4c37-80ba-e168752c81e9


Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


Step,Training Loss


Trained the model


[I 2023-10-30 20:32:48,963] Trial 0 finished with value: 0.8846153846153846 and parameters: {'learning_rate': 1.4934956471517905e-05, 'num_train_epochs': 2, 'gradient_accumulation_steps': 1, 'per_device_train_batch_size': 13, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 16, 'warmup_steps': 296, 'weight_decay': 0.016931469156923753}. Best is trial 0 with value: 0.8846153846153846.


Precision: 0.92
Recall: 0.8518518518518519
F1: 0.8846153846153846
Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


Step,Training Loss


Trained the model


[I 2023-10-30 20:37:06,020] Trial 1 finished with value: 0.8846153846153846 and parameters: {'learning_rate': 2.0227833960983334e-05, 'num_train_epochs': 1, 'gradient_accumulation_steps': 2, 'per_device_train_batch_size': 14, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 13, 'warmup_steps': 487, 'weight_decay': 0.02861148803016117}. Best is trial 0 with value: 0.8846153846153846.


Precision: 0.92
Recall: 0.8518518518518519
F1: 0.8846153846153846
Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


Step,Training Loss


Trained the model


[I 2023-10-30 20:41:14,369] Trial 2 finished with value: 0.8846153846153846 and parameters: {'learning_rate': 2.7649411849337046e-05, 'num_train_epochs': 3, 'gradient_accumulation_steps': 8, 'per_device_train_batch_size': 11, 'evaluation_strategy': 'steps', 'per_device_eval_batch_size': 15, 'warmup_steps': 273, 'weight_decay': 0.06174465017157729}. Best is trial 0 with value: 0.8846153846153846.


Precision: 0.92
Recall: 0.8518518518518519
F1: 0.8846153846153846
Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


Step,Training Loss


Trained the model


[I 2023-10-30 20:45:25,222] Trial 3 finished with value: 0.8846153846153846 and parameters: {'learning_rate': 1.2300091695592216e-05, 'num_train_epochs': 2, 'gradient_accumulation_steps': 7, 'per_device_train_batch_size': 15, 'evaluation_strategy': 'steps', 'per_device_eval_batch_size': 12, 'warmup_steps': 211, 'weight_decay': 0.09478886005198046}. Best is trial 0 with value: 0.8846153846153846.


Precision: 0.92
Recall: 0.8518518518518519
F1: 0.8846153846153846
Defined the training arguments


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized the model
Created the trainer


Step,Training Loss


Trained the model


[I 2023-10-30 20:49:33,757] Trial 4 finished with value: 0.8846153846153846 and parameters: {'learning_rate': 3.070052439984483e-05, 'num_train_epochs': 2, 'gradient_accumulation_steps': 4, 'per_device_train_batch_size': 4, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 15, 'warmup_steps': 475, 'weight_decay': 0.012600540641117985}. Best is trial 0 with value: 0.8846153846153846.


Precision: 0.92
Recall: 0.8518518518518519
F1: 0.8846153846153846


In [37]:
best_hyperparameters = study.best_params

print("Best hyperparameters" + str(best_hyperparameters))

Best hyperparameters{'learning_rate': 1.4934956471517905e-05, 'num_train_epochs': 2, 'gradient_accumulation_steps': 1, 'per_device_train_batch_size': 13, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 16, 'warmup_steps': 296, 'weight_decay': 0.016931469156923753}
