In [None]:
!pip install transformers[torch]
!pip install optuna

In [None]:
# Attach Google Drive; the data files read below live under its MyDrive folder.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
import pandas as pd
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support
import torch
import optuna

In [None]:
# Hyperparameter tuning works on a single parquet file of pre-tokenized inputs,
# which holds 1/15 of the training data.
dev_set = pd.read_parquet(
    '/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_0.parquet'
)


In [None]:
# Sentiment labels for the whole training set, so this frame is much longer
# than dev_set; only its leading rows correspond to the tuning data.
y_train = pd.read_csv(
    '/content/drive/MyDrive/BERT Sentiment/CSVs/y_train_full.csv'
)

In [None]:
# Double check that the two datasets match before combining them:
# the `index` column here should agree, row for row, with the `Unnamed: 0`
# column of y_train shown in the next cell.
dev_set.head()

Unnamed: 0,index,input_ids,attention_mask
0,2522958,"[101, 28844, 2100, 7570, 12868, 8579, 12910, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1160125,"[101, 2307, 4031, 1024, 3819, 4031, 2005, 1280...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,861121,"[101, 12476, 12241, 5017, 1010, 10223, 6508, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,300957,"[101, 2821, 1010, 1056, 1012, 1045, 1012, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1610389,"[101, 2023, 2003, 2028, 1997, 2026, 5440, 5691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# First label rows, to compare their ids against dev_set.head().
y_train.head()

Unnamed: 0.1,Unnamed: 0,sentiment
0,2522958,1
1,1160125,2
2,861121,1
3,300957,1
4,1610389,2


In [None]:
# Number of tokenized examples available for tuning.
len(dev_set)

278857

In [None]:
# Number of rows in the full label file (expected to exceed len(dev_set)).
len(y_train)

4182850

In [None]:
# Last rows of the tuning data, for the end-of-range alignment check.
dev_set.tail()

Unnamed: 0,index,input_ids,attention_mask
278852,2164167,"[101, 2088, 1005, 1055, 5409, 10430, 24795, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278853,2624737,"[101, 17634, 2022, 8059, 1024, 2023, 2793, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278854,1471486,"[101, 2502, 10520, 1024, 1045, 2031, 2035, 942...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278855,923048,"[101, 16334, 4301, 1024, 3819, 2338, 2005, 221...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
278856,2732294,"[101, 4074, 2012, 2014, 2190, 1024, 3752, 2014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Label rows at the same positions as dev_set.tail(): the final five rows of
# the first len(dev_set) labels. Their ids should match the ones shown above.
y_train.iloc[len(dev_set)-5:len(dev_set)]

Unnamed: 0.1,Unnamed: 0,sentiment
278852,2164167,1
278853,2624737,1
278854,1471486,1
278855,923048,2
278856,2732294,2


In [None]:

# Attach labels by row position: join() matches on the frames' indexes, and the
# inner join keeps only the label rows whose position also exists in dev_set.
dev_set = dev_set.join(y_train, how='inner')

# Because the match is positional, verify on every row (not just the head/tail
# inspected by eye) that the inputs were paired with the label of the same id.
assert (dev_set['index'] == dev_set['Unnamed: 0']).all(), (
    "dev_set rows are not aligned with y_train; "
    "merge on the id columns instead of joining by position"
)
len(dev_set)

278857

In [None]:
# Inspect combined dataset: `index` (from the inputs) and `Unnamed: 0` (from the
# labels) should show the same id on each row.
dev_set.tail()

Unnamed: 0.1,index,input_ids,attention_mask,Unnamed: 0,sentiment
278852,2164167,"[101, 2088, 1005, 1055, 5409, 10430, 24795, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2164167,1
278853,2624737,"[101, 17634, 2022, 8059, 1024, 2023, 2793, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2624737,1
278854,1471486,"[101, 2502, 10520, 1024, 1045, 2031, 2035, 942...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1471486,1
278855,923048,"[101, 16334, 4301, 1024, 3819, 2338, 2005, 221...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",923048,2
278856,2732294,"[101, 4074, 2012, 2014, 2190, 1024, 3752, 2014...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2732294,2


In [None]:
# After the join the row id is present twice ('index' from the inputs and
# 'Unnamed: 0' from the labels). Neither is fed to the model, so remove both.
dev_set = dev_set.drop(['Unnamed: 0', 'index'], axis=1)


In [None]:
# Get dev set into expected format for model. Rename by name rather than by
# position so a changed column order cannot silently mislabel the columns.
dev_set = dev_set.rename(columns={'sentiment': 'labels'})
assert list(dev_set.columns) == ['input_ids', 'attention_mask', 'labels'], (
    "unexpected columns: " + str(list(dev_set.columns))
)
# Binary classification expects 0 and 1, not 1 and 2
dev_set['labels'] = dev_set['labels'] - 1
# The shift above is not idempotent: running this cell a second time would
# produce -1/0. Fail loudly instead of training on corrupted labels.
assert dev_set['labels'].isin([0, 1]).all(), (
    "labels must be 0/1 after the shift; reload dev_set if this cell was run twice"
)
dev_set.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 28844, 2100, 7570, 12868, 8579, 12910, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[101, 2307, 4031, 1024, 3819, 4031, 2005, 1280...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 12476, 12241, 5017, 1010, 10223, 6508, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,"[101, 2821, 1010, 1056, 1012, 1045, 1012, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,"[101, 2023, 2003, 2028, 1997, 2026, 5440, 5691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [None]:
# Hold out the leading tenth of the rows for evaluation and train on the rest.
# Both pieces get a fresh 0..n-1 index.
n_eval_rows = len(dev_set) // 10
eval_set = dev_set.iloc[:n_eval_rows].reset_index(drop=True)
train_set = dev_set.iloc[n_eval_rows:].reset_index(drop=True)
train_set.head()



Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 1037, 3803, 17070, 1012, 1024, 2296, 231...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 2307, 2326, 1998, 3835, 2111, 1012, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 1996, 2326, 2001, 2307, 1012, 1996, 2833...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[101, 2077, 1045, 2288, 2026, 2047, 25983, 146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,"[101, 3083, 3319, 2025, 2013, 1037, 2155, 2266...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [None]:
# Use a more reasonably-sized fine-tuning dataset: keep at most the first
# 10,000 evaluation rows and the first 1,500 training rows.
eval_set = eval_set.head(10000)
train_set = train_set.head(1500)


In [None]:
# First evaluation rows; these are the leading rows of dev_set.
eval_set.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 28844, 2100, 7570, 12868, 8579, 12910, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[101, 2307, 4031, 1024, 3819, 4031, 2005, 1280...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 12476, 12241, 5017, 1010, 10223, 6508, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,"[101, 2821, 1010, 1056, 1012, 1045, 1012, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,"[101, 2023, 2003, 2028, 1997, 2026, 5440, 5691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [None]:
def model_init(trial):
    """Register the search space on ``trial`` and return a fresh classifier.

    The eight ``suggest_*`` calls make Optuna sample and record a value for each
    hyperparameter on the trial. None of the sampled values is used in this
    function: the model is always built from the same pretrained checkpoint with
    two output labels, so applying the values is left to the caller.

    Args:
        trial: Optuna trial on which the hyperparameters are sampled.

    Returns:
        A ``DistilBertForSequenceClassification`` loaded from
        'distilbert-base-uncased' with ``num_labels=2``.
    """
    # Search space (sampled in this order, results intentionally not bound).
    trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    trial.suggest_int("num_train_epochs", 1, 3)
    trial.suggest_int("gradient_accumulation_steps", 1, 8)
    trial.suggest_int("per_device_train_batch_size", 4, 16)
    trial.suggest_categorical("evaluation_strategy", ['steps', 'epoch'])
    trial.suggest_int("per_device_eval_batch_size", 4, 16)
    trial.suggest_int("warmup_steps", 100, 500)
    trial.suggest_float("weight_decay", 0.0, 0.1)

    return DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2,
    )




In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one tokenized example per position.

    Args:
        encodings: Object with ``.items()`` yielding ``(feature name, values)``
            pairs, e.g. a dict of lists or a pandas DataFrame (whose ``.items()``
            yields ``(column name, Series)``). A feature named 'labels' is
            ignored here because the label is taken from ``labels``.
        labels: Per-example labels; its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _at(values, idx):
        """Return the element of ``values`` at *position* ``idx``.

        ``Series[idx]`` in pandas looks ``idx`` up as an index label, which only
        coincides with the position while the index is exactly 0..n-1. Using
        ``.iloc`` when it exists makes the lookup positional for any index;
        lists, arrays and tensors are indexed directly.
        """
        return values.iloc[idx] if hasattr(values, 'iloc') else values[idx]

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx`` (features plus 'labels')."""
        item = {
            key: torch.tensor(self._at(val, idx))
            for key, val in self.encodings.items()
            if key != 'labels'  # set from self.labels below; avoid converting twice
        }
        item['labels'] = torch.tensor(self._at(self.labels, idx))
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Wrap both splits for the Trainer. Each whole frame is passed as the
# encodings, and its 'labels' column is passed again as the labels.
train_dataset, eval_dataset = (
    SentimentDataset(frame, frame['labels']) for frame in (train_set, eval_set)
)


In [None]:
# Sanity check: should equal the number of rows kept in train_set.
len(train_dataset)

1500

In [None]:
def objective(trial):
    """Fine-tune DistilBERT with one sampled configuration and score it.

    Args:
        trial: Optuna trial used to sample the training hyperparameters.

    Returns:
        Binary F1 score of the fine-tuned model on ``eval_dataset``.
    """
    # Sample the search space here and actually pass the values to
    # TrainingArguments. Without this every trial trains with the library
    # defaults and the study only compares run-to-run noise. Names and ranges
    # mirror model_init, so a parameter requested by both gets a single value.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 1, 8)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 4, 16)
    # Sampled so it is always reported with the other parameters, but not
    # applied: it only schedules intermediate evaluation passes and the score
    # below comes from the explicit predict() call after training.
    trial.suggest_categorical("evaluation_strategy", ['steps', 'epoch'])
    per_device_eval_batch_size = trial.suggest_int("per_device_eval_batch_size", 4, 16)
    # NOTE(review): with small training sets the total number of optimizer steps
    # can be lower than the sampled warmup, so the peak learning rate may never
    # be reached -- confirm this range is intended.
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    # One directory per trial: trials may run concurrently, and a shared
    # output_dir would let them overwrite each other's checkpoints and logs.
    trial_dir = f'drive/MyDrive/BERT Sentiment/output/trial_{trial.number}'

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=trial_dir,
        logging_dir=f'{trial_dir}/logs',
        logging_steps=1000,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
    )

    # Initialize model
    model = model_init(trial)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset)

    trainer.train()

    # Evaluate model: the predicted class is the arg-max over the output scores.
    predictions = trainer.predict(eval_dataset)
    true_labels = eval_dataset.labels
    predicted_labels = predictions.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1: " + str(f1))

    return f1

In [None]:
# Search for the configuration with the highest objective value (F1).
# NOTE(review): n_jobs > 1 runs trials concurrently in this process -- confirm
# the available accelerator memory can hold that many models at once.
N_TRIALS = 8
N_JOBS = 6

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS)

In [None]:
# Parameter values of the best-scoring trial, as a plain dict.
best_hyperparameters = study.best_params
print(f"Best hyperparameters{best_hyperparameters}")

Best hyperparameters{'learning_rate': 4.122342215733177e-05, 'num_train_epochs': 1, 'gradient_accumulation_steps': 2, 'per_device_train_batch_size': 12, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 5, 'warmup_steps': 391, 'weight_decay': 0.08139944860406301}


In [None]:
# Full record of the best trial: score, timings, parameters and distributions.
print(study.best_trial)


FrozenTrial(number=3, state=TrialState.COMPLETE, values=[0.9213952101758919], datetime_start=datetime.datetime(2023, 11, 1, 2, 55, 36, 800778), datetime_complete=datetime.datetime(2023, 11, 1, 3, 3, 54, 928101), params={'learning_rate': 4.122342215733177e-05, 'num_train_epochs': 1, 'gradient_accumulation_steps': 2, 'per_device_train_batch_size': 12, 'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 5, 'warmup_steps': 391, 'weight_decay': 0.08139944860406301}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'num_train_epochs': IntDistribution(high=3, log=False, low=1, step=1), 'gradient_accumulation_steps': IntDistribution(high=8, log=False, low=1, step=1), 'per_device_train_batch_size': IntDistribution(high=16, log=False, low=4, step=1), 'evaluation_strategy': CategoricalDistribution(choices=('steps', 'epoch')), 'per_device_eval_batch_size': IntDistribution(high=16, log=Fa

In [None]:
import pickle

# Persist the best parameter dict to Drive so it can be reloaded later.
best_params_path = '/content/drive/MyDrive/BERT Sentiment/output/best_hyperparameters.pkl'
with open(best_params_path, mode='wb') as outfile:
    pickle.dump(best_hyperparameters, outfile)
