In [None]:
!pip install transformers

# Import some libraries

In [None]:
import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# from unidecode import unidecode
import re
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.metrics
import seaborn as sns
import random
import sys

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Data loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the train / test / validation splits from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/ys19-2023-assignment-4a/train_set.csv")
test_df = pd.read_csv("/kaggle/input/ys19-2023-assignment-4a/test_set.csv")
valid_df = pd.read_csv("/kaggle/input/ys19-2023-assignment-4a/valid_set.csv")

# Encode the sentiment strings as integer class ids on the two labelled
# splits; the test split gets no 'Label_Map' column.
sentiment_to_id = {'NEGATIVE' : 0, 'NEUTRAL' : 1, 'POSITIVE' : 2}
for labelled_df in (train_df, valid_df):
    labelled_df['Label_Map'] = labelled_df['Sentiment'].map(sentiment_to_id)

# Data preprocessing

In [None]:
import unicodedata

def strip_accents_and_lowercase(s):
    """Drop URL/mention tokens, strip combining accents and lowercase the text.

    The input is split on whitespace; every token that begins with ``http``
    or ``@`` is discarded and the remaining tokens are re-joined with single
    spaces. The result is NFD-normalised so accents become separate combining
    marks (Unicode category ``Mn``), which are removed before lowercasing.
    """
    kept_tokens = []
    for token in s.split():
        # str.startswith accepts a tuple of prefixes.
        if token.startswith(('http', '@')):
            continue
        kept_tokens.append(token)

    decomposed = unicodedata.normalize('NFD', ' '.join(kept_tokens))
    without_marks = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(without_marks).lower()

In [None]:
import os

print("Preproccesing data")
# Add a cleaned 'processed_text' column to each split (train, valid, test).
for split_df in (train_df, valid_df, test_df):
    split_df['processed_text'] = split_df['Text'].apply(strip_accents_and_lowercase)

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Pull the preprocessed text column out of each split (nothing is written to disk here).
train_text = train_df['processed_text']
valid_text = valid_df['processed_text']
test_text = test_df['processed_text']

# Report how many rows each split contains.
print("TRAIN")
print(len(train_text))
print("VALID")
print(len(valid_text))
print("TEST")
print(len(test_text))

In [None]:
# Show the first few preprocessed training texts as a sanity check.
print(train_text.head())

# GPU

In [None]:
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

if not gpu_found:
    print('No GPU available, using the CPU instead.')
else:
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Import tokenizers

In [None]:
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer that belongs to each pretrained checkpoint:
# `tokenizer` for GreekBERT and `tokenizerDistil` for DistilGREEK-BERT.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
tokenizerDistil = AutoTokenizer.from_pretrained("EftychiaKarav/DistilGREEK-BERT")

I didn't find any difference between the two tokenizers, so from here on I will use `tokenizer` (the GreekBERT one).

In [None]:
# I am checking if tokenizers give different results

def are_lists_equal(list1, list2):
    """Return True when both sequences have the same length and equal items pairwise."""
    same_length = len(list1) == len(list2)
    return same_length and all(left == right for left, right in zip(list1, list2))

# Compare the two tokenizers sentence by sentence over the training texts.
# The loop's `else` branch runs only when no mismatch triggered `break`,
# so exactly one of 'Not equal' / 'Equal' is printed.
for sentence in train_text:
    if not are_lists_equal(tokenizer.tokenize(sentence), tokenizerDistil.tokenize(sentence)):
        print('Not equal')
        break
else:
    print('Equal')

In [None]:
# Walk one example (index 4) through the tokenizer: raw text -> tokens -> ids.
sample_text = train_text[4]
sample_tokens = tokenizer.tokenize(sample_text)

print(' Original: ', sample_text)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

# Find max sequence

In [None]:
# Encode every training sentence, then every validation sentence, without
# padding, so the real token counts (special tokens included) can be inspected.
input_ids = [
    tokenizer.encode_plus(sentence, add_special_tokens = True)
    for split_text in (train_text, valid_text)
    for sentence in split_text
]

In [None]:
# Longest encoded sequence (in tokens) among the encodings gathered in `input_ids`.
# The result is stored as `max_len` rather than `max` so the builtin max() is
# not shadowed for the rest of the session; `default=0` covers an empty list.
max_len = max((len(inputs['input_ids']) for inputs in input_ids), default=0)
print(max_len)

# Make tensors using padding

In [None]:
def tokenization(sentences, labels, max_length=150):
    """Encode sentences into fixed-width id/mask tensors and tensorise the labels.

    Args:
        sentences: iterable of text strings to encode with the module-level
            ``tokenizer``.
        labels: per-sentence class ids, in any form ``torch.tensor`` accepts.
        max_length: width every encoded sequence is padded or truncated to.
            The default of 150 was chosen a bit above the longest sequence
            measured on the train/validation texts (137).

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``; the first two are the
        per-sentence encodings concatenated along dim 0.
    """
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation a sentence longer than max_length yields a
            # wider tensor and torch.cat below fails on the size mismatch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Tokenization

In [None]:
# Encode the training and validation texts together with their integer labels.
train_input_ids,train_attention_masks,train_labels = tokenization(train_text,train_df['Label_Map'])
valid_input_ids,valid_attention_masks,valid_labels = tokenization(valid_text,valid_df['Label_Map'])

# Print input

In [None]:
# Print the shape of every tokenized tensor: training ones first, then validation.
for split_tensor in (
    train_input_ids, train_attention_masks, train_labels,
    valid_input_ids, valid_attention_masks, valid_labels,
):
    print(split_tensor.shape)

# Create Data Loaders

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig,DistilBertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

batch_size = 32

# Wrap the tokenized tensors so each item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
valid_dataset = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)

# Training batches are drawn in random order; validation batches keep dataset order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = batch_size)

valid_sampler = SequentialSampler(valid_dataset)
validation_dataloader = DataLoader(valid_dataset, sampler = valid_sampler, batch_size = batch_size)

# Useful functions for accuracy and timing

In [None]:
import numpy as np

# Batch accuracy: share of rows whose highest-scoring class equals the label.
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose argmax matches ``labels``.

    ``preds`` holds one row of class scores per example; ``labels`` is a
    NumPy array with the true class ids and is flattened before comparing.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Roc curve


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

def plot_roc_curve(y_true, probabilities):
    """Draw one-vs-rest ROC curves, one per class, on a single figure.

    Args:
        y_true: true class ids (0, 1 or 2) for every sample.
        probabilities: list of per-batch score arrays; they are stacked
            row-wise into one matrix with a column of scores per class.
    """
    scores = np.vstack(probabilities)

    # One indicator column per class, so each class is scored against the rest.
    onehot_truth = label_binarize(y_true, classes=[0, 1, 2])

    plt.figure(figsize=(8, 8))

    for class_idx in range(scores.shape[1]):
        fpr, tpr, _ = roc_curve(onehot_truth[:, class_idx], scores[:, class_idx])
        plt.plot(fpr, tpr, lw=2, label=f'Class {class_idx} (AUC = {auc(fpr, tpr):.2f})')

    # Dashed diagonal as the reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

# Training and evaluation function

In [None]:
import random
import numpy as np
from sklearn.metrics import f1_score
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix

def training(model,epochs,scheduler,train_dataloader,validation_dataloader,total_steps,optimizer):
    """Fine-tune ``model`` and evaluate it on the validation loader after each epoch.

    Every epoch runs one pass over ``train_dataloader`` (gradient norm clipped
    to 1.0, optimizer and scheduler stepped once per batch), then one pass over
    ``validation_dataloader`` that prints accuracy, micro F1, a classification
    report, the confusion matrix and a ROC plot. A learning curve of the
    per-epoch average losses is plotted once all epochs are done.

    Args:
        model: classification model; it is called with ``attention_mask`` and
            ``labels`` and must return an object exposing ``.loss`` and ``.logits``.
        epochs: number of passes over the training data.
        scheduler: learning-rate scheduler, stepped after every optimizer step.
        train_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
        validation_dataloader: yields batches with the same layout.
        total_steps: not used inside this function; kept so existing calls stay valid.
        optimizer: optimizer that updates the model parameters.

    Returns:
        Micro-averaged F1 score on the validation data after the last epoch,
        or 0.0 when ``epochs`` is 0.
    """
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_loss = []
    validation_loss = []
    # Defined up front so the function still returns a value when epochs == 0.
    f1 = 0.0
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            # Progress line every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)

                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            total_train_loss += outputs.loss.item()

            outputs.loss.backward()

            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        training_time = format_time(time.time() - t0)
        training_loss.append(avg_train_loss)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0

        total_predictions = []
        total_labels = []
        total_probabilities = []
        for batch in validation_dataloader:

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():

                outputs = model(b_input_ids,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            total_eval_loss += outputs.loss.item()

            # Normalise over the class axis (dim=1). With dim=0 each class
            # column is normalised across the batch and the per-row argmax
            # no longer agrees with the argmax of the logits.
            probabilities = F.softmax(outputs.logits, dim=1)
            total_predictions.extend(torch.argmax(probabilities, dim=1).tolist())
            total_probabilities.append(probabilities.detach().cpu().numpy())

            # Move logits and labels to CPU
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Keep the labels of the loader that was actually passed in, so
            # the metrics below do not depend on a module-level tensor.
            total_labels.extend(label_ids.tolist())

            # Calculate the accuracy for this batch and accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)

        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        y_true_np = np.array(total_labels)
        y_pred_np = np.array(total_predictions)
        f1 = f1_score(y_true_np, y_pred_np, average='micro')

        print("F1 Score:", f1)
        report = classification_report(y_true_np, y_pred_np)

        # Print the report
        print(report)

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_loss.append(avg_val_loss)
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        # Printing the confusion matrix
        print('Confusion Matrix:')
        print(confusion_matrix(y_true_np, y_pred_np))
        print(len(y_true_np))
        # The ROC helper takes per-class probabilities, so pass the softmax output.
        plot_roc_curve(y_true_np, total_probabilities)

    # Plot results
    plt.figure(figsize=(12, 5))
    plt.plot(training_loss, label='Train Loss')
    plt.plot(validation_loss, label='Validation Loss')
    plt.title('Learning Curve')
    plt.xlabel('Epochs', fontsize=12)
    plt.ylabel('Cross Entropy Loss', fontsize=12)
    plt.legend()
    plt.show()
    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return f1



# Create output file using best parameters from optuna

I ran Optuna studies, and the hyperparameters used below come from those runs.

In [None]:
import csv

# Tokenize the test texts with the same settings as the train/validation texts
# (padded to 150 tokens).
input_ids = []
attention_masks = []

for sent in test_text:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 150,
                        padding = 'max_length',
                        # A test sentence longer than 150 tokens would otherwise give
                        # a wider tensor and make torch.cat below fail.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids_test = torch.cat(input_ids, dim=0)
attention_masks_test = torch.cat(attention_masks, dim=0)

test_dataset = TensorDataset(input_ids_test, attention_masks_test)

batch_size = 16

# Sequential order is required: the submission pairs prediction i with test
# row i, so the test batches must not be shuffled.
test_dataloader = DataLoader(test_dataset,sampler = SequentialSampler(test_dataset),batch_size = batch_size)

model = BertForSequenceClassification.from_pretrained('nlpaueb/bert-base-greek-uncased-v1', num_labels = 3, output_attentions = False,output_hidden_states = False)

model.to(device)

epochs = 3

# The scheduler is stepped once per training batch, so its horizon is the
# number of training batches times the number of epochs.
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(model.parameters(), lr = 4e-5,eps = 3.5e-9 )

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0,num_training_steps = total_steps)

training(model,epochs,scheduler,train_dataloader,validation_dataloader,total_steps,optimizer)

In [None]:
predictions = []

# Put the model in evaluation mode explicitly so dropout is disabled here,
# whatever mode the model was left in by earlier cells.
model.eval()

for batch in test_dataloader:

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():

        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # Softmax over the class axis (dim=1); dim=0 would normalise each class
    # across the batch and could change which class wins for a given row.
    probabilities = F.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)
    predictions.extend(predicted_class.tolist())

# Map the integer class ids back to the sentiment names used in the data.
number_to_sentiment = {
    0: "NEGATIVE",
    1: "NEUTRAL",
    2: "POSITIVE",
}

array = [number_to_sentiment[value] for value in predictions]

# Write one row per prediction; Ids start at 1 and follow prediction order.
with open("submission.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Id", "Predicted"])

    for idx, word in enumerate(array, start=1):
        writer.writerow([idx, word])

# Optuna

In [None]:
%pip install optuna

In [None]:
import optuna
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: fine-tune GreekBERT with sampled hyper-parameters.

    Samples the number of epochs, learning rate, batch size and Adam epsilon
    from ``trial``, trains a fresh model on the module-level ``train_dataset``
    and returns the validation F1 reported by ``training``.
    """
    GreekBert = BertForSequenceClassification.from_pretrained('nlpaueb/bert-base-greek-uncased-v1', num_labels = 3, output_attentions = False,output_hidden_states = False)

    # Same device the training loop moves its batches to, so this also runs without a GPU.
    GreekBert.to(device)

    epochs = trial.suggest_int('epochs',1,3)
    LR = trial.suggest_float('LR', 2e-6,2e-4)
    batchsize = trial.suggest_categorical('batchsize',[16,32])
    eps = trial.suggest_float('eps',1e-9, 1e-7)

    # Use the sampled batch size; the module-level `batch_size` would make the
    # 'batchsize' hyper-parameter have no effect on the trial.
    train_dataloader = DataLoader(train_dataset,sampler = RandomSampler(train_dataset),batch_size = batchsize)

    validation_dataloader = DataLoader(valid_dataset, sampler = SequentialSampler(valid_dataset), batch_size = batchsize )

    optimizer = AdamW(GreekBert.parameters(), lr = LR, eps = eps )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0,num_training_steps = total_steps)

    f1 = training(GreekBert,epochs,scheduler,train_dataloader,validation_dataloader,total_steps,optimizer)
    torch.cuda.empty_cache()
    return f1

In [None]:
def Distilobjective(trial):
    """Optuna objective: fine-tune DistilGREEK-BERT with sampled hyper-parameters.

    Samples the number of epochs, learning rate, batch size and Adam epsilon
    from ``trial``, trains a fresh model on the module-level ``train_dataset``
    and returns the validation F1 reported by ``training``.
    """
    DistilGreekBert = DistilBertForSequenceClassification.from_pretrained("EftychiaKarav/DistilGREEK-BERT", num_labels = 3,output_attentions = False,output_hidden_states = False)

    # Same device the training loop moves its batches to, so this also runs without a GPU.
    DistilGreekBert.to(device)

    epochs = trial.suggest_int('epochs',1,3)
    LR = trial.suggest_float('LR', 2e-6,2e-4)
    batchsize = trial.suggest_categorical('batchsize',[16,32])
    eps = trial.suggest_float('eps',1e-9, 1e-7)

    # Use the sampled batch size; the module-level `batch_size` would make the
    # 'batchsize' hyper-parameter have no effect on the trial.
    train_dataloader = DataLoader(train_dataset,sampler = RandomSampler(train_dataset),batch_size = batchsize)

    validation_dataloader = DataLoader(valid_dataset, sampler = SequentialSampler(valid_dataset), batch_size = batchsize )

    optimizer = AdamW(DistilGreekBert.parameters(), lr = LR, eps = eps )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0,num_training_steps = total_steps)

    f1 = training(DistilGreekBert,epochs,scheduler,train_dataloader,validation_dataloader,total_steps,optimizer)
    return f1

In [None]:
# Run the GreekBERT study, maximising the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1) # 1 for faster submission
print("Number of finished trials: ", len(study.trials))

# Summarise the best trial: its objective value, then each sampled parameter.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

In [None]:
# Run the DistilGREEK-BERT study, maximising the value returned by `Distilobjective`.
study = optuna.create_study(direction='maximize')
study.optimize(Distilobjective, n_trials=1) # 1 for faster submission
print("Number of finished trials: ", len(study.trials))

# Summarise the best trial: its objective value, then each sampled parameter.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

I paid 24 euros for GPU compute units in Google Colab to complete this assignment.

Code inspired from https://mccormickml.com/2019/07/22/BERT-fine-tuning/