In [32]:

import os

from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/School/Howest/TI-AI/Sem3/TrendingTopics/ex1')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Trending Topics in AI
Assignment 1: Apply and fine-tune transformer models

In [33]:
# Check if GPU is available
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [34]:
import pandas as pd

train_df = pd.read_csv("./Recipes_Training.csv", delimiter=";")
test_df = pd.read_csv("./Recipes_Test.csv", delimiter=";")

## 1. Load pre-trained model

In [35]:
train_df

Unnamed: 0,id,cuisine,ingredients
0,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
1,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']"
2,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay..."
3,3735,italian,"['sugar', 'pistachio nuts', 'white almond bark..."
4,16903,mexican,"['olive oil', 'purple onion', 'fresh pineapple..."
...,...,...,...
7995,33556,cajun_creole,"['andouille sausage', 'water', 'cajun seasonin..."
7996,14725,cajun_creole,"['black pepper', 'grating cheese', 'all-purpos..."
7997,7895,cajun_creole,"['fettucine', 'cajun seasoning', 'salt', 'pepp..."
7998,23661,cajun_creole,"['chicken broth', 'crushed tomatoes', 'worcest..."


In [36]:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 128  # Truncate sequences to 128 tokens
train_encodings = tokenizer(list(train_df['ingredients']), truncation=True, padding=True, max_length=MAX_LEN)
test_encodings = tokenizer(list(test_df['ingredients']), truncation=True, padding=True, max_length=MAX_LEN)




## 2. Dataset preprocessing

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
# Convert the list of ingredients to a string for each row
train_df["ingredients_str"] = train_df["ingredients"].apply(lambda x: " ".join(eval(x)))
test_df["ingredients_str"] = test_df["ingredients"].apply(lambda x: " ".join(eval(x)))

# Encode the cuisine label using LabelEncoder
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["cuisine"])
test_df["label"] = label_encoder.transform(test_df["cuisine"])

# Check the label encoding
label_encoder.classes_

array(['cajun_creole', 'chinese', 'french', 'indian', 'italian',
       'mexican', 'southern_us', 'thai'], dtype=object)

## 3. Dataset preparing

In [39]:
import torch
from torch.utils.data import Dataset, DataLoader

In [40]:
class CuisineDataset(Dataset):
	def __init__(self, df, tokenizer, max_len):
		self.ingredients = df["ingredients_str"].values
		self.labels = df["label"].values
		self.tokenizer = tokenizer
		self.max_len = max_len

	def __len__(self):
		return len(self.ingredients)

	def __getitem__(self, idx):
		ingredient = self.ingredients[idx]
		label = self.labels[idx]

		encoding = self.tokenizer.encode_plus(
			ingredient,
			truncation=True,
			add_special_tokens=True,
			max_length=self.max_len,
			return_token_type_ids=False,
			padding="max_length",
			return_attention_mask=True,
			return_tensors="pt",
		)

		return {
			"input_ids": encoding["input_ids"].flatten(),
			"attention_mask": encoding["attention_mask"].flatten(),
			"label": torch.tensor(label, dtype=torch.long)
		}

MAX_LEN = 128
BATCH_SIZE = 32

# Create the datasets
train_dataset = CuisineDataset(train_df, tokenizer, MAX_LEN)
test_dataset = CuisineDataset(test_df, tokenizer, MAX_LEN)

# Create the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## 4. Fine-tune a Pre-trained Transformer Model

In [41]:
from transformers import BertForSequenceClassification, AdamW
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score

In [42]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)

Using device: cuda


In [44]:
# Set up the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = CrossEntropyLoss().to(device)



In [45]:
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup

# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Calculate total steps
total_steps = len(train_dataloader) // gradient_accumulation_steps * num_epochs

# Define the scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),  # 10% of total steps for warm-up
    num_training_steps=total_steps
)


## 5. Train the Model

In [46]:
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup

num_epochs = 3
gradient_accumulation_steps = 4
step = 0
validation_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False);

early_stopping = True
patience = 2  # Number of epochs to wait before early stopping
epochs_no_improve = 0
best_validation_loss = float('inf')


scaler = GradScaler()  # Mixed precision training

# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Calculate total steps
total_steps = len(train_dataloader) // gradient_accumulation_steps * num_epochs

# Define the scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),  # Warm-up for 10% of the steps
    num_training_steps=total_steps
)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Do not zero gradients here; we accumulate them
        with autocast():
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['label'].to(device)
            )
            loss = outputs.loss / gradient_accumulation_steps

        # Scaled backward for mixed precision
        scaler.scale(loss).backward()

        # Gradient accumulation and scaler step
        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        total_loss += loss.item()
        step += 1  # Increment step

    # Validation step
    model.eval()
    validation_loss = 0
    with torch.no_grad():
        for batch in validation_dataloader:
            with autocast():
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    labels=batch['label'].to(device)
                )
                loss = outputs.loss
                validation_loss += loss.item()
    validation_loss /= len(validation_dataloader)

    print(f'Epoch {epoch+1}, Validation Loss: {validation_loss}')

    # Early stopping logic
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        epochs_no_improve = 0  # Reset counter
        # Optionally save the best model here
    else:
        epochs_no_improve += 1

    if early_stopping and epochs_no_improve >= patience:
        print('Stopping early due to no improvement in validation loss')
        break


  scaler = GradScaler()  # Mixed precision training
  with autocast():
  with autocast():


Epoch 1, Validation Loss: 1.0135126265268477
Epoch 2, Validation Loss: 0.7867029583643353
Epoch 3, Validation Loss: 0.7293267022995722


  scaler = GradScaler()  # Mixed precision training
  with autocast():
  with autocast():


Epoch 1, Validation Loss: 1.5787736196366569
Epoch 2, Validation Loss: 1.1274391053214905
Epoch 3, Validation Loss: 1.016219002859933


## 6. Evaluation

In [47]:
from sklearn.metrics import accuracy_score, f1_score


def eval_model(model, data_loader, loss_fn, device):
	model.eval()
	correct_predictions = 0
	total_loss = 0

	all_preds = []
	all_labels = []

	with torch.no_grad():
		for batch in data_loader:
			input_ids = batch['input_ids'].to(device)
			attention_mask = batch['attention_mask'].to(device)
			labels = batch['label'].to(device)

			outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
			loss = outputs.loss
			logits = outputs.logits

			total_loss += loss.item()

			_, preds = torch.max(logits, dim=1)
			correct_predictions += torch.sum(preds == labels)

			# Store predictions and labels for F1 score calculation
			all_preds.extend(preds.cpu().numpy())
			all_labels.extend(labels.cpu().numpy())

	# Calculate accuracy
	accuracy = correct_predictions.double() / len(data_loader.dataset)

	# Calculate F1 score
	f1 = f1_score(all_labels, all_preds, average='weighted')

	return accuracy, total_loss / len(data_loader), f1


test_acc, test_loss, test_f1 = eval_model(model, test_dataloader, loss_fn, device)
print(f'Test Accuracy: {test_acc}, Test Loss: {test_loss}, Test F1 Score: {test_f1}')

Test Accuracy: 0.724, Test Loss: 1.0161969652251592, Test F1 Score: 0.7151875052905601
