# Trending Topics in AI
Assignment 1: Apply and fine-tune transformer models

In [1]:
# Check if GPU is available
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
import pandas as pd

train_df = pd.read_csv("./Recipes_Training.csv", delimiter=";")
test_df = pd.read_csv("./Recipes_Test.csv", delimiter=";")

## 1. Load pre-trained model

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


## 2. Dataset preprocessing

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Convert the list of ingredients to a string for each row
train_df["ingredients_str"] = train_df["ingredients"].apply(lambda x: " ".join(eval(x)))
test_df["ingredients_str"] = test_df["ingredients"].apply(lambda x: " ".join(eval(x)))

# Encode the cuisine label using LabelEncoder
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["cuisine"])
test_df["label"] = label_encoder.transform(test_df["cuisine"])

# Check the label encoding
label_encoder.classes_

array(['cajun_creole', 'chinese', 'french', 'indian', 'italian',
       'mexican', 'southern_us', 'thai'], dtype=object)

## 3. Dataset preparing

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

In [7]:
class CuisineDataset(Dataset):
	def __init__(self, df, tokenizer, max_len):
		self.ingredients = df["ingredients_str"].values
		self.labels = df["label"].values
		self.tokenizer = tokenizer
		self.max_len = max_len
	
	def __len__(self):
		return len(self.ingredients)
	
	def __getitem__(self, idx):
		ingredient = self.ingredients[idx]
		label = self.labels[idx]
		
		encoding = self.tokenizer.encode_plus(
			ingredient,
			truncation=True,
			add_special_tokens=True,
			max_length=self.max_len,
			return_token_type_ids=False,
			padding="max_length",
			return_attention_mask=True,
			return_tensors="pt",
		)
		
		return {
			"input_ids": encoding["input_ids"].flatten(),
			"attention_mask": encoding["attention_mask"].flatten(),
			"label": torch.tensor(label, dtype=torch.long)
		}

MAX_LEN = 128
BATCH_SIZE = 16

# Create the datasets
train_dataset = CuisineDataset(train_df, tokenizer, MAX_LEN)
test_dataset = CuisineDataset(test_df, tokenizer, MAX_LEN)

# Create the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## 4. Fine-tune a Pre-trained Transformer Model

In [8]:
from transformers import RobertaForSequenceClassification, AdamW
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score

In [9]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_encoder.classes_))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)

Using device: cuda


In [11]:
# Set up the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = CrossEntropyLoss().to(device)



## 5. Train the Model

In [12]:
import sys

def train_epoch(model, data_loader, loss_fn, optimizer, device):
	model.train()
	total_loss = 0
	correct_predictions = 0

	total_steps = len(data_loader)
	dataset_size = len(data_loader.dataset)

	for batch_idx, batch in enumerate(data_loader):
		
		# Calculate current sample index
		current_sample = batch_idx * data_loader.batch_size + len(batch["input_ids"])

		# Calculate the progress
		sys.stdout.write(f'\rBatch {current_sample}/{dataset_size}')
		sys.stdout.flush()


		input_ids = batch['input_ids'].to(device)
		attention_mask = batch['attention_mask'].to(device)
		labels = batch['label'].to(device)

		# Zero the gradients
		optimizer.zero_grad()

		# Forward pass
		outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
		loss = outputs.loss
		logits = outputs.logits

		# Backward pass and optimization
		loss.backward()
		optimizer.step()
		
		total_loss += loss.item()

		# Calculate accuracy
		_, preds = torch.max(logits, dim=1)
		correct_predictions += torch.sum(preds == labels)

	return correct_predictions.double() / len(data_loader.dataset), total_loss / len(data_loader)

EPOCHS = 3
for epoch in range(EPOCHS):
	print(f'\nEpoch {epoch+1}/{EPOCHS}')
	train_acc, train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, device)
	print(f'\nEpoch {epoch+1}/{EPOCHS}, Training Loss: {train_loss}, Training Accuracy: {train_acc}')


Epoch 1/3
Batch 8000/8000
Epoch 1/3, Training Loss: 1.3070824217200279, Training Accuracy: 0.52425

Epoch 2/3
Batch 8000/8000
Epoch 2/3, Training Loss: 0.7407156108915806, Training Accuracy: 0.7445

Epoch 3/3
Batch 8000/8000
Epoch 3/3, Training Loss: 0.6126032767742873, Training Accuracy: 0.789625


## 6. Evaluation

In [13]:
from sklearn.metrics import accuracy_score, f1_score


def eval_model(model, data_loader, loss_fn, device):
	model.eval()
	correct_predictions = 0
	total_loss = 0

	all_preds = []
	all_labels = []

	with torch.no_grad():
		for batch in data_loader:
			input_ids = batch['input_ids'].to(device)
			attention_mask = batch['attention_mask'].to(device)
			labels = batch['label'].to(device)

			outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
			loss = outputs.loss
			logits = outputs.logits

			total_loss += loss.item()

			_, preds = torch.max(logits, dim=1)
			correct_predictions += torch.sum(preds == labels)

			# Store predictions and labels for F1 score calculation
			all_preds.extend(preds.cpu().numpy())
			all_labels.extend(labels.cpu().numpy())
		
	# Calculate accuracy
	accuracy = correct_predictions.double() / len(data_loader.dataset)

	# Calculate F1 score
	f1 = f1_score(all_labels, all_preds, average='weighted')

	return accuracy, total_loss / len(data_loader), f1


test_acc, test_loss, test_f1 = eval_model(model, test_dataloader, loss_fn, device)
print(f'Test Accuracy: {test_acc:.3f}, Test Loss: {test_loss:.3f}, Test F1 Score: {test_f1:.3f}')

Test Accuracy: 0.850, Test Loss: 0.451, Test F1 Score: 0.850
