# Text Classification with Pre-Trained Transformer Models


In [1]:
# Checkpoint identifier passed to `AutoTokenizer.from_pretrained` and
# `AutoModelForSequenceClassification.from_pretrained`; it is also used to
# build the file name under which the fine-tuned weights are saved.
model_name = "google-bert/bert-base-uncased"

In [2]:
import ast
import os

import pandas as pd

# Both splits are semicolon-delimited CSV files located one directory up.
train_df, test_df = (
    pd.read_csv(csv_path, delimiter=";")
    for csv_path in ("../Recipes_Training.csv", "../Recipes_Test.csv")
)

## 1. Dataset preprocessing

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Convert the stringified list of ingredients into one space-separated string per row.
# `ast.literal_eval` only parses Python literals, so a malformed or hostile CSV cell
# raises an error instead of executing arbitrary code the way `eval` would.
train_df["ingredients_str"] = train_df["ingredients"].apply(lambda x: " ".join(ast.literal_eval(x)))
test_df["ingredients_str"] = test_df["ingredients"].apply(lambda x: " ".join(ast.literal_eval(x)))

# Encode the cuisine label using LabelEncoder.
# The encoder is fitted on the training split only and then re-used for the test
# split so both splits share the same cuisine -> integer mapping.
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["cuisine"])
test_df["label"] = label_encoder.transform(test_df["cuisine"])

# Check the label encoding (the position in `classes_` is the integer label)
label_encoder.classes_

array(['cajun_creole', 'chinese', 'french', 'indian', 'italian',
       'mexican', 'southern_us', 'thai'], dtype=object)

## 2. Dataset preparation

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head gets one output per cuisine known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

In [7]:
class CuisineDataset(Dataset):
	"""Map-style dataset that tokenizes one ingredient string per item.

	Reads the ``ingredients_str`` and ``label`` columns of ``df`` and, on
	access, returns a dict with ``input_ids``, ``attention_mask`` and
	``label`` for the requested row.
	"""

	# Tokenizer output fields that are forwarded (flattened) to the caller.
	_ENCODED_FIELDS = ("input_ids", "attention_mask")

	def __init__(self, df, tokenizer, max_len):
		"""Store the text/label arrays plus the tokenizer settings."""
		self.tokenizer = tokenizer
		self.max_len = max_len
		self.ingredients = df["ingredients_str"].values
		self.labels = df["label"].values

	def __len__(self):
		"""Number of rows available in the dataset."""
		return len(self.ingredients)

	def __getitem__(self, idx):
		"""Tokenize row ``idx`` and return its tensors together with its label."""
		text = self.ingredients[idx]
		target = self.labels[idx]

		# Every item is truncated/padded to exactly `max_len` tokens.
		encoded = self.tokenizer.encode_plus(
			text,
			add_special_tokens=True,
			max_length=self.max_len,
			truncation=True,
			padding="max_length",
			return_attention_mask=True,
			return_token_type_ids=False,
			return_tensors="pt",
		)

		# Flatten each tokenizer tensor to 1-D before handing it back.
		item = {field: encoded[field].flatten() for field in self._ENCODED_FIELDS}
		item["label"] = torch.tensor(target, dtype=torch.long)
		return item

# Tokenization length and batch size shared by both splits
MAX_LEN, BATCH_SIZE = 128, 16

# Wrap each split in the tokenizing dataset
train_dataset, test_dataset = (
    CuisineDataset(split_df, tokenizer, MAX_LEN) for split_df in (train_df, test_df)
)

# Only the training batches are shuffled; the test order stays fixed
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

## 4. Fine-tune a Pre-trained Transformer Model

In [8]:
from transformers import DistilBertForSequenceClassification, AdamW
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score

In [9]:
# Prefer CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Move the model onto the selected device
model = model.to(device)

Using device: cuda


In [10]:
# Set up the optimizer and loss function.
# Use PyTorch's own AdamW: the `AdamW` class re-exported by `transformers` is
# deprecated in favour of `torch.optim.AdamW`. `eps` and `weight_decay` are pinned
# to the values the transformers implementation used by default, so the update
# rule stays the same (torch's defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
loss_fn = CrossEntropyLoss().to(device)



## 5. Train the Model

In [11]:
import sys

def train_epoch(epoch, model, data_loader, loss_fn, optimizer, device, total_epochs):
	"""Run one training pass over ``data_loader`` and log progress to stdout.

	Args:
		epoch: Zero-based epoch index; it is displayed as ``epoch + 1``.
		model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
			whose result exposes ``.loss`` and ``.logits``.
		data_loader: Loader yielding dict batches with the keys ``input_ids``,
			``attention_mask`` and ``label``.
		loss_fn: Not used by this function (the loss comes from the model
			output); kept so existing callers do not need to change.
		optimizer: Optimizer that is zeroed and stepped once per batch.
		device: Device the batch tensors are moved to before the forward pass.
		total_epochs: Total number of epochs, used only in the progress text.

	Returns:
		Tuple ``(avg_accuracy, avg_loss)``: the number of correct predictions
		divided by the dataset size, and the summed batch loss divided by
		the number of batches.
	"""
	model.train()

	# Initialize variables to track loss and accuracy
	total_loss = 0
	correct_predictions = 0

	# Get the number of steps and total dataset size
	total_steps = len(data_loader)
	dataset_size = len(data_loader.dataset)

	# current epoch, which we will log
	current_epoch = epoch + 1

	# Iterate through each batch in the data loader
	for batch_idx, batch in enumerate(data_loader):

		# Calculate current sample index (the last batch may be smaller than batch_size)
		current_sample = batch_idx * data_loader.batch_size + len(batch["input_ids"])

		# Display the progress. Uses the `total_epochs` argument rather than a
		# module-level constant so the function does not depend on notebook globals.
		sys.stdout.write(f'\rEpoch {current_epoch}/{total_epochs} [{current_sample}/{dataset_size}]')
		sys.stdout.flush()

		# Move input data and labels to the target device
		input_ids = batch['input_ids'].to(device)
		attention_mask = batch['attention_mask'].to(device)
		labels = batch['label'].to(device)

		# Zero the gradients for the optimizer
		optimizer.zero_grad()

		# Forward pass through the model (the model returns the loss because labels are passed)
		outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
		loss = outputs.loss
		logits = outputs.logits

		# Backward pass and optimization step
		loss.backward()
		optimizer.step()

		# Update total loss
		total_loss += loss.item()

		# Calculate accuracy for the current batch
		_, preds = torch.max(logits, dim=1)
		correct_predictions += torch.sum(preds == labels)

	avg_accuracy = correct_predictions.double() / dataset_size
	avg_loss = total_loss / total_steps

	# Overwrite the progress line with the final epoch summary
	sys.stdout.write(f'\rEpoch {current_epoch}/{total_epochs} [{dataset_size}/{dataset_size}] - train_loss: {avg_loss:.3f} ; train_acc: {avg_accuracy:.3f}\n')
	sys.stdout.flush()

	return avg_accuracy, avg_loss


#* Main training loop: one call to train_epoch per pass over the training data
EPOCHS = 3
for epoch in range(EPOCHS):
	train_acc, train_loss = train_epoch(
		epoch,
		model,
		train_dataloader,
		loss_fn,
		optimizer,
		device,
		EPOCHS,
	)

Epoch 1/3 [8000/8000] - train_loss: 1.143 ; train_acc: 0.630
Epoch 2/3 [8000/8000] - train_loss: 0.597 ; train_acc: 0.811
Epoch 3/3 [8000/8000] - train_loss: 0.486 ; train_acc: 0.843


## 6. Evaluation

In [12]:
from sklearn.metrics import accuracy_score, f1_score, classification_report


def eval_model(model, data_loader, loss_fn, device, label_encoder):
	"""Evaluate ``model`` on ``data_loader`` without updating its weights.

	Args:
		model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
			whose result exposes ``.loss`` and ``.logits``.
		data_loader: Loader yielding dict batches with the keys ``input_ids``,
			``attention_mask`` and ``label``.
		loss_fn: Not used by this function (the loss comes from the model
			output); kept so existing callers do not need to change.
		device: Device the batch tensors are moved to before the forward pass.
		label_encoder: Fitted encoder whose ``classes_`` supplies the row names
			of the classification report.

	Returns:
		Tuple ``(accuracy, avg_loss, f1, classification_rep)``: correct
		predictions divided by the dataset size, summed batch loss divided by
		the number of batches, the weighted F1 score, and the text report.
	"""
	model.eval()
	correct_predictions = 0
	total_loss = 0

	all_preds = []
	all_labels = []

	# Gradients are not needed for evaluation
	with torch.no_grad():
		for batch in data_loader:
			input_ids = batch['input_ids'].to(device)
			attention_mask = batch['attention_mask'].to(device)
			labels = batch['label'].to(device)

			outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
			loss = outputs.loss
			logits = outputs.logits

			total_loss += loss.item()

			_, preds = torch.max(logits, dim=1)
			correct_predictions += torch.sum(preds == labels)

			# Store predictions and labels for F1 score calculation
			all_preds.extend(preds.cpu().numpy())
			all_labels.extend(labels.cpu().numpy())

	# Calculate accuracy
	accuracy = correct_predictions.double() / len(data_loader.dataset)

	# Calculate F1 score
	f1 = f1_score(all_labels, all_preds, average='weighted')

	# Get the class names from the label encoder
	class_names = label_encoder.classes_

	# Calculate classification report.
	# Pass the full set of label ids explicitly: without `labels`, the report
	# infers the classes from the data and raises when a class is missing from
	# both the targets and the predictions, because `target_names` would then
	# have a different length.
	label_ids = list(range(len(class_names)))
	classification_rep = classification_report(
		all_labels, all_preds, labels=label_ids, target_names=class_names
	)

	return accuracy, total_loss / len(data_loader), f1, classification_rep


# Evaluate on the held-out test split.
# The report text is stored under `test_report` rather than `classification_report`,
# so the sklearn function of that name (which `eval_model` calls) is not overwritten
# and `eval_model` can be called again later.
test_acc, test_loss, test_f1, test_report = eval_model(model, test_dataloader, loss_fn, device, label_encoder)
print(f'test_acc: {test_acc:.3f} ; test_loss: {test_loss:.3f}, test_f1: {test_f1:.3f}')
print(f"\n{test_report}")

test_acc: 0.883 ; test_loss: 0.371, test_f1: 0.883

              precision    recall  f1-score   support

cajun_creole       0.80      0.88      0.84       250
     chinese       0.94      0.92      0.93       250
      french       0.90      0.80      0.85       250
      indian       0.95      0.96      0.96       250
     italian       0.88      0.85      0.87       250
     mexican       0.95      0.91      0.93       250
 southern_us       0.73      0.83      0.78       250
        thai       0.93      0.90      0.91       250

    accuracy                           0.88      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.89      0.88      0.88      2000



**Save the model**

In [13]:
# Build a timestamped file name; "/" in the checkpoint name is replaced so the
# name stays a single path component.
current_datetime = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
model_name_clean = model_name.replace("/", "_")
file_path = f"../models/{model_name_clean}_{current_datetime}"

# Create the target directory first: torch.save fails if it does not exist yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Only the weights (state dict) are written, not the full model object.
torch.save(model.state_dict(), file_path)
print(f"✅ Model saved to {file_path}")

✅ Model saved to ../models/google-bert_bert-base-uncased_2024-10-24_22-39-04
