In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the raw review table; the two columns used below are
# "business_name" (classification target) and "text" (model input).
data = pd.read_csv("reviews.csv")
restaurants_names = data["business_name"]

# Review bodies as a plain Python list, one entry per row.
reviews = data["text"].values.tolist()

# Restaurant name of every row, plus the de-duplicated list of names.
label = restaurants_names.values.tolist()
classes = restaurants_names.drop_duplicates().values.tolist()

# Integer-encode the names: `labels` holds one code per row and `dic`
# holds the unique names those codes index into.
labels, dic = restaurants_names.factorize()

In [None]:
class RestaurantRecommendationDataset(Dataset):
    """Map-style dataset that pairs each review text with its class label.

    Tokenization is done lazily, one sample at a time, in ``__getitem__``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Keep references to the samples and the tokenization settings.

        Args:
            texts: Indexable collection of review texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, ...)`` whose
                result exposes ``'input_ids'`` and ``'attention_mask'``.
            max_length: Forwarded to the tokenizer as ``max_length``.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both
            flattened tokenizer outputs) and ``'label'`` (a tensor built from
            the stored label).
        """
        raw_text = self.texts[idx]
        target = self.labels[idx]
        # Ask the tokenizer for PyTorch tensors, padded to max_length and
        # truncated when longer.
        encoded = self.tokenizer(
            raw_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # flatten() drops the leading batch axis of the single encoded text.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target)
        return sample

In [None]:
class BERTREC(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier."""

    def __init__(self, bert_model_name, num_classes):
        """Build the encoder and the classification head.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits produced by the linear head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head's input width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of encoded texts.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and report the mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and expected to return class logits.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch cross-entropy losses, or ``nan`` when
        ``data_loader`` yields no batches.
    """
    model.train()
    # The loss module holds no per-batch state, so build it once per call.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    batch_count = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate a plain float so no autograd graph is kept alive.
        loss_sum += loss.item()
        batch_count += 1
    if batch_count == 0:
        return float("nan")
    return loss_sum / batch_count

def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per sample.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy_score(...), classification_report(...)),`` both
        computed over all collected labels and predictions.
    """
    model.eval()
    predicted = []
    expected = []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            scores = model(input_ids=ids, attention_mask=mask)
            # The index of the largest score in each row is the predicted class.
            winners = scores.max(dim=1).indices
            predicted += winners.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
# Hold out 20% of the (review, label) pairs for validation; the fixed
# random_state keeps the split identical from run to run.
(train_texts, val_texts, train_labels, val_labels) = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer for the same checkpoint the model is built from.
# `return_tensors` is a per-call argument (the dataset passes it whenever it
# tokenizes a sample), so it is not given to from_pretrained here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Each sample is padded / truncated to 512 tokens by the dataset.
train_dataset = RestaurantRecommendationDataset(train_texts, train_labels, tokenizer, max_length=512)
validation_dataset = RestaurantRecommendationDataset(val_texts, val_labels, tokenizer, max_length=512)

# Reshuffle the training samples on every pass so the model does not see the
# same batch composition and order each epoch. Validation order is irrelevant
# for the metrics, so it stays sequential.
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=10)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the classification head from the data: `dic` holds the unique
# restaurant names produced by factorize(), and the integer labels index into
# it. A fixed head size fails inside CrossEntropyLoss (target out of range) as
# soon as the dataset contains more restaurants than the head has outputs.
# NOTE(review): a checkpoint saved with a different head size will not load
# into this model.
num_classes = len(dic)
model = BERTREC(bert_model_name="bert-base-uncased", num_classes=num_classes).to(device)

In [None]:
# Fine-tuning all BERT weights needs a small step size (commonly 2e-5 to 5e-5);
# 2e-3 is about two orders of magnitude too large and typically makes the
# pretrained encoder diverge.
learning_rate = 2e-5
# NOTE(review): 10000 epochs is unusually many for fine-tuning, and the linear
# decay below is stretched across all of them - confirm this is intended.
num_epochs = 10000

# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
# weight_decay and eps are set explicitly to the defaults of that older
# implementation so only the learning rate changes.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.0,
    eps=1e-6,
)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Single location for the checkpoint that is both resumed from and written to.
CHECKPOINT_PATH = "model_weights.pth"

# Resume once, before training starts. Reloading inside the epoch loop would
# discard every epoch that was not followed by a save, and would crash on a
# fresh run where no checkpoint exists yet.
try:
    # map_location lets a checkpoint saved on another device load on this one.
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
    print(f"Resumed weights from {CHECKPOINT_PATH}")
except FileNotFoundError:
    print(f"No checkpoint at {CHECKPOINT_PATH}; starting from the pretrained weights")

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, validation_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    # Save every 10th epoch, and after the last one so the final epochs are
    # not lost when num_epochs - 1 is not a multiple of 10.
    if epoch % 10 == 0 or epoch == num_epochs - 1:
        torch.save(model.state_dict(), CHECKPOINT_PATH)