## Imports

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the TT helper module so that it can be imported afterwards.
# NOTE(review): this downloads Python source from a moving branch ("main") and the
# notebook then imports (i.e. executes) it. Pin the URL to a commit hash if
# reproducibility or supply-chain safety matters.
# Import the submodule explicitly: a bare `import urllib` does not guarantee that
# `urllib.request` has been loaded.
import urllib.request

url = "https://raw.githubusercontent.com/Jwizzed/ml-journey/main/TT.py"
urllib.request.urlretrieve(url, "TT.py")

In [None]:
import TT

In [None]:
import huggingface_hub
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably prompts interactively for an access token; nothing in the
# visible cells obviously needs authentication — confirm whether this is still required.
huggingface_hub.login()

In [None]:
import torch

# Pick the best available accelerator: CUDA first, then Apple-silicon MPS, and
# finally the CPU. The `getattr` guard keeps this working on PyTorch builds that
# do not ship the `torch.backends.mps` module.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Smoke test: create a small random tensor and move it to the selected device.
x = torch.rand(size=(3, 4)).to(device)
x

## Get data

In [None]:
# Download the `nelgiriyewithana/mcdonalds-store-reviews` dataset with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d nelgiriyewithana/mcdonalds-store-reviews

In [None]:
# List the working directory to check what the download produced.
!ls

In [None]:
# Extract the downloaded archive with the TT helper module.
# NOTE(review): `delete_original=True` presumably removes the zip afterwards —
# verify against the implementation of TT.unzip.
TT.unzip("mcdonalds-store-reviews.zip", delete_original=True)

In [None]:
# Load a subsample of the CSV: the `skiprows` callable skips every line whose index
# is not a multiple of 50, so line 0 (the header) and every 50th line after it are
# kept. `encoding_errors="ignore"` drops undecodable bytes instead of raising.
orig_df = pd.read_csv("McDonald_s_Reviews.csv", encoding_errors="ignore", skiprows=lambda i: i % 50 != 0)
# Work on a copy so the frame as loaded stays available in `orig_df`.
df = orig_df.copy()
df.head()

In [None]:
# Distinct values of the `category` column.
df.category.unique()

In [None]:
# Distinct values of the `store_name` column.
df.store_name.unique()

In [None]:
# Last rows of the two rating-related columns, side by side.
df[["rating_count", "rating"]].tail()

In [None]:
# Keep only the review text and its rating; every other column is dropped from `df`
# (the full frame is still available as `orig_df`).
df = df[["review", "rating"]]
df

## Data Preprocessing

In [None]:
# Missing-value count per column, shown together with the frame's shape.
df.isna().sum(), df.shape

In [None]:
# Drop every row that contains a missing value (in place), then show the new shape.
df.dropna(inplace=True)
df.shape

In [None]:
# Remove rows with a repeated review text, keeping the last occurrence of each.
df.drop_duplicates("review", keep="last", inplace=True)
# Rename the columns to `text` / `labels`, the names the later cells read.
df.rename(columns={"rating":"labels", "review":"text"}, inplace=True)
df.head()

In [None]:
# Distinct label values before they are encoded as integers.
df.labels.unique()

In [None]:
# Encode the rating strings as integer class ids 0-4.
# Values that are not keys of the mapping (e.g. ids already encoded by a previous
# run of this cell) are passed through unchanged, and `astype("int64")` then
# guarantees an integer column, failing loudly on any unexpected rating string
# instead of leaving a mixed-type column behind.
rating_to_id = {"1 star": 0, "2 stars": 1, "3 stars": 2, "4 stars": 3, "5 stars": 4}
# `assign` builds a new frame, which avoids chained-assignment warnings on a frame
# that was itself derived from a column selection.
df = df.assign(
    labels=df["labels"].map(lambda rating: rating_to_id.get(rating, rating)).astype("int64")
)

# Keep only reviews made up entirely of ASCII characters. `na=True` treats
# non-string entries as "has non-ASCII", so they are dropped as well, which matches
# the previous `== False` comparison without comparing a mask against a literal.
has_non_ascii = df["text"].str.contains(r'[^\x00-\x7F]+', na=True)
df = df.loc[~has_non_ascii]
df.head()

In [None]:
import datasets

In [None]:
# NOTE(review): these two values are not referenced by any other visible cell
# (tokenization uses MAX_LEN, the loaders use TRAIN_/VALID_BATCH_SIZE); they are
# kept so that anything outside this view that reads them keeps working.
max_length = 128
batch_size = 32

# `preserve_index=False` stops the pandas index from being stored as an
# "__index_level_0__" column, so no follow-up `remove_columns` call is needed.
# That call fails when the column is absent, e.g. if the frame's index is still a
# plain RangeIndex.
dataset = datasets.Dataset.from_pandas(df, preserve_index=False)
# 80/20 train/test split with a fixed seed for reproducibility.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

In [None]:
# Mapping between human-readable ratings and integer class ids.
label2id = {"1 star": 0, "2 stars": 1, "3 stars": 2, "4 stars": 3, "5 stars": 4}
# Derive the inverse table from `label2id` so the two cannot drift apart
# (class id 4 must map to "5 stars", not to a second "2 stars" entry).
id2label = {class_id: rating for rating, class_id in label2id.items()}

In [None]:
# Fine-tuning hyper-parameters.
MAX_LEN = 256  # maximum token length handed to the ReviewData datasets
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5  # assigned the same value again right before the training loop
LEARNING_RATE = 1e-05
# NOTE(review): `truncation` is normally an encode-time argument and `do_lower_case`
# a BERT-style option; both look like they have no effect when passed to
# `RobertaTokenizer.from_pretrained` — confirm and consider dropping them.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [None]:
class ReviewData(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        dataframe: Any object whose ``["text"]`` and ``["labels"]`` entries can be
            indexed by integer position and support ``len`` (the notebook passes
            Hugging Face dataset splits).
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every encoded sequence is padded or truncated to.

    Each item is a dict with the keys ``ids``, ``mask``, ``token_type_ids`` and
    ``targets``; all values are tensors already moved to the global ``device``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe["text"]
        self.targets = dataframe["labels"]
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the review at ``index`` and return its tensors."""
        # Collapse every run of whitespace (including newlines) to a single space.
        text = " ".join(str(self.text[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit strategies replace the deprecated `pad_to_max_length=True`
            # and the implicit truncation fallback, so every item has exactly
            # `max_len` tokens and can be stacked into a batch.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long).to(device),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long).to(device),
            # Kept as float for compatibility; the train/valid loops cast the
            # targets to long before computing the loss.
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device),
        }


In [None]:
# Wrap both splits in the tokenizing dataset, then pull the first training item
# so its structure can be inspected in the cell output.
training_set, testing_set = (
    ReviewData(dataset[split], tokenizer, MAX_LEN) for split in ("train", "test")
)
batch = next(iter(training_set))
batch

In [None]:
# Keyword arguments for the two loaders; both shuffle and both load in the main
# process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# Batch iterators over the training and testing datasets.
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Create Model

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained ``roberta-base`` encoder with a small head producing 5 logits.

    The head is Linear(768 -> 768), ReLU, Dropout(0.3), Linear(768 -> 5), applied
    to the encoder's first output at sequence position 0.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalized class scores for a batch of encoded reviews."""
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, sliced at position 0 along dim 1
        # (presumably the leading special token of each sequence).
        first_token = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [None]:
# Instantiate the classifier.
model = RobertaClass()
# Move the model to the selected device; as the last expression of the cell this
# also displays the module structure.
model.to(device)

## Fine Tuning

In [None]:
# Adam over every model parameter, paired with a cross-entropy objective on the
# class scores.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
def calcuate_accuracy(preds, targets):
    """Count the positions at which ``preds`` equals ``targets``.

    Despite the name, the result is a raw number of matches, not a ratio;
    callers divide by the number of examples themselves.
    """
    matches = preds == targets
    return matches.sum().item()

In [None]:
def train(epoch, log_every=5000):
    """Run one training epoch over ``training_loader`` and print its metrics.

    Uses the module-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: Epoch number, used only in the printed summary.
        log_every: Print the running loss/accuracy every this many steps
            (step 0 included).

    Returns:
        None. All metrics are printed.
    """
    tr_loss = 0.0  # Accumulated training loss for the current epoch.
    n_correct = 0  # Number of correct predictions during the epoch.
    nb_tr_steps = 0  # Number of batches processed in the current epoch.
    nb_tr_examples = 0  # Number of training examples processed in the current epoch.
    model.train()
    # Wrap the loader itself (not the `enumerate` object) so tqdm can read its
    # length and show a total and an ETA.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest score; detached because the
        # accuracy bookkeeping must not take part in autograd.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % log_every == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per {log_every} steps: {loss_step}")
            print(f"Training Accuracy per {log_every} steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        # An empty loader would otherwise end in a ZeroDivisionError below.
        print(f"Epoch {epoch}: training_loader produced no batches; nothing was trained.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

In [None]:
# Fine-tune for EPOCHS passes over the training data; `train` prints the running
# and per-epoch loss/accuracy itself.
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def valid(model, testing_loader, log_every=5000):
    """Evaluate ``model`` on ``testing_loader`` and return its accuracy in percent.

    Uses the module-level ``loss_function`` and ``device``. Runs in eval mode with
    gradient tracking disabled.

    Args:
        model: Callable as ``model(ids, mask, token_type_ids)``, returning one
            score per class for every example.
        testing_loader: Iterable of batches, each a dict with the keys ``ids``,
            ``mask``, ``token_type_ids`` and ``targets``.
        log_every: Print the running loss/accuracy every this many steps
            (step 0 included).

    Returns:
        Accuracy over all evaluated examples as a percentage, or ``nan`` when the
        loader yields no batches.
    """
    model.eval()
    n_correct = 0  # Number of correct predictions.
    tr_loss = 0.0  # Accumulated loss over all batches.
    nb_tr_steps = 0  # Number of batches processed.
    nb_tr_examples = 0  # Number of examples processed.
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length and show a total.
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            # No `.squeeze()` here: it would collapse a final batch of size 1 to
            # shape (num_classes,), which breaks both the loss (targets still have
            # shape (1,)) and the argmax over dim=1.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % log_every == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                # The message now states the real logging interval.
                print(f"Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    if nb_tr_steps == 0:
        # An empty loader would otherwise end in a ZeroDivisionError below.
        print("Validation skipped: testing_loader produced no batches.")
        return float("nan")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
# Score the fine-tuned model on the held-out split and report the percentage.
acc = valid(model, testing_loader)
summary = "Accuracy on test data = %0.2f%%" % acc
print(summary)