# XLM-RoBERTa
*Time for run all (GPU): ~4 hours*

## 1. General Settings and Import Libraries

In [1]:
import pprint
import numpy as np
import pandas as pd

from sklearn import preprocessing, metrics

import torch
from torch import Tensor
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torchtext
import torchtext.transforms as T
import torchtext.functional as F
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

import pickle

import nltk
from nltk.corpus import stopwords

## 2. Data Preparation 

In [2]:
df = pd.read_csv("./datasettt.csv")
# columns_to_remove = ['Unnamed: 2','Unnamed: 3','Unnamed: 4']
# df = df.drop(columns=columns_to_remove)

In [3]:
df

Unnamed: 0,Comment,Label
0,Love you sir!!,2
1,"Thank you very much, u really got me in the fi...",2
2,Another great explanation by Abdul sir. Thank ...,2
3,I had no idea what was going on in the first o...,2
4,Thankkk youuuuu soooo sooo much sir,2
...,...,...
2522,"Practise, practise, practise, I couldn't agree...",0
2523,Need a small hep from you. Have my GRE in 3 da...,0
2524,I failed at last question ðŸ˜¢,0
2525,Thatâ€™s true tht happen to me the first thing...,0


In [4]:
# lowercasing
df['Comment'] = df['Comment'].str.lower()

# removing urls
df['Comment'] = df['Comment'].str.replace('http\S+|www.\S+', '', case=False)

# removing commas "\n"
df['Comment'] = df['Comment'].replace('\n','', regex=True)

# removing all the punctuations
df['Comment'] = df['Comment'].str.replace('[^\w\s]','')

# removing integers
#df['Comment'] = df['Comment'].replace('\d','', regex=True)

# removing emojis
#df['Comment'] = df['Comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
df

Unnamed: 0,Comment,Label
0,love you sir!!,2
1,"thank you very much, u really got me in the fi...",2
2,another great explanation by abdul sir. thank ...,2
3,i had no idea what was going on in the first o...,2
4,thankkk youuuuu soooo sooo much sir,2
...,...,...
2522,"practise, practise, practise, i couldn't agree...",0
2523,need a small hep from you. have my gre in 3 da...,0
2524,i failed at last question ðÿ˜¢,0
2525,thatâ€™s true tht happen to me the first thing...,0


In [5]:
from autocorrect import Speller

spell = Speller(lang='en')

def typo_corrector(text):
    return spell(text)

df['Comment'] = df['Comment'].apply(typo_corrector)

In [6]:
# stemming
import nltk
stemmer = nltk.stem.SnowballStemmer('english')
def stem_text(text):
    return stemmer.stem(text)

df['Comment'] = df['Comment'].apply(stem_text)


# lemmatizing
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return lemmatizer.lemmatize(text)

df['Comment'] = df['Comment'].apply(lemmatize_text)

In [7]:
# Removing stopwords
# nltk.download('stopwords')

stop = stopwords.words('english')

# These are the words that should not be removed from their category
negative_words = ['no','not']
neutral_words = ['how','what','which','who','whom','why','do','does','is','are','was','were','will','am',
                      'are','could','would','should','can','did','does','do','had','have']

for_negative_category = stop.copy()
for word in negative_words:
    if word in for_negative_category:
        for_negative_category.remove(word)
    
for_neutral_category = stop.copy()
for word in neutral_words:
    if word in for_neutral_category:
        for_neutral_category.remove(word)

# For negative category
for i in range(len(df)):
    if df["Label"][i] == "negative":
        df["Comment"][i] = ' '.join([word for word in df["Comment"][i].split() if word not in for_negative_category])

# For neutral category
for i in range(len(df)):
    if df["Label"][i] == "neutral":
        df["Comment"][i] = ' '.join([word for word in df["Comment"][i].split() if word not in for_neutral_category])
        
# For positive category
for i in range(len(df)):
    if df["Label"][i] == "positive":
        df["Comment"][i] = ' '.join([word for word in df["Comment"][i].split() if word not in stop])


In [8]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
train_df

Unnamed: 0,Comment,Label
2266,i'm getting a 'nameerror: name 'shuffle' is no...,0
983,wonderful video ðÿ'ðÿ'ðÿ',2
194,best lecture i have ever seen,2
189,that is very very very helpful...thank you sir,2
715,i have never learned programming formally and ...,2
...,...,...
1033,i bought both of your courses on dmy. you are ...,1
1731,"hi i am using windows 10, 64 bit os & i am get...",1
763,hello from india ðÿ‡®ðÿ‡³. loved your channel!...,2
835,i have run my first program success,2


In [10]:
test_df

Unnamed: 0,Comment,Label
478,thanks alice. really nice tutorial.,2
1830,hey i installed the jk file on my windows 10 a...,1
1804,can anyone tell me what was the purpose of thi...,1
997,very helpful video,2
1821,why would we learn this? anybody still learn e...,1
...,...,...
1850,hi i have problem on w7 it doesnt show this hu...,1
700,thank you for creating a free structured progr...,2
1902,thanks for this wonderful presentation have ca...,1
1873,"my system is 32 bits, can i still use 64 bits ...",1


In [11]:
label_encoder = preprocessing.LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])

In [12]:
test_df['Label'] = label_encoder.transform(test_df['Label'])

In [13]:
class PadTransform(torch.nn.Module):
    """Pad tensor to a fixed length with given padding value.
    :param max_length: Maximum length to pad to
    :type max_length: int
    :param pad_value: Value to pad the tensor with
    :type pad_value: bool
    """

    def __init__(self, max_length: int, pad_value: int) -> None:
        super().__init__()
        self.max_length = max_length
        self.pad_value = float(pad_value)
        
    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: The tensor to pad
        :type x: Tensor
        :return: Tensor padded up to max_length with pad_value
        :rtype: Tensor
        """
        max_encoded_length = x.size(-1)
        if max_encoded_length < self.max_length:
            pad_amount = self.max_length - max_encoded_length
            x = torch.nn.functional.pad(x, (0, pad_amount), value=self.pad_value)
        return x

In [14]:
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256

In [15]:
text_transform = torchtext.models.XLMR_LARGE_ENCODER.transform()

In [16]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(text_transform(self.df.iloc[idx, 0])),
            torch.tensor(self.df.iloc[idx, 1])
        )

In [17]:
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

In [18]:
SMALL_BATCH_SIZE = 16
K = 2
LARGE_BATCH_SIZE = K * SMALL_BATCH_SIZE

In [19]:
def batch_collate_fn(batch):
    inp_list = list()
    tar_list = list()
    
    for sample in batch:
        inp_list.append(sample[0].tolist())
        tar_list.append(sample[1])
        
    padded_tensor = F.to_tensor(inp_list, padding_value=padding_idx)
    target_tensor = torch.stack(tar_list).type(torch.LongTensor)
    
    return padded_tensor, target_tensor

In [20]:
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=SMALL_BATCH_SIZE,
    shuffle=True,
    collate_fn=batch_collate_fn
)

In [21]:
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=SMALL_BATCH_SIZE,
    shuffle=False,
    collate_fn=batch_collate_fn
)

In [22]:
num_classes = 3
input_dim = 768

In [23]:
classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)

In [24]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [25]:
model.to(DEVICE)
pass  

In [26]:
learning_rate = 1.2e-5 
optim = AdamW(model.parameters(), lr=learning_rate)
criteria = torch.nn.CrossEntropyLoss()

## 3. Model Training and Evaluation

In [27]:
def train_step(input, target, small_batch_no):
    output = model(input)
    loss = criteria(output, target)
    loss.backward()
    
    if (small_batch_no + 1) % K == 0 or (small_batch_no + 1) == len(train_dataloader):
        optim.step()
        optim.zero_grad()
    
    return loss.item() / input.size(dim=0)

def evaluate():
    model.eval()
    total_loss = 0
    counter = 0
    
    with torch.no_grad():
        for i, batch in enumerate(test_dataloader):
            input = batch[0].clone().detach().to(DEVICE)
            output = model(input)
            target = batch[1].clone().detach().to(DEVICE)
            
            if i == 0:
                class_output = torch.argmax(output, dim=1)
                class_target = target
            else:
                class_output = torch.cat([class_output, torch.argmax(output, dim=1)])
                class_target = torch.cat([class_target, target])
            
            loss = criteria(output, target).item()
            total_loss += loss
            counter += input.size(dim=0)
            
            
        confusion_matrix = metrics.confusion_matrix(
            class_target.cpu().numpy().flatten(),
            class_output.cpu().numpy().flatten(),
            labels=[0, 1, 2]
        ) 
        classification_report = metrics.classification_report(
            class_target.cpu().numpy().flatten(),
            class_output.cpu().numpy().flatten(),
            labels=[0, 1, 2],
            output_dict=True
        )

    return (
        total_loss,
        counter,
        confusion_matrix,
        classification_report
    )

In [28]:
num_epochs = 20

In [29]:
def save_log(epoch, **kwargs):
    with open(f'log_{epoch}.pkl', 'wb') as f:
        pickle.dump(kwargs, f)

def save_model(message):
    torch.save(model.state_dict(), f'model_{message}.pth')
    print('Model saved successfully')

In [30]:
max_accuracy = float('-inf')
max_macro_f1 = float('-inf')
max_weighted_f1 = float('-inf')

In [None]:
for epoch in range(num_epochs):
    model.train()
    avg_training_losses = list()
    
    for small_batch_no, small_batch in enumerate(train_dataloader):
        input = small_batch[0].clone().detach().to(DEVICE)
        target = small_batch[1].clone().detach().to(DEVICE)
        avg_training_losses.append(
            train_step(input, target, small_batch_no)
        )
        torch.cuda.empty_cache()

    total_loss, counter, confusion_matrix, classification_report = evaluate()
    print(f'EPOCH {epoch}')
    print(f'Mean of avg_training_losses={np.mean(avg_training_losses)}')
    print(f'total_loss={total_loss}')
    print(f'counter={counter}')
    print(f'loss=total_loss/counter={total_loss/counter}')
    print(f'confusion_matrix=\n{confusion_matrix}')
    print('classification_report=')
    pprint.pprint(classification_report)
    
    save_log(
        epoch,
        avg_training_losses=avg_training_losses,
        total_loss=total_loss,
        counter=counter,
        loss=total_loss/counter,
        confusion_matrix=confusion_matrix,
        classification_report=classification_report
    )
    
    if classification_report['accuracy'] > max_accuracy:
        print(f'New max_accuracy')
        max_accuracy = classification_report['accuracy']
        max_accuracy_index = epoch
        save_model('max_accuracy')
        
    elif classification_report['macro avg']['f1-score'] > max_macro_f1:
        print(f'New max_macro_f1')
        max_macro_f1 = classification_report['macro avg']['f1-score']
        max_macro_f1_index = epoch
        save_model('max_macro_f1')
    
    elif classification_report['weighted avg']['f1-score'] > max_weighted_f1:
        print(f'New max_weighted_f1')
        max_weighted_f1 = classification_report['weighted avg']['f1-score']
        max_weighted_f1_index = epoch
        save_model('max_weighted_f1')
    
    elif epoch == num_epochs - 1:
        save_model(f'{epoch}_last')
    
    elif epoch % 40 == 0:
        save_model(f'{epoch}_checkpoint')
        
    else:
        continue

save_log(
    'post_train_info',
    max_accuracy=max_accuracy,
    max_accuracy_index=max_accuracy_index,
    max_macro_f1=max_macro_f1,
    max_macro_f1_index=max_macro_f1_index,
    max_weighted_f1=max_weighted_f1,
    max_weighted_f1_index=max_weighted_f1_index
)

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


EPOCH 0
Mean of avg_training_losses=0.06632268078566536
total_loss=29.28060233592987
counter=506
loss=total_loss/counter=0.057866803035434525
confusion_matrix=
[[  0  50  47]
 [  0 141  61]
 [  0   3 204]]
classification_report=
{'0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 97},
 '1': {'f1-score': 0.712121212121212,
       'precision': 0.7268041237113402,
       'recall': 0.698019801980198,
       'support': 202},
 '2': {'f1-score': 0.7861271676300579,
       'precision': 0.6538461538461539,
       'recall': 0.9855072463768116,
       'support': 207},
 'accuracy': 0.6818181818181818,
 'macro avg': {'f1-score': 0.4994161265837566,
               'precision': 0.4602167591858313,
               'recall': 0.5611756827856699,
               'support': 506},
 'weighted avg': {'f1-score': 0.6058830208456656,
                  'precision': 0.5576296182526573,
                  'recall': 0.6818181818181818,
                  'support': 506}}
New max_accuracy
Model saved su

*Maximum recorded accuracy = 75.3%*

### 4. Demonstration

In [None]:
def prepare_model():
    num_classes = 3
    input_dim = 768

    classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
    model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
    
    DEMO_MODEL_PATH = 'model_max_weighted_f1.pth'
    model.load_state_dict(torch.load(DEMO_MODEL_PATH))
    model.to(DEVICE)
    
    print(f'Loaded model to [{DEVICE}] in [{DEMO_MODEL_PATH}]')
    return model

In [None]:
def prepare_text_transform():
    text_transform = torchtext.models.XLMR_LARGE_ENCODER.transform()
    return text_transform

In [None]:
def predict(sentence, model, text_transform, label_map):
    transformed_text = text_transform(sentence)
    out = model(torch.tensor([transformed_text]).to(DEVICE))
    return label_map[torch.argmax(out).item()]

In [None]:
label_map = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

In [None]:
model = prepare_model()
text_transform = prepare_text_transform()

In [None]:
sample_text = 'dumb ass'
predict(sample_text, model, text_transform, label_map)