### Introduction

- Task:
	- Build a text classifier that predicts the content category of a Facebook post from its text.

- Language Model Used:
	- RoBERTa was proposed in *RoBERTa: A Robustly Optimized BERT Pretraining Approach* by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. It builds on Google’s BERT model, released in 2018.
	- [Blog-Post](https://ai.facebook.com/blog/roberta-an-optimized-method-for-pretraining-self-supervised-nlp-systems/)
	- [Research Paper](https://arxiv.org/pdf/1907.11692)
	- [Documentation for Python](https://huggingface.co/transformers/model_doc/roberta.html)




In [None]:
# Install Hugging Face transformers, which provides the RobertaModel / RobertaTokenizer imported below.
!pip install transformers
# NOTE(review): the install is unpinned; the leftover "==3.0.2" suggests an earlier pin — confirm the intended version.

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Mount Google Drive (Colab only) and change into the notebook folder, so that the
# relative file names used by later cells (e.g. "fb.csv") resolve against it.
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
%cd '/content/drive/My Drive/Colab Notebooks'

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks


In [None]:
# Load the labelled posts. "Unnamed: 0" is presumably a row-index column written by an earlier to_csv — TODO confirm.
fb = pd.read_csv("fb.csv").drop(columns = "Unnamed: 0")

In [None]:
# Preview the first rows of the labelled data.
fb.head()

Unnamed: 0,postId,message,Appreciation,Complaint,Feedback,label,label_encode
0,126016648090_10150802142013091,Great ! ;),1,0,0,Appreciation,0
1,108381603303_10151136215833304,YUM! YUM!,1,0,0,Appreciation,0
2,108381603303_3913438087739,Yummm :)),1,0,0,Appreciation,0
3,110455108974424_343049739048292,sweet,1,0,0,Appreciation,0
4,110455108974424_350358541650745,nice,1,0,0,Appreciation,0


In [None]:
# Load the posts to be predicted. NOTE(review): in the preview below label_encode runs 0, 1, 2, ... so it
# looks like a per-row id rather than a class label — confirm.
un = pd.read_csv("fb_un.csv")

In [None]:
# Preview the first rows of the unlabeled data.
un.head()

Unnamed: 0,postId,message,label_encode
0,108381603303_10151119973393304,Love. It. To,0
1,115568331790246_371841206162956,NICE,1
2,115568331790246_515044031842672,Congrats,2
3,147285781446_10151010892176447,Awesome!,3
4,159616034235_10150639103634236,Award,4


In [None]:
# Keep only the post text and its encoded label, renamed to the Phrase / Sentiment
# column names that the augmentation and dataset code expects.
new_df = (
    fb.loc[:, ['message', 'label_encode']]
      .rename(columns={"message": "Phrase", "label_encode": "Sentiment"})
)

### Data Augmentation

In [None]:
# textattack supplies the augmenters (EmbeddingAugmenter / WordNetAugmenter) imported in the next cell.
!pip install textattack

In [None]:
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter
from sklearn.utils import shuffle

def augment_text(df, label_encode, random_state=None):
  """Append one augmented copy of every row of one class, then shuffle.

  Args:
    df: DataFrame with a text column 'Phrase' and a label column 'Sentiment'.
    label_encode: Value of 'Sentiment' whose rows are augmented.
    random_state: Optional seed forwarded to sklearn.utils.shuffle. The
      default (None) keeps the previous unseeded behaviour.

  Returns:
    A new DataFrame containing the rows of `df` plus the augmented rows.
    The index is reset before shuffling, so the returned index is a
    permutation of 0..n-1.
  """
  augmenter = EmbeddingAugmenter()
  # Alternative augmenter: WordNetAugmenter()

  # Only rows of the requested class are augmented.
  phrases = df.loc[df.Sentiment == label_encode, 'Phrase']

  new_text = []
  for text in tqdm(phrases):
    # Skip non-string cells (e.g. NaN), which have no len(), and posts of
    # 1000 characters or more, as the original length cut-off did.
    if isinstance(text, str) and len(text) < 1000:
      candidates = augmenter.augment(text)
      # Keep the first candidate; guard against an empty result list.
      if candidates:
        new_text.append(candidates[0])

  new = pd.DataFrame({'Phrase': new_text, 'Sentiment': label_encode})
  # DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
  # is the supported replacement.
  combined = pd.concat([df, new]).reset_index(drop=True)
  return shuffle(combined, random_state=random_state)

In [None]:
# Augment class 2 only. The logged run below took about 46 minutes for 1644 rows, so this cell is expensive to re-run.
aug_df_em = augment_text(new_df, 2)

textattack: Downloading https://textattack.s3.amazonaws.com/word_embeddings/paragramcf.
100%|██████████| 481M/481M [00:39<00:00, 12.2MB/s]
textattack: Unzipping file /root/.cache/textattack/tmpsm4_ur3k.zip to /root/.cache/textattack/word_embeddings/paragramcf.
textattack: Successfully saved word_embeddings/paragramcf to cache.
100%|██████████| 1644/1644 [46:08<00:00,  1.68s/it]
  df=shuffle(df.append(new).reset_index(drop=True))


In [None]:
# Use the embedding-augmented frame as the dataset for the train/test split.
aug_df = aug_df_em

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the class label, with a fixed seed.
# NOTE(review): aug_df already contains the augmented rows, so an original post and
# its augmented variant can land on opposite sides of the split — the validation
# accuracy may be optimistic. Consider splitting before augmenting.
stratify_cols = aug_df.Sentiment

train_data, test_data = train_test_split(
    aug_df, test_size=0.2, random_state=42, stratify=stratify_cols
)

In [None]:
# Give the unlabeled posts the same Phrase / Sentiment column names as the training
# frame so they can go through the same dataset class.
new_un = (
    un.loc[:, ['message', 'label_encode']]
      .rename(columns={"message": "Phrase", "label_encode": "Sentiment"})
)

<a id='section03'></a>
### Preparing the Dataset and Dataloader

In [None]:
# Hyper-parameters shared by the dataset, data-loader and optimizer cells below.
MAX_LEN = 256            # max_length handed to the tokenizer for every example
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS is set in the training-loop cell.
LEARNING_RATE = 1e-05
# The former `truncation=True` / `do_lower_case=True` arguments were dropped:
# truncation is an encode-time option (the run log still warned that truncation
# was not activated), and the RoBERTa BPE tokenizer is cased and has no
# lower-casing switch.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
class SentimentData(Dataset):
    """Map-style dataset that tokenizes the 'Phrase' column of a DataFrame on access.

    Args:
        dataframe: Frame with a text column 'Phrase' and a numeric column 'Sentiment'.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every example is padded or truncated to.

    Each item is a dict with 'ids', 'mask' and 'token_type_ids' (long tensors
    built from the tokenizer output) and 'targets' (a float scalar tensor
    holding the row's 'Sentiment' value).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional lookup: samplers yield 0..len-1, which matches the frame's
        # index labels only if the caller happened to reset the index.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated in favour of `padding`.
            padding='max_length',
            # Without explicit truncation, posts longer than max_len would
            # produce tensors of unequal length that cannot be batched.
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Renumber the rows of each split 0..n-1, report the split sizes, and wrap each
# split in a SentimentData dataset.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

TRAIN Dataset: (15362, 2)
TEST Dataset: (3841, 2)


In [None]:
# DataLoader settings: both loaders shuffle and load in the main process
# (num_workers=0); only the batch size differs.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Wrap the unlabeled posts. The 'Sentiment' slot of new_un carries un's label_encode values,
# which look like row ids in the preview above — TODO confirm.
un_set = SentimentData(new_un, tokenizer, MAX_LEN)

In [None]:
# Reuses test_params (batch size VALID_BATCH_SIZE, shuffle=True); each row's label_encode travels with it
# in 'targets', and the prediction cell merges on that value.
un_loader = DataLoader(un_set, **test_params)

<a id='section04'></a>
### Creating the Neural Network for Fine Tuning

In [None]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder followed by a small feed-forward classification head.

    Args:
        num_classes: Number of output logits. Defaults to 5, the previously
            hard-coded value.
            NOTE(review): the submission cell only maps classes 0-2 — confirm
            that 5 is the intended label count.
        dropout: Dropout probability used in the head. Defaults to 0.3, the
            previously hard-coded value.
    """

    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # Read the width from the encoder config instead of hard-coding 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalised class logits for a batch of encoded inputs."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Use the hidden state at position 0 as the sequence representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU avoids building a new ReLU module on every call.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

In [None]:
# Instantiate the classifier and move its weights to the selected device.
model = RobertaClass()
# Left as the cell's last expression, so the notebook also displays the module summary.
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

<a id='section05'></a>
### Fine Tuning the Model

In [None]:
# Objective and optimizer shared by train() and valid(): cross-entropy on the
# class logits, and Adam over every model parameter at LEARNING_RATE.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    """Return how many positions of `preds` equal `targets`.

    Despite the name, the result is a count of matches, not a ratio; callers
    divide by the number of examples themselves.
    """
    matches = preds == targets
    return torch.count_nonzero(matches).item()

In [None]:
# Training function: one pass over the training split (80% of the data) to fine-tune the RoBERTa model
def train(epoch):
    """Run one training epoch over `training_loader` and print loss and accuracy.

    Relies on the notebook-level `model`, `optimizer`, `loss_function`,
    `device` and `training_loader`.

    Args:
        epoch: Epoch index, used only in the printed summary.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not enumerate) so tqdm knows the batch count
    # and can show a real progress bar.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # Targets are stored as float by the dataset; the loss needs class indices.
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # detach() replaces the legacy `.data` access; the predicted class is
        # the index of the largest logit.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of passes over the training split; the value is also embedded in the
# submission and model file names written at the end of the notebook.
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1it [00:03,  3.04s/it]

Training Loss per 5000 steps: 1.6410553455352783
Training Accuracy per 5000 steps: 12.5


1921it [11:05,  2.89it/s]

The Total Accuracy for Epoch 0: 87.7294623095951
Training Loss Epoch: 0.34700683149170247
Training Accuracy Epoch: 87.7294623095951





In [None]:
#Validating model on 20% data
def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` and return the accuracy in percent.

    Relies on the notebook-level `device` and `loss_function`.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` that
            returns per-class logits.
        testing_loader: Iterable of batches with the keys 'ids', 'mask',
            'token_type_ids' and 'targets'.

    Returns:
        Accuracy over all batches, as a percentage.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            # No .squeeze() here: it would collapse a final batch of size 1
            # to shape (num_classes,), which breaks both the loss call and
            # torch.max(dim=1). With 3841 test rows and a batch size of 4,
            # the last batch has exactly one row.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % 5000 == 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                # The message now matches the 5000-step reporting interval.
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

# Evaluate the fine-tuned model on the held-out split and keep the accuracy (in percent).
acc = valid(model, testing_loader)

### Prediction on unlabeled data

In [None]:
# Predict a class for every post in the unlabeled set.
num = []  # label_encode values from `un`, used as the merge key further down
ans = []  # predicted class index for each of those rows

# Make inference independent of earlier cells: disable dropout explicitly and
# skip building the autograd graph.
model.eval()
with torch.no_grad():
    for data in tqdm(un_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        # No .squeeze(): a final batch of size 1 would otherwise collapse to
        # shape (num_classes,) and make torch.max(dim=1) fail.
        outputs = model(ids, mask, token_type_ids)
        big_val, big_idx = torch.max(outputs, dim=1)
        num += targets.flatten().tolist()
        ans += big_idx.tolist()

# Attach each prediction to its source row via the label_encode key.
un_ans = pd.DataFrame({"label_encode": num, "ans": ans})
sub = un.merge(un_ans, how="left", on="label_encode")

# One indicator column per class: 1 where the predicted class matches, else 0
# (rows without a prediction stay 0 in every column).
for class_idx, column in enumerate(["Appreciation_pred", "Complaint_pred", "Feedback_pred"]):
    sub[column] = (sub["ans"] == class_idx).astype("int64")

# Drop the helper columns and write the submission file.
sub = sub.drop(columns=["message", "label_encode", "ans"])

sub.to_csv("submission_Lan_aug"+str(EPOCHS)+".csv", index=False)

# Persist the fine-tuned model and the tokenizer vocabulary to the current directory.
output_model_file = 'pytorch_roberta_sentiment_aug'+str(EPOCHS)+'.bin'
output_vocab_file = './'

# NOTE(review): torch.save on the module object pickles the whole model, so loading it later
# needs the RobertaClass definition to be importable; saving model.state_dict() is the more portable option.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)