<a href="https://colab.research.google.com/github/Hesamalian/RoBERTa-classification/blob/master/Text_Classification_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
## PyTorch Transformer
!pip install pytorch_transformers
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

Collecting pytorch_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/89/ad0d6bb932d0a51793eaabcf1617a36ff530dc9ab9e38f765a35dc293306/pytorch_transformers-1.1.0-py3-none-any.whl (158kB)
[K     |██                              | 10kB 23.7MB/s eta 0:00:01[K     |████▏                           | 20kB 3.0MB/s eta 0:00:01[K     |██████▏                         | 30kB 4.3MB/s eta 0:00:01[K     |████████▎                       | 40kB 2.9MB/s eta 0:00:01[K     |██████████▍                     | 51kB 3.6MB/s eta 0:00:01[K     |████████████▍                   | 61kB 4.3MB/s eta 0:00:01[K     |██████████████▌                 | 71kB 4.9MB/s eta 0:00:01[K     |████████████████▋               | 81kB 5.5MB/s eta 0:00:01[K     |██████████████████▋             | 92kB 6.2MB/s eta 0:00:01[K     |████████████████████▊           | 102kB 4.8MB/s eta 0:00:01[K     |██████████████████████▉         | 112kB 4.8MB/s eta 0:00:01[K     |████████████████████████▉   

In [4]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [0]:
## Install PyTorch-Transformer

In [0]:
#!pip install -U pytorch-transformers

In [0]:
## Importing Datasets

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [10]:
!ls drive/'My Drive'/2017-06-custom-intent-engines

AddToPlaylist	GetWeather  RateBook   SearchCreativeWork
BookRestaurant	PlayMusic   README.md  SearchScreeningEvent


In [0]:
dataset_path = "drive/My Drive/2017-06-custom-intent-engines/"

In [12]:
dataset = pd.DataFrame(columns = ['utterance', 'label'])
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open(dataset_path + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
    texts = []
    for i in range(len(data[intent])):
        text = ''
        for j in range(len(data[intent][i]['data'])):
            text += data[intent][i]['data'][j]['text']
        dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)

Class: AddToPlaylist, # utterances: 300
Class: BookRestaurant, # utterances: 300
Class: GetWeather, # utterances: 300
Class: PlayMusic, # utterances: 300
Class: RateBook, # utterances: 300
Class: SearchCreativeWork, # utterances: 300
Class: SearchScreeningEvent, # utterances: 300


In [13]:
label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}

In [0]:
## Loading RoBERTa classes

In [14]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

100%|██████████| 473/473 [00:00<00:00, 138560.26B/s]


{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [15]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

100%|██████████| 898748/898748 [00:00<00:00, 2737603.36B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1460000.39B/s]


In [0]:
## Feature Preparation

In [0]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [17]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  2387,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [0]:
## Dataset Loader Classes

In [0]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [21]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2100, 2)
TRAIN Dataset: (1680, 2)
TEST Dataset: (420, 2)


In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [23]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 8])

In [24]:
model(training_set.__getitem__(0)[0])

(tensor([[-0.0422,  0.0881,  0.2127,  0.4934,  0.2191, -0.0190, -0.1365]],
        grad_fn=<AddmmBackward>),)

In [0]:
## Training Params

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 1}

In [0]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [29]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
torch.max(output.data, 1)

torch.return_types.max(values=tensor([0.2931], device='cuda:0'), indices=tensor([4], device='cuda:0'))

In [30]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 2.00026798248291. Accuracy: 0.0%
Iteration: 100. Loss: 1.6270523071289062. Accuracy: 2.619047619047619%
Iteration: 200. Loss: 2.211087465286255. Accuracy: 29.761904761904763%
Iteration: 300. Loss: 2.071167469024658. Accuracy: 0.7142857142857143%
Iteration: 400. Loss: 2.1306004524230957. Accuracy: 1.1904761904761905%
Iteration: 500. Loss: 1.3009045124053955. Accuracy: 38.80952380952381%
Iteration: 600. Loss: 1.7583307027816772. Accuracy: 54.76190476190476%
Iteration: 700. Loss: 0.14388585090637207. Accuracy: 24.047619047619047%
Iteration: 800. Loss: 1.9846148490905762. Accuracy: 44.285714285714285%
Iteration: 900. Loss: 0.543931245803833. Accuracy: 63.095238095238095%
Iteration: 1000. Loss: 0.08029985427856445. Accuracy: 68.33333333333333%
Iteration: 1100. Loss: 0.4454386234283447. Accuracy: 84.76190476190476%
Iteration: 1200. Loss: 0.07848167419433594. Accuracy: 85.71428571428571%
Iteration: 1300. Loss: 0.052376747131347656. Accuracy: 88.33333333333333%
I

In [0]:
#  torch.save(model.state_dict(), 'drive/My Drive/Datasets/roberta_state_dict_'+ str(uuid4())+'.pth')

In [31]:
dataset.tail(5)

Unnamed: 0,utterance,label
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent


In [0]:
## Load model

In [0]:
# model_path = 'drive/My Drive/Datasets/roberta_state_dict_7a4ff0ec-b474-4622-8a2f-88c58f364272.pth'

In [0]:
# model.load_state_dict(torch.load(model_path, map_location=device))

In [0]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction

In [33]:
label_to_ix.keys()

dict_keys(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'])

In [34]:
get_reply("play radiohead song")

'PlayMusic'

In [35]:
get_reply("it is rainy in Sao Paulo")

'GetWeather'

In [36]:
get_reply("sun shinnes all day")

'PlayMusic'

In [37]:
get_reply("low humidity, high altitude")

'PlayMusic'

In [38]:
get_reply("Book tacos for me tonight")

'BookRestaurant'

In [39]:
get_reply("Book a table for me tonight")

'BookRestaurant'

In [40]:
get_reply("I want BBQ tonight")

'PlayMusic'