In [0]:
'''
Exercise - 1: I have provided sparse comments. Please provide detailed comments using what you have understood about Transformers. Especially
the "Roberta" framework in particular.
'''

In [0]:
#### CRITICAL - ENABLE GPU (Runtime > Change Runtime type > GPU)

In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


## Mount Drive into Colab
'''
We are dealing here with a massive dataset. We need to mount the google drive.
Once you run the code below follow google's prompts to sucessfully mount the drive

'''
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
'''
Install this
'''

!pip install pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |█▉                              | 10kB 13.7MB/s eta 0:00:01[K     |███▊                            | 20kB 4.5MB/s eta 0:00:01[K     |█████▋                          | 30kB 6.5MB/s eta 0:00:01[K     |███████▍                        | 40kB 6.0MB/s eta 0:00:01[K     |█████████▎                      | 51kB 7.3MB/s eta 0:00:01[K     |███████████▏                    | 61kB 8.6MB/s eta 0:00:01[K     |█████████████                   | 71kB 9.8MB/s eta 0:00:01[K     |██████████████▉                 | 81kB 10.9MB/s eta 0:00:01[K     |████████████████▊               | 92kB 12.1MB/s eta 0:00:01[K     |██████████████████▋             | 102kB 9.7MB/s eta 0:00:01[K     |████████████████████▍           | 112kB 9.7MB/s eta 0:00:01[K     |██████████████████████▎   

In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [4]:
## Make Sure Cuda is Available
print(torch.cuda.is_available())

True


In [0]:
'''
Important Step - Make sure you upload the data file to the exact location below. If you uploaded correctlt, the follwoing command will run
'''

!ls drive/'My Drive'/2017-06-custom-intent-engines

AddToPlaylist	GetWeather  RateBook   SearchCreativeWork
BookRestaurant	PlayMusic   README.md  SearchScreeningEvent


In [0]:
'''
Create the Dataset Path
'''

dataset_path = "drive/My Drive/2017-06-custom-intent-engines/"

In [10]:
'''
Code to Prep Data and see how it looks
'''


dataset = pd.DataFrame(columns = ['utterance', 'label'])
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open(dataset_path + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
    texts = []
    for i in range(len(data[intent])):
        text = ''
        for j in range(len(data[intent][i]['data'])):
            text += data[intent][i]['data'][j]['text']
        dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)
dataset.tail()

Class: AddToPlaylist, # utterances: 300
Class: BookRestaurant, # utterances: 300
Class: GetWeather, # utterances: 300
Class: PlayMusic, # utterances: 300
Class: RateBook, # utterances: 300
Class: SearchCreativeWork, # utterances: 300
Class: SearchScreeningEvent, # utterances: 300


Unnamed: 0,utterance,label
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent


In [11]:
'''
Assigning an Index to each intent. We will use this later
'''

label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}

In [12]:
'''
Loading Configurations
'''

config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [13]:
'''
Loading Pretrained tokenizer and instantiating the model from settings in config
'''

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

100%|██████████| 898823/898823 [00:00<00:00, 2100798.15B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1330732.33B/s]


In [0]:
'''
Some important Feature Engineering
'''


def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [0]:
'''
Wrapping the labels and data together in a Class.
'''

class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
'''
Test Train Split
'''

train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [0]:
'''
Train /Test sets ready
'''

training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [0]:
'''
Pull data into dataloader for efficiency
'''
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
'''
Instantiate the Loss
'''

loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [0]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 7])


In [0]:
'''
Actually train the model with train data
'''


max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 1.8607558012008667. Accuracy: 1.1904761904761905%
Iteration: 100. Loss: 1.6472851037979126. Accuracy: 2.380952380952381%
Iteration: 200. Loss: 1.9056799411773682. Accuracy: 0.23809523809523808%
Iteration: 300. Loss: 1.5030757188796997. Accuracy: 0.23809523809523808%
Iteration: 400. Loss: 1.7159450054168701. Accuracy: 71.66666666666667%
Iteration: 500. Loss: 1.5446991920471191. Accuracy: 22.38095238095238%
Iteration: 600. Loss: 2.1177279949188232. Accuracy: 37.61904761904762%
Iteration: 700. Loss: 1.4603145122528076. Accuracy: 16.428571428571427%
Iteration: 800. Loss: 1.9079666137695312. Accuracy: 36.666666666666664%
Iteration: 900. Loss: 0.1386106014251709. Accuracy: 38.80952380952381%
Iteration: 1000. Loss: 0.9919295310974121. Accuracy: 79.76190476190476%
Iteration: 1100. Loss: 0.14926362037658691. Accuracy: 91.9047619047619%
Iteration: 1200. Loss: 0.08057212829589844. Accuracy: 90.47619047619048%
Iteration: 1300. Loss: 0.12791872024536133. Accuracy: 86.

In [0]:
'''
Save the Model

Note - I have provided the trained model to you. Use that model to answer questions below.
'''

torch.save(model.state_dict(), 'drive/My Drive/2017-06-custom-intent-engines/roberta_state_dict_'+ str(uuid4())+'.pth')

In [0]:
'''
Note - 

1. Use the model I have given you, call it H
2. Upload H to the exact path /My Drive/ in your google drive
'''

model_path = 'drive/My Drive/2017-06-custom-intent-engines/roberta_state_dict_2316b155-b288-4782-a927-1d3e7c02c968.pth'

In [18]:
'''
Loading State dictionaries
'''

model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [0]:
'''
Exercise 1.

1. Come up with a function that uses the model to take a sentence as input 
2. Score the sentence using the model and generate a prediction
3. Return the prediction intent from the model

Done correctly, you have to write 4 lines

'''

def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    #Send Input Message to CUDA
    #Line - 1
  #Use the model to generate probability vector  
  #Line - 2
  print(output)

  #Use torch.max function to return the prediction index with highest probability. Check how to use torch.max?
  #Line - 3

  #Match prediction index to actual prediction label. Use the label_to_ix = {} dictionary we created earlier.
  #Line - 4

  return prediction

In [0]:
'''
Exercise 2. - Test your function on the following sentences
'''

In [20]:
get_reply("sun shinnes all day")

tensor([[-0.3798, -2.3614,  0.0549,  4.1121,  1.2865,  1.3850, -2.7650]],
       device='cuda:0', grad_fn=<AddmmBackward>)


'PlayMusic'

In [0]:
get_reply("it is rainy in Sao Paulo")

'GetWeather'

In [0]:
get_reply("play radiohead song")

'PlayMusic'

In [0]:
get_reply("Book tacos for me tonight")

'BookRestaurant'

In [0]:
get_reply("Book a table for me tonight")

'BookRestaurant'

In [0]:
get_reply("I want BBQ tonight")

'PlayMusic'

In [0]:
'''
Exercise - 3 [Open Ended]

How would you use this model in a Chat Bot?

'''