In [None]:
#### CRITICAL - ENABLE GPU 

import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pytorch-transformers

In [None]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [None]:
## Check if Cuda is Available
print(torch.cuda.is_available())

In [None]:
## Install PyTorch-Transformer

In [None]:
!pip install -U pytorch-transformers

In [None]:
## Importing Datasets

In [None]:
'''
Important Step - Make sure you upload the data file to the exact location below. If you uploaded it correctly, the following command will run
'''

!ls drive/'My Drive'/2017-06-custom-intent-engines

In [None]:
'''
Create the Dataset Path
'''


# Folder holding one sub-folder per intent. The trailing slash matters: the
# loading cell builds file paths by plain string concatenation.
# NOTE(review): the path is relative — presumably resolved against /content on
# Colab, where the drive is mounted; confirm the working directory.
dataset_path = "drive/My Drive/2017-06-custom-intent-engines/"

In [None]:

'''
***Explain*** Summarize, in bullet points, what is the code doing?. 
'''

# Build one (utterance, label) row per training example.
# Each intent has its own JSON file whose top-level key is the intent name.
# Every example stores its sentence as a list of chunks under 'data', and the
# chunks' 'text' fields are concatenated to recover the full utterance.
# NOTE(review): the files are read as cp1251 — confirm this matches the
# dataset's real encoding.
records = []
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open(dataset_path + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
    for example in data[intent]:
        text = ''.join(chunk['text'] for chunk in example['data'])
        records.append({'utterance': text, 'label': intent})

# Create the frame once from the collected rows. DataFrame.append copied the
# whole frame on every call (quadratic) and no longer exists in pandas 2.x.
dataset = pd.DataFrame(records, columns=['utterance', 'label'])
dataset.tail()

In [None]:
'''
Assigning an Index to each intent. We will use this later
'''

'''
***Explain*** Why do we convert labels to indexes?. 
'''

# Map every distinct label token to a consecutive integer, numbered in order
# of first appearance in the dataset.
label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        # setdefault leaves tokens that were already seen untouched.
        label_to_ix.setdefault(word, len(label_to_ix))
label_to_ix

In [None]:
## Loading RoBERTa classes

In [None]:
# Start from the published roberta-base configuration and size the
# classification head to the number of distinct intents.
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(label_to_ix)
config

In [None]:
'''
Loading Pretrained tokenizer and instantiating the model from settings in config
'''

'''
***Explain*** : a. What is a tokenizer? b. What is special about the following tokenizer?. 
'''

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

'''
***Explain*** :  What is the next line doing?
'''

# Load the pretrained roberta-base weights and attach a classification head
# sized by config.num_labels. Building the class straight from `config`
# (RobertaForSequenceClassification(config)) would leave every weight randomly
# initialised, so the training below would start from scratch instead of
# fine-tuning.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

In [None]:
## Feature Preparation

In [None]:
'''
Some important Feature Engineering
'''

'''
***Explain*** : What are the implications for setting  include_CLS_token = True, include_SEP_token = True ?
'''

def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    """Convert one raw text into token ids for the model.

    Relies on the module-level ``tokenizer``.

    Args:
        seq_1: Text to encode.
        max_seq_length: Upper bound on the returned sequence length, special
            tokens included.
        zero_pad: When True, right-pad the sequence up to ``max_seq_length``
            with the tokenizer's pad token; padded positions get mask 0.
        include_CLS_token: Prepend ``tokenizer.cls_token``.
        include_SEP_token: Append ``tokenizer.sep_token``.

    Returns:
        A pair ``(input_ids, input_mask)``: ``input_ids`` is a tensor of shape
        ``(1, seq_len)`` and ``input_mask`` is a plain list with 1 for real
        tokens and 0 for padding.
    """
    ## Tokenize input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate, reserving room only for the special tokens actually added.
    ## max(..., 0) stops a tiny max_seq_length from turning into a negative
    ## slice bound that would silently drop tokens from the end.
    n_special = int(bool(include_CLS_token)) + int(bool(include_SEP_token))
    tokens_a = tokens_a[:max(max_seq_length - n_special, 0)]

    ## Assemble [CLS] tokens [SEP]
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    tokens.extend(tokens_a)
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input mask: 1 for every real token
    input_mask = [1] * len(input_ids)

    ## Pad to the full sequence length with the tokenizer's own pad id.
    ## A literal 0 is wrong for RoBERTa, whose vocabulary maps 0 to <s>.
    if zero_pad:
        missing = max_seq_length - len(input_ids)
        if missing > 0:
            pad_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
            input_ids = input_ids + [pad_id] * missing
            input_mask = input_mask + [0] * missing
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [None]:
# Smoke-test the feature pipeline on one sentence; the cell output is the
# (token-id tensor, mask list) pair returned by prepare_features.
msg = "My dog is cute!"
prepare_features(msg)

In [None]:
## Dataset Loader Classes

In [None]:
class Intents(Dataset):
    """Dataset over a DataFrame with 'utterance' and 'label' columns.

    Items are tokenised lazily: every access runs ``prepare_features`` on the
    utterance and looks the label up in the module-level ``label_to_ix``.
    """

    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe

    def __getitem__(self, index):
        """Return ``(X, y)`` for the row at integer position ``index``.

        ``X`` is the token-id tensor from ``prepare_features`` and ``y`` is
        the integer class index of the row's label.
        """
        # Positional access: DataLoader samplers yield 0..len-1, which need not
        # match the frame's index labels unless the index was reset.
        utterance = self.data['utterance'].iloc[index]
        label = self.data['label'].iloc[index]
        X, _ = prepare_features(utterance)
        y = label_to_ix[label]
        return X, y

    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
# Sample the training rows while they still carry their original index labels,
# so the held-out rows can be found by dropping exactly those labels. Resetting
# the index first would make the drop remove rows 0..n_train-1 instead, leaving
# a "test" set that is merely the tail of the file and overlaps the training
# sample.
train_rows = dataset.sample(frac=train_size, random_state=200)
test_dataset = dataset.drop(train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

In [None]:
# Report the (rows, columns) shape of the full frame and of each split.
print(
    "FULL Dataset: {}".format(dataset.shape),
    "TRAIN Dataset: {}".format(train_dataset.shape),
    "TEST Dataset: {}".format(test_dataset.shape),
    sep="\n",
)

In [None]:
# Wrap each split in the Intents Dataset; utterances are tokenised on access,
# not here.
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [None]:
# Shape of the token-id tensor for the first training example.
training_set[0][0].shape

In [None]:
# Run the first training example through the model as a sanity check.
model(training_set[0][0])

In [None]:
## Training Params

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise.
# Moving the model with .to(device) rather than .cuda() honours that choice,
# so this cell no longer raises on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# DataLoader keyword arguments shared by the train and test loaders.
params = dict(
    batch_size=1,
    shuffle=True,
    drop_last=False,
    num_workers=1,
)

In [None]:
# Both loaders take the same `params`, so the test loader is shuffled too and
# every batch holds a single utterance.
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [None]:
'''
Instantiate the Loss
'''
'''
***Explain*** why cross entropy loss?, also print the model and explain why are not we using softmax at the end?
'''
# Cross-entropy loss on the model's output scores, optimised with Adam over
# all model parameters at a learning rate of 1e-5.
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
## Test Forward Pass
# Send one example to the device chosen earlier. A bare .cuda() would raise on
# a CPU-only runtime. Element 0 of the model's output holds the class scores.
inp = training_set[0][0].to(device)
output = model(inp)[0]
print(output.shape)

In [None]:
# Show the installed PyTorch version.
torch.__version__

In [None]:
'''
Actually train the model with train data
'''
'''
***Explain*** the Training Code Chunk in detail. Especially what is torch.max() doing here?
'''



def _evaluate_accuracy(model, loader, device):
    """Return the percentage of examples in ``loader`` classified correctly.

    The model is switched to eval mode (dropout off) and gradients are
    disabled for the pass; the previous train/eval mode is restored before
    returning. Returns 0.0 for an empty loader.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sent, label in loader:
            sent = sent.squeeze(0).to(device)
            label = label.to(device)
            logits = model(sent)[0]
            # torch.max over dim 1 returns (max values, argmax indices); the
            # indices are the predicted class ids.
            _, predicted = torch.max(logits, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    model.train(was_training)
    return 100.00 * correct / total if total else 0.0


max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        # The DataLoader stacks a batch dimension on top of the (1, seq_len)
        # tensor produced by the Dataset; drop it again.
        sent = sent.squeeze(0).to(device)
        label = label.to(device)
        output = model(sent)[0]

        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()

        # Every 100 steps, measure accuracy on the whole test loader.
        if i%100 == 0:
            accuracy = _evaluate_accuracy(model, testing_loader, device)
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

In [None]:
'''
***Explain*** what is the get_reply function doing?
'''
def get_reply(msg):
  """Predict the intent label for a single message.

  Tokenises ``msg`` with ``prepare_features``, runs the module-level ``model``
  in eval mode without tracking gradients, and maps the highest-scoring class
  index back to its label through ``label_to_ix``.

  Args:
    msg: Text to classify.

  Returns:
    The label whose index in ``label_to_ix`` equals the predicted class.
  """
  model.eval()
  input_msg, _ = prepare_features(msg)
  input_msg = input_msg.to(device)
  with torch.no_grad():
    output = model(input_msg)[0]
  pred_ix = torch.argmax(output, dim=1).item()
  # Invert the mapping explicitly rather than relying on the dict's key order
  # happening to match the index values.
  ix_to_label = {ix: label for label, ix in label_to_ix.items()}
  return ix_to_label[pred_ix]

In [None]:
# The intent names get_reply can return.
label_to_ix.keys()

In [None]:
'''Different text sentences pass to the model'''

# Music request — presumably should come back as PlayMusic; confirm against the output.
get_reply("play radiohead song")

In [None]:
# Weather statement rather than a question — presumably GetWeather; confirm.
get_reply("it is rainy in Sao Paulo")

In [None]:
# Weather phrase containing a misspelling, which exercises sub-word tokenisation.
get_reply("sun shinnes all day")

In [None]:
# Weather-flavoured fragment with no explicit request; probes vague input.
get_reply("low humidity, high altitude")

In [None]:
# "Book" with food instead of a table — presumably BookRestaurant; confirm.
get_reply("Book tacos for me tonight")

In [None]:
# Canonical restaurant-booking phrasing — presumably BookRestaurant; confirm.
get_reply("Book a table for me tonight")

In [None]:
# No booking verb at all; probes whether the food context alone is enough.
get_reply("I want BBQ tonight")