## Imports

In [None]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.utils.data
import collections
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Utils & Data prep

In [None]:
# From DataFrame to Dataset to DataLoader: load the recipes and integer-encode them.
data = pd.read_json("inputs/cooking/train.json")

# String to int.
# The vocabulary is sorted so the ingredient -> index mapping is deterministic.
# Iterating a bare set of strings can yield a different order in every
# interpreter session (string hashing is randomized), which would silently
# change every index between runs.
vocab = sorted({word for sentence in data.ingredients for word in sentence})
# Ingredients that are not in the vocabulary fall back to the index len(vocab).
stoi = collections.defaultdict(
    lambda: len(vocab),
    {string: integer for integer, string in enumerate(vocab)},
)
# Padding gets its own index, one past the unknown-ingredient index.
padIndex = len(vocab) + 1

# Y => categories
data.cuisine = data.cuisine.astype("category")
Y_unique = len(data.cuisine.cat.categories)

# Convert each ingredient in each recipe to its corresponding int.
data["X"] = data.ingredients.apply(lambda l: np.array([stoi[s] for s in l]))

# Dictionary which converts a cuisine category code back to its string value.
itos = {i: c for i, c in enumerate(data.cuisine.cat.categories)}

# Split train/valid sets (15% held out); the seed fixes the split.
np.random.seed(1)
train, val = train_test_split(data, test_size=0.15)
assert len(train) + len(val) == len(data)


class MakeDS(torch.utils.data.Dataset):
    """Map-style dataset that pairs X[i] with y[i]."""

    def __init__(self, X, y):
        # X and y are indexed in parallel by __getitem__.
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from the length of X."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return the (X[index], y[index]) pair."""
        features = self.X[index]
        label = self.y[index]
        return features, label

# Build three datasets: `dataset` holds every row of the training data,
# while `trainset` and `valset` hold the two halves of the train/validation split.
dataset, trainset, valset = (
    MakeDS(frame.X.values, frame.cuisine.cat.codes.values)
    for frame in (data, train, val)
)

# Custom collate function which takes a batch of samples and packs them into a
# tensor of shape (sequence length, batch size), padded out to the longest
# ingredient list in the batch.
def collate(samples, pad_index=None, target_device=None):
    """Collate (sequence, label) samples into a padded batch.

    Args:
        samples: non-empty list of samples; samples[i][0] is a sequence of
            integer token indices and samples[i][1] is an integer label.
        pad_index: value used to fill positions past the end of a sequence.
            Defaults to the module-level `padIndex`.
        target_device: device the returned tensors are moved to. Defaults to
            the module-level `device`.

    Returns:
        A tuple (tokens, labels): `tokens` is a long tensor of shape
        (max sequence length, batch size) and `labels` is a long tensor of
        shape (batch size,).

    Raises:
        ValueError: if `samples` is empty (there is no maximum length).
    """
    # Fall back to the notebook-level globals so a DataLoader can keep
    # calling collate(samples) with a single argument.
    if pad_index is None:
        pad_index = padIndex
    if target_device is None:
        target_device = device

    sequences = [torch.as_tensor(s[0], dtype=torch.long) for s in samples]
    labels = torch.tensor([s[1] for s in samples], dtype=torch.long)

    maxLen = max(len(seq) for seq in sequences)
    # Start from a tensor that is entirely padding, then overwrite the prefix
    # of each column with the real tokens.
    out = torch.full((maxLen, len(sequences)), pad_index, dtype=torch.long)
    for i, seq in enumerate(sequences):
        out[:len(seq), i] = seq
    return out.to(target_device), labels.to(target_device)

# Create the dataloaders
batchsize = 32
# Training batches are reshuffled every epoch so the optimizer does not see the
# same batches in the same order each time (consistent with fullDL below).
trainDL = torch.utils.data.DataLoader(trainset, batchsize, collate_fn=collate, shuffle=True)
# Validation data is only evaluated, so it keeps a fixed order.
valDL = torch.utils.data.DataLoader(valset, batchsize, collate_fn=collate)
# Loader over every row, for training on the full data set.
fullDL = torch.utils.data.DataLoader(dataset, batchsize, shuffle=True, collate_fn=collate)

## Truncated-normal init for embedding layer

In [None]:
# copied from https://github.com/fastai/fastai/blob/master/fastai/layers.py#L116,
# implements initialization for the embedding layer
def trunc_normal_(x: torch.Tensor, mean: float = 0., std: float = 1.) -> torch.Tensor:
    """Fill `x` in place with an approximately truncated normal and return it.

    Standard-normal samples are folded into the open interval (-2, 2) with
    `fmod`, then scaled by `std` and shifted by `mean`, so every value lies
    within two standard deviations of the mean.

    Args:
        x: tensor to overwrite in place.
        mean: centre of the resulting distribution.
        std: scale applied to the folded samples.

    Returns:
        The same tensor object `x`, after modification.
    """
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    x.normal_()   # draw from N(0, 1)
    x.fmod_(2)    # wrap outliers back into (-2, 2) instead of resampling them
    x.mul_(std)
    x.add_(mean)
    return x

## RNN modelling

In [None]:
# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    """Embedding -> bidirectional GRU -> linear classifier over a whole sequence.

    Args:
        vocabSize: number of rows in the embedding table; every token index
            fed to `forward` must be smaller than this.
        embSize: size of each token embedding.
        hiddenSize: GRU hidden size per direction.
        nlayers: number of stacked GRU layers.
        Y_unique: number of output classes.
    """

    def __init__(self,vocabSize,embSize,hiddenSize,nlayers,Y_unique):
        super().__init__()
        self.hiddenSize = hiddenSize
        self.nlayers = nlayers
        # Token embedding table, initialised from a truncated normal with a
        # small standard deviation.
        self.ingredEmb = torch.nn.Embedding(vocabSize, embSize)
        with torch.no_grad():
            trunc_normal_(self.ingredEmb.weight, std=0.01)
        self.embDropout = torch.nn.Dropout(0.5)
        # The GRU must be bidirectional: the output layer below expects the
        # forward and backward final states concatenated (hiddenSize * 2).
        # Inter-layer dropout only exists between stacked layers, so it is
        # disabled for a single layer to avoid PyTorch's warning.
        self.ingredEnc = torch.nn.GRU(
            embSize,
            hiddenSize,
            nlayers,
            dropout=0.5 if nlayers > 1 else 0.0,
            bidirectional=True,
        )
        self.encDropout = torch.nn.Dropout(0.5)
        # Linear output layer over the concatenated final states.
        self.out = torch.nn.Linear(hiddenSize * 2, Y_unique)

    def forward(self,inp):
        """Return class scores of shape (batch size, Y_unique).

        `inp` is a long tensor of token indices with shape
        (sequence length, batch size).
        """
        emb = self.embDropout(self.ingredEmb(inp))
        # With no initial state given, the GRU starts from zeros created on
        # the same device as its input.
        _, h = self.ingredEnc(emb)
        # h is laid out layer by layer with both directions per layer, so the
        # last two entries are the top layer's forward and backward states.
        final = torch.cat([h[-2], h[-1]], dim=1)
        return self.out(self.encDropout(final))


## Instantiate RNN

In [None]:
# RNN(vocabSize = number of distinct token indices,
#     embSize   = embedding size,
#     hiddenSize = hidden size,
#     nlayers   = number of layers,
#     Y_unique  = number of possible outputs, i.e. number of cuisines)
# vocabSize is len(vocab) + 2 so the table also has a row for the
# unknown-ingredient index (len(vocab)) and for padIndex (len(vocab) + 1).
model = RNN(
    vocabSize=len(vocab) + 2,
    embSize=10,
    hiddenSize=9,
    nlayers=10,
    Y_unique=Y_unique,
)
model = model.to(device)


# Pull a single batch from the training loader and run it through the model
# as a quick check that the output shape is what we expect.
x, y = next(iter(trainDL))

model(x)

## Define training and printing func

In [None]:
# Function to calculate the accuracy of one batch of predictions.

def batchAccuracy(preds,target):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        preds: tensor of per-class scores with shape (batch size, classes).
        target: tensor of class indices with shape (batch size,).

    Returns:
        A Python float: number of correct predictions divided by the batch size.
    """
    # argmax is taken on the raw scores: softmax preserves their ordering, so
    # applying it first is wasted work and can only add rounding ties.
    predicted = torch.argmax(preds, dim=1)
    n_correct = (predicted == target).sum().item()
    return n_correct / len(predicted)

# training function

def training(model,epochs,lr,trainDL,valDL=None):
    """Train `model` with Adam and cross-entropy, printing the loss per epoch.

    Args:
        model: module mapping a batch `x` to per-class scores.
        epochs: number of passes over `trainDL`.
        lr: learning rate for the Adam optimizer.
        trainDL: iterable of (x, y) training batches.
        valDL: optional iterable of (x, y) validation batches; when given, the
            mean validation loss and accuracy are printed after every epoch.

    Returns:
        None. The model is updated in place.
    """
    lossFn = torch.nn.functional.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True, weight_decay=5e-4)

    for e in tqdm_notebook(range(epochs)):
        model.train()
        # The loader itself (not iter(loader)) is handed to tqdm so the
        # progress bar can read its length and show a total.
        with tqdm_notebook(trainDL, leave=False) as t:
            running_loss, n_batches = 0.0, 0
            for x, y in t:
                optimizer.zero_grad()
                loss = lossFn(model(x), y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                n_batches += 1
                t.set_postfix({"loss": running_loss / n_batches})
            print(f"Epoch {e+1} Training Set Loss: {running_loss / n_batches}")
        if valDL is not None:
            model.eval()
            with torch.no_grad():
                val_loss, val_accuracy, n_val = 0.0, 0.0, 0
                for x, y in tqdm_notebook(valDL, leave=False):
                    pred = model(x)
                    # .item() keeps the running total a plain float, so the
                    # message below prints a number rather than a tensor repr.
                    val_loss += lossFn(pred, y).item()
                    val_accuracy += batchAccuracy(pred, y)
                    n_val += 1
                print(f"Validation Set Loss: {val_loss / n_val}, Accuracy: {val_accuracy / n_val}")

## Train the model

In [None]:
# Train for 12 epochs at a learning rate of 1e-2 on the training loader.
# No validation loader is passed, so no validation metrics are printed.
training(model, epochs=12, lr=1e-2, trainDL=trainDL)

In [None]:
# Put the model in evaluation mode, which disables its Dropout layers.
model.eval()