In [53]:
import torch
from torch import nn, optim
import numpy as np
from torch.utils.data import DataLoader
import pandas as pd
import ast

from collections import Counter
from sklearn.model_selection import train_test_split


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [48]:
class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))

In [42]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        train_df
    ):
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        text = train_df.str.cat(sep=' ')
        return text.split(' ')

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - 4

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+4]),
            torch.tensor(self.words_indexes[index+1:index+4+1]),
        )

In [55]:
def train(dataset, model, results_list, batch=256, max_epochs=10, sequence_length=4):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(sequence_length)

        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
            results_list.append([epoch, batch, loss.item()])

In [50]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

In [51]:
df = pd.read_csv('data/RAW_recipes.csv')
recipe_df = df[["steps"]]

recipe_df["processed"] = [". ".join(ast.literal_eval(step_array)) for step_array in recipe_df.steps]

train_df, test_df = train_test_split(recipe_df["processed"], train_size=.8, test_size=.2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [56]:
dataset = Dataset(train_df)
model = Model(dataset)
results = []

train(dataset, model, results)
print(predict(dataset, model, text='Add '))

{'epoch': 0, 'batch': 0, 'loss': 11.2608060836792}
{'epoch': 0, 'batch': 1, 'loss': 11.245948791503906}
{'epoch': 0, 'batch': 2, 'loss': 11.238430976867676}
{'epoch': 0, 'batch': 3, 'loss': 11.228371620178223}
{'epoch': 0, 'batch': 4, 'loss': 11.220096588134766}
{'epoch': 0, 'batch': 5, 'loss': 11.21103286743164}
{'epoch': 0, 'batch': 6, 'loss': 11.186309814453125}
{'epoch': 0, 'batch': 7, 'loss': 11.159743309020996}
{'epoch': 0, 'batch': 8, 'loss': 11.09915542602539}
{'epoch': 0, 'batch': 9, 'loss': 10.991277694702148}
{'epoch': 0, 'batch': 10, 'loss': 10.905417442321777}
{'epoch': 0, 'batch': 11, 'loss': 10.700589179992676}
{'epoch': 0, 'batch': 12, 'loss': 10.531011581420898}
{'epoch': 0, 'batch': 13, 'loss': 10.078740119934082}
{'epoch': 0, 'batch': 14, 'loss': 9.9773530960083}
{'epoch': 0, 'batch': 15, 'loss': 9.875476837158203}
{'epoch': 0, 'batch': 16, 'loss': 9.75689697265625}
{'epoch': 0, 'batch': 17, 'loss': 9.167888641357422}
{'epoch': 0, 'batch': 18, 'loss': 9.0407295227050

{'epoch': 0, 'batch': 154, 'loss': 7.071927547454834}
{'epoch': 0, 'batch': 155, 'loss': 6.83410120010376}
{'epoch': 0, 'batch': 156, 'loss': 6.762159824371338}
{'epoch': 0, 'batch': 157, 'loss': 6.286306858062744}
{'epoch': 0, 'batch': 158, 'loss': 6.357349872589111}
{'epoch': 0, 'batch': 159, 'loss': 7.04067325592041}
{'epoch': 0, 'batch': 160, 'loss': 6.353498458862305}
{'epoch': 0, 'batch': 161, 'loss': 6.203556060791016}
{'epoch': 0, 'batch': 162, 'loss': 6.457461833953857}
{'epoch': 0, 'batch': 163, 'loss': 6.3608527183532715}
{'epoch': 0, 'batch': 164, 'loss': 6.285211086273193}
{'epoch': 0, 'batch': 165, 'loss': 6.271991729736328}
{'epoch': 0, 'batch': 166, 'loss': 5.985750675201416}
{'epoch': 0, 'batch': 167, 'loss': 6.873164653778076}
{'epoch': 0, 'batch': 168, 'loss': 6.220191955566406}
{'epoch': 0, 'batch': 169, 'loss': 6.192991733551025}
{'epoch': 0, 'batch': 170, 'loss': 6.229339599609375}
{'epoch': 0, 'batch': 171, 'loss': 6.6522016525268555}
{'epoch': 0, 'batch': 172, '

{'epoch': 0, 'batch': 306, 'loss': 6.584113121032715}
{'epoch': 0, 'batch': 307, 'loss': 6.231884956359863}
{'epoch': 0, 'batch': 308, 'loss': 6.643652439117432}
{'epoch': 0, 'batch': 309, 'loss': 6.418027877807617}
{'epoch': 0, 'batch': 310, 'loss': 5.986758708953857}
{'epoch': 0, 'batch': 311, 'loss': 6.269043445587158}
{'epoch': 0, 'batch': 312, 'loss': 6.458680152893066}
{'epoch': 0, 'batch': 313, 'loss': 6.354611873626709}
{'epoch': 0, 'batch': 314, 'loss': 6.356424331665039}
{'epoch': 0, 'batch': 315, 'loss': 6.019131660461426}
{'epoch': 0, 'batch': 316, 'loss': 6.578962326049805}
{'epoch': 0, 'batch': 317, 'loss': 6.37463903427124}
{'epoch': 0, 'batch': 318, 'loss': 6.465153694152832}
{'epoch': 0, 'batch': 319, 'loss': 6.514948844909668}
{'epoch': 0, 'batch': 320, 'loss': 6.120205402374268}
{'epoch': 0, 'batch': 321, 'loss': 6.135799884796143}
{'epoch': 0, 'batch': 322, 'loss': 6.444023609161377}
{'epoch': 0, 'batch': 323, 'loss': 7.186972141265869}
{'epoch': 0, 'batch': 324, 'l

KeyboardInterrupt: 