### Run the below cells until 'stop' to get your data processed and loaded

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
from __future__ import unicode_literals, print_function, division

import torch
import torch.nn as nn
import torchvision.datasets as dsets
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import re
import string


from torch.utils.data import Dataset, DataLoader


from io import open
import unicodedata
import re
import random

SOS_token = '<SOS>'
EOS_token = '<EOS>'
UNK_token = '<UNK>'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
print(torch.__version__)

1.0.1


# stop

Below this block is your responsibility! Best of luck

In [3]:
# STEP 2.5: CLEANING DATA
movie_text = open('moviedialogues/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('moviedialogues/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

lineToText = {}  # mapping of line number to text
# inputToOutput = {}
inputs = []
outputs = []
for line in movie_text:
    things = line.split("+++$+++")
#     print(things)
    if (len(things) == 5):  
#         key = re.sub("[^0-9]", "", things[0])
        val = things[4].translate(str.maketrans('', '', string.punctuation))
#         lineToText[int(key)] = val
        lineToText[things[0].replace(" ", "")] = val

        
# print(lineToText[295])


for conversation in conv_lines:
    things = conversation.split("+++$+++")
    if (len(things) == 4):
        convo = things[3]
        convo = [x.strip() for x in convo.split(',')]
        convo[0] = convo[0].replace("[", "")
        convo[len(convo) - 1] = convo[len(convo) - 1].replace("]", "")
        for index in range(0, len(convo)):
            convo[index] = convo[index].replace("'", "")
#         print(convo)
        #convo is a string, need to split by comma, remove first [ and last ], and then do this
        for i in range(0, len(convo) - 1):
#             inputSentenceIndex = re.sub("[^0-9]", "", convo[i])
#             outputSentenceIndex = re.sub("[^0-9]", "", convo[i + 1])    
            #print(convo[i])
            inputSentenceIndex = convo[i]
            outputSentenceIndex = convo[i + 1]
            if (inputSentenceIndex in lineToText) and (outputSentenceIndex in lineToText):
                inputs.append(lineToText[inputSentenceIndex])
                outputs.append(lineToText[outputSentenceIndex])
                
            
print(len(inputs))
# for i in range(0, 10):
#     print(inputs[i])
#     print(outputs[i])
#     print("~~~~~")

221616


In [4]:
# '''
# Dataset Class
# '''

class ConvoDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
        
    def __getitem__(self, index):
        iwords = self.inputs[index].split(" ")
        owords = self.outputs[index].split(" ")
        
        iwords = list(filter(None, iwords))
        owords = list(filter(None, owords))
        
#         return iwords, owords
        _input = torch.tensor([model.vocab[word].index if word in model.vocab else model.vocab[UNK_token].index for word in iwords], dtype=torch.long)
        _output = torch.tensor([model.vocab[word].index if word in model.vocab else model.vocab[UNK_token].index for word in owords], dtype=torch.long)
        
        return _input, _output 
    
    def __len__(self):
        return len(self.inputs)
        

In [5]:
training_input = inputs[0:16000]
training_output = outputs[0:16000]
testing_input = inputs[16000:]
testing_output = outputs[16000:]

In [6]:
# '''
# MAKE DATA ITERABLE
# '''
params = {'batch_size' : 16,
         'shuffle': True,
         'num_workers': 1}

training_set = ConvoDataset(training_input, training_output)
training_generator = DataLoader(training_set, **params)

testing_set = ConvoDataset(testing_input, testing_output)
testing_generator = DataLoader(testing_set, **params)

In [7]:
# '''
# STEP 2.75: CREATE EMBEDDINGS
# '''
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
model = api.load("glove-wiki-gigaword-50")

# model = api.load("word2vec-google-news-300")
#model = Word2Vec(inputs,size=100, window=5, min_count=5, workers=4) # download dataset to replace inputs
#model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
#gensim model created
import torch
print(model.vector_size)

model.add(['<SOS>', '<EOS>', '<UNK>'], [np.random.rand(50), np.random.rand(50), np.random.rand(50)])
weights = torch.FloatTensor(model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)



50




In [8]:
# '''
# STEP 3: CREATE MODEL CLASS
# '''

class EncoderRNN(nn.Module):
    def __init__(self, hidden_sz):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
#         self.embedding = nn.Embedding(input_sz, hidden_sz)
        self.embedding = embedding
        self.gru = nn.GRU(hidden_sz, hidden_sz)
        
    def forward(self, _input, hidden):
        output = self.embedding(_input).view(1, 1, -1) # the -1 infers the dimension, the 1, 1 is a 1D vector
        output, hidden = self.gru(output, hidden)
        return output, hidden
        
    def hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    
    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_sz, output_sz):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_sz

        #self.embedding = nn.Embedding(output_sz, hidden_sz)
        self.embedding = embedding
        self.gru = nn.GRU(hidden_sz, hidden_sz)
        self.out = nn.Linear(hidden_sz, output_sz)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, _input, hidden):
        output = self.embedding(_input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

    
class Attention(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=20):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [9]:
'''
STEP 4: INSTANTIATE MODEL CLASS
'''

#in translation example, first arg for encoder and second arg for attnetion is num of words in a sentence? idk if we
#should be having that or something else?
# model = FeedForwardModel()
# hidden_size = 300
hidden_size = 50
vocab_size = 400003
encoder1 = EncoderRNN(hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, vocab_size).to(device)

# attn_decoder1 = Attention(hidden_size, len(outputs), dropout_p=0.1).to(device)
# attn_decoder1 = Attention(hidden_size, len(outputs), dropout_p=0.1)

# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [10]:
!nvidia-smi

Sun Apr 28 00:08:16 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro M4000        Off  | 00000000:00:05.0 Off |                  N/A |
| 46%   30C    P0    42W / 120W |    544MiB /  8126MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [11]:
import time
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [12]:
'''
STEP 7: TRAIN THE MODEL
'''
# we want to call torch.tensor() on a list of indexes
# each sentence becomes a list of indexes --> an input tensor that we put into train()

MAX_LENGTH = 20
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.hidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = min(input_tensor.size(0), 20) # DON'T USE MIN ITS HACKY AF
    target_length = min(target_tensor.size(0), 20)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[model.vocab[SOS_token].index]], device=device)

    decoder_hidden = encoder_hidden
    
    # Without teacher forcing: use its own predictions as the next input
    for di in range(target_length):
#         decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)        
        decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
    
    
        inp = decoder_output #torch.tensor([[torch.max(decoder_output, 1)[1].item()]], dtype=torch.float).to(device)
        tar = torch.tensor([target_tensor[di].item()], dtype=torch.long).to(device)
            
        loss += criterion(inp, tar)
#         if decoder_input.item() == model.vocab[EOS_token].index:
#             break
    if isinstance(loss, int):
        print(input_tensor, target_tensor)
    else:
        loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    if isinstance(loss, int):
        return 0
    else:
        return loss.item() / target_length
    
    
def indexize(string):
    """
    string: sentence to indexize into word indices
    
    Returns: a tensor of word indices
    """
    words = string.split(" ")
    words = list(filter(None, words))
    words.append("<EOS>")
    indexed = torch.tensor([model.vocab[word].index if word in model.vocab else model.vocab[UNK_token].index for word in words], dtype=torch.long)
    return indexed
    
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    #pass in our data here?
    criterion = nn.CrossEntropyLoss()
    start=time.time()
#     for _input, _output in training_generator:
    for epoch in range(0, 10):
        print("epoch: " + str(epoch))

        for idx in range(len(training_input)):
#split input into array and make into pytorch tensor

#         input_words = _input.split(" ")
#         input_indexes = [model.vocab[word].index for word in input_words]
#         input_tensor = torch.tensor(input_indexes, dtype=torch.long)
        
#         output_words = _output.split(" ")
#         output_indexes = [model.vocab[word].index for word in output_words]
#         output_tensor = torch.tensor(output_indexes, dtype=torch.long)
            _input = indexize(training_input[idx])
            _output = indexize(training_output[idx])
            
            input_tensor = _input.to(device)
            target_tensor = _output.to(device)
            loss_f = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss_f
            plot_loss_total += loss_f

            if idx % 100 == 0:
                print_loss_avg = print_loss_total / print_every
                end = time.time()
                print('training idx:[{}/{}], Loss: {:.4f}, Time: {:.2f}'
                  .format(idx, len(training_input), print_loss_total, end-start))
#             print('%s (%d %d%%) %.4f' % (timeSince(start, idx / n_iters),
#                                          idx, idx / n_iters * 100, print_loss_avg))
                print_loss_total = 0

            if idx % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

        showPlot(plot_losses)
    
        torch.save(encoder1.state_dict(), 'current encoder')
        torch.save(decoder1.state_dict(), 'current decoder')



In [None]:
trainIters(encoder1, decoder1, 10000)

epoch: 0
training idx:[0/16000], Loss: 6.1912, Time: 0.22
training idx:[100/16000], Loss: 445.9126, Time: 13.98
training idx:[200/16000], Loss: 484.6305, Time: 27.78
training idx:[300/16000], Loss: 464.8628, Time: 40.21
training idx:[400/16000], Loss: 400.0831, Time: 51.02
training idx:[500/16000], Loss: 471.8086, Time: 64.96
training idx:[600/16000], Loss: 477.3969, Time: 78.75
training idx:[700/16000], Loss: 457.9997, Time: 92.32
training idx:[800/16000], Loss: 449.6085, Time: 106.20
training idx:[900/16000], Loss: 459.4976, Time: 121.57
training idx:[1000/16000], Loss: 446.3420, Time: 134.95
training idx:[1100/16000], Loss: 452.7066, Time: 149.25
training idx:[1200/16000], Loss: 494.2113, Time: 163.32
training idx:[1300/16000], Loss: 490.6902, Time: 179.27
training idx:[1400/16000], Loss: 503.5374, Time: 195.70
training idx:[1500/16000], Loss: 517.3355, Time: 213.87
training idx:[1600/16000], Loss: 514.0556, Time: 232.25
training idx:[1700/16000], Loss: 495.9643, Time: 248.97
traini

training idx:[14500/16000], Loss: 506.9247, Time: 2056.18
training idx:[14600/16000], Loss: 459.6144, Time: 2071.20
training idx:[14700/16000], Loss: 489.0613, Time: 2089.28
training idx:[14800/16000], Loss: 473.1476, Time: 2104.69
training idx:[14900/16000], Loss: 488.1450, Time: 2120.27
training idx:[15000/16000], Loss: 474.4780, Time: 2138.34
training idx:[15100/16000], Loss: 431.8643, Time: 2152.26
training idx:[15200/16000], Loss: 468.9610, Time: 2168.52
training idx:[15300/16000], Loss: 496.6872, Time: 2184.70
training idx:[15400/16000], Loss: 458.6644, Time: 2198.22
training idx:[15500/16000], Loss: 456.0121, Time: 2210.19
training idx:[15600/16000], Loss: 497.7836, Time: 2224.74
training idx:[15700/16000], Loss: 481.5722, Time: 2239.46
training idx:[15800/16000], Loss: 447.9803, Time: 2254.36
training idx:[15900/16000], Loss: 477.4915, Time: 2268.00
epoch: 1
training idx:[0/16000], Loss: 506.5774, Time: 2286.68
training idx:[100/16000], Loss: 439.6130, Time: 2300.55
training id

In [14]:
def evaluate(encoder, decoder, sentence, max_length=20):
    with torch.no_grad():
        input_tensor = indexize(sentence).to(device)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.hidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
#             encoder_outputs[ei] += encoder_output[0, 0]
        decoder_input = torch.tensor([[model.vocab[SOS_token].index]],device=device)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
#         decoder_attentions = torch.zeros(max_length, max_length)
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
#             decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            print(topv)
            if topi.item() == 400001:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(model.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words #, decoder_attentions[:di + 1]

In [16]:
evaluate(encoder1, decoder1, "hello how are you")

tensor([[-12.3147]], device='cuda:0')
tensor([[-12.2791]], device='cuda:0')
tensor([[-12.3090]], device='cuda:0')
tensor([[-12.3199]], device='cuda:0')
tensor([[-12.3953]], device='cuda:0')
tensor([[-12.3737]], device='cuda:0')
tensor([[-12.3616]], device='cuda:0')
tensor([[-12.3071]], device='cuda:0')
tensor([[-12.2213]], device='cuda:0')
tensor([[-12.3046]], device='cuda:0')
tensor([[-12.3837]], device='cuda:0')
tensor([[-12.3477]], device='cuda:0')
tensor([[-12.3476]], device='cuda:0')
tensor([[-12.2501]], device='cuda:0')
tensor([[-12.2379]], device='cuda:0')
tensor([[-12.2039]], device='cuda:0')
tensor([[-12.3406]], device='cuda:0')
tensor([[-12.2950]], device='cuda:0')
tensor([[-12.2031]], device='cuda:0')
tensor([[-12.2358]], device='cuda:0')


['boudou',
 'inconsolable',
 'super-strong',
 'marquessate',
 'born-again',
 'besseghir',
 '32.75',
 'inconsolable',
 'redivide',
 'beetz',
 'awesomely',
 'antigens',
 'atriplex',
 'chubar',
 'euphorbiaceae',
 'neuman',
 'half-timbered',
 'half-timbered',
 'aquib',
 'rigoberto']

In [16]:
indexize("They do not!")

tensor([400002,     88, 400002, 400001])