In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import numpy as np
from collections import Counter
import os
from argparse import Namespace

In [36]:
# Export one user's text-only chat messages to the training corpus file.
filepath = 'user_data/' + 'John' + '/' + 'John' + '_messages.json'


def _is_text_only(entry):
    """Return True for a record that has a 'message' and whose 'media' is null.

    Anything that is not a dict, or that lacks either key, is rejected rather
    than raising, so malformed records are skipped without hiding real errors
    behind a blanket ``except``.
    """
    return (
        isinstance(entry, dict)
        and 'media' in entry
        and entry['media'] is None
        and 'message' in entry
    )


# Parse the JSON export; the context manager closes the file even if parsing fails.
with open(filepath) as f:
    data = json.load(f)

# Message bodies of every record that carries no media attachment.
message = [entry['message'] for entry in data if _is_text_only(entry)]

# Append one message per line to the training corpus.
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# the same messages again; delete output.txt first if a fresh corpus is wanted.
with open("output.txt", "a") as f:
    for text in message:
        # Only string payloads can be written as a line; anything else is skipped.
        if isinstance(text, str):
            f.write(text + '\n')


In [7]:
# Settings shared by the cells below, collected on a single Namespace object.
flags = Namespace()
flags.train_file = 'output.txt'      # corpus file handed to get_data_from_file
flags.seq_size = 32                  # tokens per training window
flags.batch_size = 16                # number of parallel rows in each batch
flags.embedding_size = 64
flags.lstm_size = 64
flags.gradients_norm = 5             # max norm passed to clip_grad_norm_
flags.initial_words = ['I', 'am']    # seed words handed to predict
flags.predict_top_k = 5
flags.checkpoint_path = 'checkpoint'

In [35]:
def get_data_from_file(train_file, batch_size, seq_size):
    """Read a whitespace-tokenised corpus and lay it out for batched training.

    The vocabulary is ordered by descending frequency (ties keep first-seen
    order). The token stream is truncated to a whole number of
    ``batch_size * seq_size`` chunks; the target stream is the input stream
    shifted left by one token, with the first token wrapped around to the end.

    Args:
        train_file: path of the text file to read.
        batch_size: number of rows in the returned arrays.
        seq_size: window length used later when slicing batches.

    Returns:
        ``(int_to_vocab, vocab_to_int, n_vocab, in_text, out_text)`` where the
        two arrays have shape ``(batch_size, -1)``.

    Raises:
        ValueError: if ``batch_size * seq_size`` is not positive, or the corpus
            holds fewer tokens than one full ``batch_size * seq_size`` chunk.
    """
    with open(train_file, 'r') as f:
        tokens = f.read().split()

    word_counts = Counter(tokens)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = dict(enumerate(sorted_vocab))
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    tokens_per_chunk = seq_size * batch_size
    if tokens_per_chunk <= 0:
        raise ValueError(
            'batch_size * seq_size must be positive, got {}'.format(tokens_per_chunk))

    num_batches = len(tokens) // tokens_per_chunk
    if num_batches == 0:
        # Without this guard the wrap-around below would fail with an opaque
        # IndexError on an empty array.
        raise ValueError(
            '{!r} has {} tokens; at least batch_size * seq_size = {} are required'
            .format(train_file, len(tokens), tokens_per_chunk))

    int_text = [vocab_to_int[w] for w in tokens]
    in_text = np.asarray(int_text[:num_batches * tokens_per_chunk])
    # Target = input shifted left by one; the first token wraps to the last slot.
    out_text = np.roll(in_text, -1)

    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text



In [43]:
def get_batches(in_text, out_text, batch_size, seq_size):
    """Lazily yield aligned ``(input, target)`` column windows.

    Each yielded pair is ``in_text[:, lo:lo + seq_size]`` and the matching
    slice of ``out_text``. The number of windows is the total element count of
    ``in_text`` floor-divided by ``seq_size * batch_size``; trailing columns
    that do not fill a whole window are not yielded.
    """
    stride = seq_size
    limit = (np.prod(in_text.shape) // (stride * batch_size)) * stride
    yield from (
        (in_text[:, lo:lo + stride], out_text[:, lo:lo + stride])
        for lo in range(0, limit, stride)
    )

In [44]:
class RNNModule(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super().__init__()
        self.seq_size, self.lstm_size = seq_size, lstm_size
        # Sub-modules are registered as `embedding`, `lstm`, `dense`, in that order.
        self.embedding = nn.Embedding(num_embeddings=n_vocab,
                                      embedding_dim=embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(in_features=lstm_size, out_features=n_vocab)

    def forward(self, x, prev_state):
        """Return ``(logits, new_state)`` for token ids ``x`` and LSTM state ``prev_state``."""
        hidden_seq, next_state = self.lstm(self.embedding(x), prev_state)
        return self.dense(hidden_seq), next_state

    def zero_state(self, batch_size):
        """Return an all-zero ``(h, c)`` pair, each of shape ``(1, batch_size, lstm_size)``."""
        shape = (1, batch_size, self.lstm_size)
        return torch.zeros(shape), torch.zeros(shape)


In [45]:
def get_loss_and_train_op(net, lr=0.001):
    """Return ``(criterion, optimizer)``: a cross-entropy loss and an Adam
    optimizer over ``net``'s parameters with learning rate ``lr``."""
    trainable = net.parameters()
    return nn.CrossEntropyLoss(), torch.optim.Adam(trainable, lr=lr)

In [46]:
def main():
    """Train the word-level LSTM on ``flags.train_file`` and save its weights.

    Runs 200 epochs over the batched corpus with Adam (lr=0.01). Within an
    epoch the LSTM state is carried from one batch to the next (detached so
    gradients do not flow across batches) and reset to zeros at the start of
    each epoch. Side effects:

    * every 1000 iterations: prints epoch, iteration and loss;
    * every 10000 iterations: prints a generated sample via ``predict``;
    * every 50000 iterations: saves ``model-<iteration>.pth``;
    * at the end: saves ``final-model.pth`` (a state_dict).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    iteration = 0
    epochs = 200
    for e in range(epochs):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)

        # Transfer the initial state to the training device
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1

            # predict() switches the net to eval mode, so re-enter training
            # mode on every step.
            net.train()

            # Reset all gradients
            optimizer.zero_grad()

            # Transfer data to the training device
            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            # CrossEntropyLoss expects the class dimension second:
            # (batch, seq, vocab) -> (batch, vocab, seq).
            loss = criterion(logits.transpose(1, 2), y)

            # Cut the graph so the next batch does not back-propagate into this one.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()

            # Perform back-propagation
            loss.backward()

            _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)

            # Update the network's parameters
            optimizer.step()

            if iteration % 1000 == 0:
                print('Epoch: {}/{}'.format(e, epochs),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 10000 == 0:
                # Pass a copy: predict may append the generated words to the
                # list it receives, which would otherwise make the shared seed
                # in `flags` grow with every sample.
                predict(device, net, list(flags.initial_words), n_vocab,
                        vocab_to_int, int_to_vocab, top_k=flags.predict_top_k)

            if iteration % 50000 == 0:
                torch.save(net.state_dict(),'model-{}.pth'.format(iteration))

    torch.save(net.state_dict(),'final-model.pth')

In [47]:
def _pick_from_top_k(logits, top_k):
    """Pick uniformly at random among the ``top_k`` highest-scoring ids in ``logits[0]``."""
    # Clamp so a top_k larger than the number of scores does not make topk fail.
    k = min(top_k, logits.shape[-1])
    _, top_ix = torch.topk(logits[0], k=k)
    return int(np.random.choice(top_ix.tolist()[0]))


def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    """Generate and print text continuing the seed ``words``.

    The seed words are fed through ``net`` one at a time, then 101 further
    words are sampled, each drawn uniformly from the ``top_k`` highest-scoring
    candidates and fed back as the next input. The result is printed as one
    space-joined line.

    Args:
        device: torch device the input tensors and state are moved to.
        net: model exposing ``eval()``, ``zero_state(batch_size)`` and
            ``net(ix, (h, c)) -> (logits, (h, c))``.
        words: non-empty sequence of seed words; it is not modified.
        n_vocab: vocabulary size (kept for interface compatibility; unused).
        vocab_to_int: word -> id mapping.
        int_to_vocab: id -> word mapping.
        top_k: size of the candidate pool for each sampled word.

    Returns:
        A new list holding the seed words followed by the generated words.

    Raises:
        ValueError: if ``words`` is empty.
        KeyError: if a seed word is missing from ``vocab_to_int``.
    """
    if not words:
        raise ValueError('predict() needs at least one seed word')

    # Work on a copy so the caller's seed list is left untouched.
    generated = list(words)

    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Prime the state with the seed words.
        for w in words:
            ix = torch.tensor([[vocab_to_int[w]]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

        choice = _pick_from_top_k(output, top_k)
        generated.append(int_to_vocab[choice])

        for _ in range(100):
            ix = torch.tensor([[choice]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

            choice = _pick_from_top_k(output, top_k)
            generated.append(int_to_vocab[choice])

    print(' '.join(generated))
    return generated

In [48]:
# Run the training loop defined in main(); its progress output is shown below.
main()

Vocabulary size 11729
Epoch: 6/200 Iteration: 1000 Loss: 4.091312408447266
Epoch: 12/200 Iteration: 2000 Loss: 3.096794843673706
Epoch: 19/200 Iteration: 3000 Loss: 2.4859397411346436
Epoch: 25/200 Iteration: 4000 Loss: 2.2266616821289062
Epoch: 32/200 Iteration: 5000 Loss: 2.08569073677063
Epoch: 38/200 Iteration: 6000 Loss: 1.8503631353378296
Epoch: 45/200 Iteration: 7000 Loss: 1.7102843523025513
Epoch: 51/200 Iteration: 8000 Loss: 1.6224339008331299
Epoch: 58/200 Iteration: 9000 Loss: 1.5227195024490356
Epoch: 64/200 Iteration: 10000 Loss: 1.4663926362991333
I am free on Sunday prob remember this morning, and talk for my comment ur thoughts It's just read it til 5/6 dumb Lol Omrahn please Screenshots Does Jalil Wdym? guys just going back for one to us, gross sides I genuinely thought u was telling the show but it's more accommodating already At eat Lol not even avg Why would I still play any game were watching this is my limits history that's waiting for deleting Brendan out now I'm

In [50]:
# NOTE: 'final-model.pth' is written with torch.save(net.state_dict(), ...), so
# this call returns the parameter mapping (an OrderedDict of tensors), not an
# RNNModule instance. To obtain a usable model, build an RNNModule with the
# same sizes and pass this mapping to its load_state_dict().
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
net = torch.load('final-model.pth')

OrderedDict([('embedding.weight',
              tensor([[ 0.7492, -0.0891,  0.8398,  ...,  1.8762, -1.0956, -0.6210],
                      [-0.4163, -0.5728, -0.8769,  ..., -3.3641,  1.3001, -1.9331],
                      [ 0.0447, -0.0503,  1.5316,  ..., -0.2947, -0.5491, -0.5677],
                      ...,
                      [-0.5741, -1.0428, -0.0182,  ..., -0.4536, -0.7862, -1.8054],
                      [-0.1431,  0.1501, -0.3431,  ...,  0.0532,  1.3245,  2.3071],
                      [ 0.0885,  0.8990, -1.2375,  ...,  0.2818,  0.0864, -0.3674]])),
             ('lstm.weight_ih_l0',
              tensor([[ 0.3429,  1.5110, -0.1029,  ..., -3.5115, -1.5791, -2.1300],
                      [ 1.0831, -3.1639,  0.0602,  ..., -0.7969, -1.3242,  1.2067],
                      [-1.1481, -0.9156,  0.8047,  ...,  0.1013, -0.3206,  1.4066],
                      ...,
                      [-2.6155, -0.1002,  0.4741,  ..., -3.3500,  1.4361,  0.6825],
                      [-0.3810,  0