In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace


flags = Namespace(
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    predict_top_k=5,
    checkpoint_path='checkpoint',
)

In [0]:
import re

def process_to_tokens(text):
    text = (' '.join(text.split(r'\n'))).lower()
    
    while True:
        new_s = re.sub("\\.\\.+", ".", text)
        if new_s == text:
            break
        text = new_s
    print(text)
    word = ''
    number = ''
    
    tokens = []
    
    is_whitespace = re.compile('\s')
    for sym in text:
        if sym.isalpha():
            if number:
                tokens.append(number)
                number = ''
            word += sym
            continue
        if sym.isnumeric():
            if word:
                tokens.append(word)
                word = ''
            number += sym
            continue
        if number:
                tokens.append(number)
        if word:
                tokens.append(word)
        number, word = '', ''
        if is_whitespace.match(sym) or sym == r'\\' or sym == "'":
            continue
        tokens.append(sym)
    if number:
                tokens.append(number)
    if word:
            tokens.append(word)
    return tokens

In [0]:
process_to_tokens("titlestart Hi!! My name is ANNa)\n titleend I don't like you... postend")

titlestart hi!! my name is anna)
 titleend i don't like you. postend


['titlestart',
 'hi',
 '!',
 '!',
 'my',
 'name',
 'is',
 'anna',
 ')',
 'titleend',
 'i',
 'don',
 't',
 'like',
 'you',
 '.',
 'postend']

In [0]:
def get_data_from_text(text, batch_size, seq_size):
    text = process_to_tokens(text)

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [0]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [0]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))

In [0]:
class RNNBi(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNBi, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True,
                            bidirectional=True)
        self.dense = nn.Linear(lstm_size, n_vocab)
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import json

posts = []

with open('/content/drive/My Drive/DL project/data_jokes_clean.json', 'r') as fin:
  posts = json.load(fin)['posts']

In [0]:
len(posts)

83359

In [0]:
text = ""

for post in posts:
    if len(post['text']) < 100:
        text += " titlestart " + post['title'] + " titleend " + post['text'] + " postend "

In [0]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_text(
    text, flags.batch_size, flags.seq_size)
print(device)
net = RNNBi(n_vocab, flags.seq_size,
                flags.embedding_size, flags.lstm_size)
net = net.to(device)

criterion, optimizer = get_loss_and_train_op(net, 0.01)

iteration = 0

for e in range(50):
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)
        
    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1
        net.train()
        optimizer.zero_grad()
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        loss.backward()

        _ = torch.nn.utils.clip_grad_norm_(
            net.parameters(), flags.gradients_norm)

        optimizer.step()
        if iteration % 100 == 0:
            print('Epoch: {}/{}'.format(e, 200),
                  'Iteration: {}'.format(iteration),
                  'Loss: {}'.format(loss_value))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Vocabulary size 27867
cuda


RuntimeError: ignored

In [0]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        next_word = int_to_vocab[choice]
        words.append(next_word)
        if next_word == "postend":
            break

    print(' '.join(words))

In [0]:
for _ in range(10):
  predict(device, net, ['titlestart'], n_vocab, vocab_to_int, int_to_vocab)

titlestart two men have two fives is chirp , but it s been posted to do you , but it was a fan o paw ? postend
titlestart two cannibals standing in to eat him , and says , " hey to go on the inside ! " i ll tell me . i don ’ saved . i ll let her go on me as i m sure how do i eat your hole . titleend " but the other is it " fuck " postend
titlestart i don t have the last night . titleend but a man is in the title postend
titlestart two cannibals standing ear ? titleend you kip . " - 1 . " postend
titlestart the thing i was in the end of his ex and the first time . they never get a fangover on pinocchio s face titleend " it s a great personality ! titleend i guess it s okay . you can do it , but i think i don t want a good joke ? i have been suspected , i have no matter i don not sure about the time i can get the wrong thing all . it would never get a hard drive investment postend
titlestart i have the best news titleend because i was just a little chewie , then you can say that they re 

In [0]:
iteration = 0

for e in range(50):
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)
        
    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1
        net.train()
        optimizer.zero_grad()
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        loss.backward()

        _ = torch.nn.utils.clip_grad_norm_(
            net.parameters(), flags.gradients_norm)

        optimizer.step()
        if iteration % 100 == 0:
            print('Epoch: {}/{}'.format(e, 200),
                  'Iteration: {}'.format(iteration),
                  'Loss: {}'.format(loss_value))

Epoch: 0/200 Iteration: 100 Loss: 4.845123767852783
Epoch: 0/200 Iteration: 200 Loss: 4.6205902099609375


KeyboardInterrupt: ignored

In [0]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import random
import torch.optim as opt

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import torchtext
from torchtext.data.utils import get_tokenizer

In [0]:
bptt = 50

def generate_dataset(texts, word_to_id_dict, id_to_word_dict, next_id):
  result = []
  lang_model = spacy.load('en', disable=['tagger', 'parser', 'ner'])
  was = set()
  for i, sentence in enumerate(texts):
    sentence = sentence.lower().rstrip('\n')
    sentence = [tok.text for tok in lang_model.tokenizer(sentence) if not tok.text.isspace()]
    lemmzed = " ".join(sorted([wordnet_lemmatizer.lemmatize(x) for x in sentence]))
    if lemmzed in was:
      continue
    was.add(lemmzed)
    sentence = ["<sos>"] + sentence
    if len(sentence) > bptt or len(sentence) < 7:
      continue
    sentence += ['<pad>'] * max(0, bptt - len(sentence))
      
    for word in sentence:
      if not word in word_to_id_dict:
        word_to_id_dict[word] = next_id
        id_to_word_dict[next_id] = word
        next_id += 1
      
    sentence = [word_to_id_dict[word] for word in sentence]
    freq_list.update(sentence)
      
    sentence_segments = [np.array(sentence[i:i + bptt]) for i in range(len(sentence) - bptt + 1)]

    result.append(sentence_segments)
  return result, next_id