# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  

<img src="images/example.png">

<img src="images/example2.png">

Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [68]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, BPTTIterator
from torchtext.datasets import LanguageModelingDataset
from sklearn.model_selection import train_test_split


SEED = 42
np.random.seed(SEED)

In [37]:
df = pd.read_csv('bands.csv')
df.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


In [53]:
df['name'] = df['name'] + '\n'

In [54]:
text = df['name'].sum()

In [55]:
with open('text.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [50]:
TEXT = ReversibleField(use_vocab=True, 
             batch_first=True,
             tokenize=list,
             eos_token='<eos>',
             lower=True,
            )

In [58]:
text_1 = LanguageModelingDataset('text.txt',
                                             text_field=TEXT,
                                             newline_eos=True)

In [59]:
TEXT.build_vocab(text_1)
len(TEXT.vocab.itos)

298

In [60]:
TEXT.vocab.itos[:10]

[' UNK ', '<pad>', '<eos>', ' ', 'e', 'a', 'r', 'i', 'o', 't']

In [None]:
train, test = train_test_split(text_1, test_size=0.2, random_state=42)

### Как показала практика, на train и test лучше делить изначальную таблицу

In [138]:
df = pd.read_csv('bands.csv')
df.head()
df['name'] = df['name'] + '\n'
train, test = train_test_split(df, test_size=0.3, random_state=42)
train = train['name'].sum()
test = test['name'].sum()

with open('train.txt', 'w', encoding='utf-8') as f:
    f.write(train)
    
with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(test)

In [139]:
TEXT = ReversibleField(use_vocab=True, 
             batch_first=True,
             tokenize=list,
             eos_token='<eos>',
             lower=False,
            )

train, test = LanguageModelingDataset.splits(path='.',
                                             train='train.txt',
                                             test='test.txt',
                                             text_field=TEXT,
                                             newline_eos=True)

TEXT.build_vocab(train)
len(TEXT.vocab.itos)

322

In [140]:
TEXT.vocab.itos[:10]

[' UNK ', '<pad>', '<eos>', 'e', 'a', 'r', 'o', 'i', 'n', 't']

In [141]:
class MyModel(nn.Module):
    
        def __init__(self, vocab_size, embed_size, hidden_size):
            super(MyModel, self).__init__()
        
            self.encoder = nn.Embedding(vocab_size, hidden_size)
            self.rnn = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               bidirectional=True,
                               batch_first=True,
                              )
            self.fc = nn.Linear(hidden_size*2, vocab_size)
            self.hidden_size = hidden_size
            self.drop = tt.nn.Dropout()

        def forward(self, x, hidden):
            batch_size = x.size(0)
            total_length = x.size(-1)

            x = self.encoder(x)
            x = self.drop(x)

            x, hidden = self.rnn(x, hidden)

            x = self.drop(x)
            x = x.contiguous().view(batch_size * total_length, -1)
            x = self.fc(x)
            x = x.contiguous().view(batch_size, total_length, -1)
            return x, hidden

        def init_hidden(self, batch_size):
            return (tt.zeros(2, batch_size, self.hidden_size),
                tt.zeros(2, batch_size, self.hidden_size))

In [142]:
def perplexity(x):
    return 2**x

def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()
    running_loss = 0
    hidden = model.init_hidden(30)
    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        if batch.text.size(0) != 30:
                continue
        optimizer.zero_grad()
        hidden = (hidden[0].detach(), hidden[1].detach())
        pred, hidden = model(batch.text, hidden)
        pred_ = pred.view(-1, len(TEXT.vocab.itos))
        loss = criterion(pred_, batch.target.view(-1))
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    hidden = model.init_hidden(30)
    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            if batch.text.size(0) != 30:
                continue
            pred, hidden = model(batch.text, hidden)
            pred_ = pred.view(-1, len(TEXT.vocab.itos))
            loss = criterion(pred_, batch.target.view(-1))
            epoch_loss += loss.data.item()
            hidden = (hidden[0].detach(), hidden[1].detach())

    return perplexity(epoch_loss / n_batches)

In [143]:
def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_perp  = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        perplexity = _test_epoch(model, valid_iterator, criterion)
        perplexity = perplexity
        print('perplexity %.5f' % perplexity)

        record = {'epoch': epoch, 'train_loss': train_loss, 'perplexity': perplexity}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if perplexity > prev_perp:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.perplexity == history.perplexity.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['perplexity']))
                break

            prev_perp = min(prev_perp, perplexity)

    return model

In [144]:
batch_size = 32

model = MyModel(vocab_size=len(TEXT.vocab.itos),
                embed_size=128,
                hidden_size=128,
               )

train_iterator, test_iterator = BPTTIterator.splits(
    (train, test),
    bptt_len=30,
    batch_sizes=(batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

# padding does not count into loss
criterion = nn.CrossEntropyLoss()

In [145]:
my_model = nn_train(model, train_iterator, test_iterator,
                     criterion, optimizer, scheduler=scheduler, 
                     n_epochs=100, early_stopping=5)

perplexity 6.22797


perplexity 6.10721


perplexity 6.05275


perplexity 6.02128


perplexity 6.00120


perplexity 5.98604


perplexity 5.97619


perplexity 5.96920


perplexity 5.96363


perplexity 5.95877


perplexity 5.95721


perplexity 5.95555


perplexity 5.95356


perplexity 5.95188


perplexity 5.95007


perplexity 5.95045


perplexity 5.94916


perplexity 5.94889


perplexity 5.94760


perplexity 5.94756


perplexity 5.94865


perplexity 5.94946


perplexity 5.94889


perplexity 5.94974


perplexity 5.95079
Early stopping! best epoch: 19 val 5.94756


In [146]:
tt.save(my_model, 'model.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [155]:
def generate(model, prime_str='<eos>', max_len=10, temp=0.8):
    hidden = model.init_hidden(1)
    inp = tt.tensor(TEXT.vocab.itos.index(prime_str)).unsqueeze(0).unsqueeze(0).long()
    predicted = ''

    _, hidden = model(inp, hidden)
            
    for p in range(max_len):
        output, hidden = model(inp, hidden)
        output_ = output.data.view(-1).div(temp).exp()
        top = tt.multinomial(output_, 1)[0]

        predicted_char = TEXT.vocab.itos[top]       
        if predicted_char == '<eos>' or ' ':
            return predicted
        predicted += predicted_char
        inp = tt.tensor(TEXT.vocab.itos.index(predicted_char)).unsqueeze(0).unsqueeze(0).long()

    return predicted

In [154]:
for i in range(20):
    print(generate(my_model))

Den Techer
orsssty
Strriaring
Bagsere Sa
Med
Siovanis
Lut Norion
Caspis
Ce
Den
orcriatior
Scens
Thema
Exte Hacrd
Rende'sest
Respes
Fas Sad Pu
Surtrath I
Homoncign
Deshetepes


Я решила попробовать поизменять prime_str. Получилось 2 варианта: с пробелом и eos

In [156]:
for i in range(20):
    print(generate(my_model))

Rerarondes
Coury
Int
Obrialicre
Cons
Vaoleng o 
Fod
Coronf Ava
Alold
De S.
Cany
Karalacr
Ded
Mititicte 
Cerracise 
Inal
Destin Abe
Deroroneme
Fanth
Matherprbo
