In [35]:
import pandas as pd
import json
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [41]:
BOS, EOS = ' ', '\n'
PAD = "<PAD>"
with open('lyrics.json', 'r') as file:
    text = str(json.load(file))

couplets = re.findall(r'Verse.*?(?=Chorus|Verse|\Z)', text, re.DOTALL)
choruses = re.findall(r'Chorus.*?(?=Verse|Chorus|\Z)', text, re.DOTALL)

cleaned_couplets = [re.sub(r'\[.*?\]', '', couplet).strip() for couplet in couplets]
cleaned_choruses = [re.sub(r'\[.*?\]', '', chorus).strip() for chorus in choruses]

lines = ' '.join(cleaned_couplets + cleaned_choruses)

In [42]:
tokens = set(" ".join(lines))
tokens = sorted(tokens)
n_tokens = len(tokens)
print ('n_tokens = ',n_tokens)

n_tokens =  134


In [43]:
char_to_int = {symbol: index for index, symbol in enumerate(tokens, start=1)}
char_to_int[PAD] = 0


n_tokens = len(char_to_int)
n_tokens

135

In [44]:
class SymbolDataset(Dataset):

    def init(self, data, token_to_ids):
        self.data = data
        self.token_to_ids = token_to_ids

    def len(self):
        return len(self.data)

    def getitem(self, index):
        symbols = [self.token_to_ids[symbol] for symbol in self.data[index]]
        symbols.append(self.token_to_ids['EOS'])
        return symbols

In [46]:
train_lines, dev_lines = train_test_split(lines, test_size=0.25, random_state=42)
train_dataset = SymbolDataset(train_lines, char_to_int)
train_loader = DataLoader(train_dataset, batch_size=64)

In [47]:
def collation(batch, pad=char_to_int[PAD], dtype=np.int64):
    max_len = max(map(len, batch))
    lines_ix = np.full([len(batch), max_len], pad, dtype=dtype)
    for i in range(len(batch)):
        lines_ix[i, :len(batch[i])] = batch[i]
    return torch.from_numpy(lines_ix)

In [48]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collation)
test_dataset = SymbolDataset(dev_lines, char_to_int)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collation)

In [30]:
class LSTM(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, dropout=0.5):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout, num_layers=2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)

    def forward(self, s):
        embedded = self.embeddings(s.t())
        lstm_out, _ = self.lstm(embedded)
        ht = lstm_out[-1]
        out = self.fc(ht)
        return out

In [18]:
PAD = "<PAD>"
char_to_int[PAD]=0
class Window(nn.Module):
    def __init__(self, n_tokens=90, emb_size=32, hid_size=128):
        super().__init__()
        PAD = "<PAD>"
        stride = 1
        kernel_size = 5
        num_leading_zeros = (kernel_size - 1) * stride
        self.embedding = nn.Embedding(n_tokens, emb_size,  padding_idx=char_to_int[PAD])
        self.padding = nn.ZeroPad2d((num_leading_zeros, 0, 0, 0))
        self.conv = nn.Conv1d(emb_size, hid_size, kernel_size=kernel_size, stride=stride)
        self.lr = nn.Linear(hid_size, n_tokens)

    def __call__(self, input_ix):
        emb = self.embedding(input_ix)
        emb = emb.permute(0, 2, 1)
        padded = self.padding(emb)
        conved = self.conv(padded)
        relu = F.relu(conved)
        relu = relu.permute(0, 2, 1)
        out = self.lr(relu)
        return out

In [31]:
model = LSTM(140,256,256)
model.cuda()
model2 = Window()
model2.cuda()

Window(
  (embedding): Embedding(90, 32, padding_idx=0)
  (padding): ZeroPad2d((4, 0, 0, 0))
  (conv): Conv1d(32, 128, kernel_size=(5,), stride=(1,))
  (lr): Linear(in_features=128, out_features=90, bias=True)
)

In [29]:
train_history =[]
dev_history = []

n_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(n_epochs):
    model.train()
    loss_fn = torch.nn.CrossEntropyLoss()
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        y_pred = model(x_batch)

        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    print('Epoch',epoch + 1, "Loss",avg_loss)

    total_batches = 0.0
    test_loss = 0.0
    test_perplexity = 0.0
    model.eval()
    for i, (x_batch, y_batch) in enumerate(test_loader):
        with torch.no_grad():
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()
            total_batches += 1

    average_loss = test_loss / total_batches
    perplexity = torch.exp(torch.tensor(average_loss))
    print(f'Epoch {epoch + 1}, Perplexity: {perplexity.item()}')

Epoch 1 Loss 0.9409210691979851
Epoch 1, Perplexity: 3.2265830039978027
Epoch 2 Loss 0.8923759344333851
Epoch 2, Perplexity: 3.1833624839782715
Epoch 3 Loss 0.8491629263697136
Epoch 3, Perplexity: 3.1415321826934814
Epoch 4 Loss 0.8114479335678462
Epoch 4, Perplexity: 3.104315996170044
Epoch 5 Loss 0.7787136187062843
Epoch 5, Perplexity: 3.1114096641540527
Epoch 6 Loss 0.7478762851833483
Epoch 6, Perplexity: 3.0973474979400635
Epoch 7 Loss 0.7218480091343985
Epoch 7, Perplexity: 3.114409923553467
Epoch 8 Loss 0.7025173436186316
Epoch 8, Perplexity: 3.1059584617614746
Epoch 9 Loss 0.6780534099421692
Epoch 9, Perplexity: 3.11103892326355
Epoch 10 Loss 0.6628822844202928
Epoch 10, Perplexity: 3.0989725589752197


In [32]:

train_history =[]
dev_history = []
n_epochs = 10
optimizer = torch.optim.Adam(model2.parameters(), lr=1e-3)
for epoch in range(n_epochs):
    model2.train()
    loss_fn = torch.nn.CrossEntropyLoss()
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        out = model2(x_batch)
        out = out.view(out.shape[0] * out.shape[1], -1)
        y = y_batch.flatten()
        loss_i = F.cross_entropy(out, y, ignore_index=char_to_int[PAD])
        optimizer.zero_grad()
        loss_i.backward()
        optimizer.step()

        avg_loss += loss.item() / len(train_loader)
    print('Epoch',epoch + 1, "Loss",avg_loss)

    total_batches = 0.0
    test_loss = 0.0
    test_perplexity = 0.0
    model2.eval()
    for i, (x_batch, y_batch) in enumerate(test_loader):
        with torch.no_grad():
            out = model2(x_batch)
            out = out.view(out.shape[0] * out.shape[1], -1)
            y = y_batch.flatten()
            loss_i = F.cross_entropy(out, y, ignore_index=char_to_int[PAD])
            dev_history.append(((epoch + 1) * len(train_loader) + i, loss_i.item()))
            test_loss += loss_i.item()
            total_batches += 1

    average_loss = test_loss / total_batches
    perplexity = torch.exp(torch.tensor(average_loss))
    print(f'Epoch {epoch + 1}, Perplexity: {perplexity.item()}')

Epoch 1 Loss 2.45899752261417
Epoch 1, Perplexity: 7.7531633377075195
Epoch 2 Loss 1.9761667397709637
Epoch 2, Perplexity: 5.886527061462402
Epoch 3 Loss 1.7589298354751832
Epoch 3, Perplexity: 5.001465320587158
Epoch 4 Loss 1.613346935064068
Epoch 4, Perplexity: 4.513491630554199
Epoch 5 Loss 1.5030794097827034
Epoch 5, Perplexity: 4.187100410461426
Epoch 6 Loss 1.4171446672523695
Epoch 6, Perplexity: 3.9566500186920166
Epoch 7 Loss 1.3435874333077489
Epoch 7, Perplexity: 3.7946159839630127
Epoch 8 Loss 1.2842630781256614
Epoch 8, Perplexity: 3.6342613697052
Epoch 9 Loss 1.2339607620701478
Epoch 9, Perplexity: 3.5452473163604736
Epoch 10 Loss 1.1877307793063776
Epoch 10, Perplexity: 3.4760284423828125


In [50]:
int_to_char = {i: symbol for symbol, i in char_to_int.items()}

In [52]:
def generate(model, prompt, max_len=10000, temperature=0.2):
    model.eval()
    token_ids = [char_to_int[symbol] for symbol in prompt]
    input_tensor = torch.LongTensor(token_ids).unsqueeze(0).cuda()
    generated_symbols = []

    criterion = torch.nn.CrossEntropyLoss()

    total_prob = 0.0
    num_tokens = 0

    while len(generated_symbols) < max_len:
        with torch.no_grad():
            out = model(input_tensor)[:, -1, :]
            if temperature == 0:
                max_v, ind = torch.max(out, axis=1)
                token = ind.cpu()[0].item()
            else:
                probs = F.softmax(out / temperature, dim=1)
                token = np.random.choice(list(range(len(char_to_int))), p=probs[0].cpu().numpy())
                ind = torch.LongTensor([token]).cuda()

            if token == char_to_int[EOS]:
                break

            loss = criterion(out, torch.LongTensor([token]).cuda())
            probs += loss.item()
            num_tokens += 1

            generated_symbols.append(int_to_char[token])
            input_tensor = torch.cat([input_tensor, ind.unsqueeze(0)], axis=1)

    perplexity = torch.exp(probs / num_tokens)
    generated_text = prompt + "".join(generated_symbols)

    return perplexity, generated_text

In [36]:
prompt = 'Все таки тогда ты была не'
perplexity, text = generate(model, prompt)
text

Все таки тогда ты была не подоконите
Это не ссомные глаза
И всё направився не закатые класты
От меня нашей силые любви
От всех не насилие, что мне нужно столос
Я растолком вино пеходи (Прощайствой
Выдыха-а-ай
Выдыха-а-ай
Выдыха-а-а-а
Выдыха-а-ай
Выдыха-а-ай
Выдыха-а-ай
Выдыха-а-ай
Выдыха-а-ай
Выдыха-а-ай
Выдыхаешь об асфал)
Всё это очень скучно?
На небо сточно навсегда
Не будем домой, как в этом со множем спроство
В этом не ляжем спроствя
Мы не устало на пороге
Чужие для меня
Эта остался всё закроется планеты
Что меня рядом нету
Не пятаю алкоголь красивая
И открые слазки всегда
Ты убиваешь меня свои глаза
Не смей мешать от темперенось в это небо скучно
Но ты снова до твоих глазах
Твои глаза, словно петей, как всегда, как всегда, в этом нету

Каждый новый квартире весной вины
Не знаю, что такое себе в мире красота
В моей квартире весна, медленно сходим с ума
Ты ушила и твои слова — ведь наплетам

Две всё же твои слова всё рассказать
Все от скроешь меня не закатывала всё из-за тебя
Это о

In [53]:
perplexity

3.358577251434318

In [38]:
perplexity2, text2 = generate(model2, prompt, 0.1)
text2

Все таки тогда ты была не вина
Когда же меня не подал (Я убежать
И снова бесконечный раз
А я хочу тебя колно запрошло
Как же меня зацепило
Не виноваты планеты
Что меня рядом нету

Я люблю и ненавижу тебя
Sk8er boi
(udbye, my love
Goodbye, my love
Goodbye
Gou’re bvens
I hate you sk8er boi
I hate you paking
I stupst this love
Goodbye, that you sk8er boi
I hate you sk8er boi
I hate you sk8er boi
Coke kryptonite
I want you sk8er boi
And this like kryptonite, that in this Eplone
And thap there you there, and I hate this
Goken there this
Gotery puts
And thrould sould spin all of this was a dream (Slowly taking an inhale again)
If only all of my wounds an whore
It's jut sk8er boi
I want you sk8er boi
I hate you love (Fake love
Goodbye, my love), 
I’m (Sk8er boi
(kake love), lave in. That I dont this was a dream
If only all of chould I pant this was a dream (Taking an inhale again)

You’t a dream (Taking an. I don't this was a dream (Taking an inhale again)
If only all of this was a dream, my 

In [54]:
perplexity2

5.45659725149364