In [1]:
import gc
import random
import re
import sqlite3
import urllib
import urllib.request
import zipfile
from collections import Counter
from itertools import chain

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Задание 1

In [None]:
# Open an HTTP response for the Nietzsche corpus; it is consumed line by line below.
data = urllib.request.urlopen('https://s3.amazonaws.com/text-datasets/nietzsche.txt')

In [None]:
# Decode each line of the response as UTF-8, lower-case it and concatenate everything.
raw_text = "".join(chunk.decode('utf-8').lower() for chunk in data)

In [None]:
# Report corpus length in characters, the NLTK sentence count and the number of distinct characters.
print(f'corpus len = {len(raw_text)}\nsentence count = {len(nltk.sent_tokenize(raw_text))}\nchars used = {len(set(raw_text))}')

corpus len = 600893
sentence count = 2864
chars used = 57


# Задание 2

In [None]:
# Character-level training pairs: every window of `seq_len` characters (taken with
# a stride of 2) is the input, the character right after the window is the target.
seq_len = 40

# Vocabulary: each distinct character gets the index of its position in sorted order.
chars = sorted(set(raw_text))
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}

n_chars, n_vocab = len(raw_text), len(char_to_int)

dataX, dataY = [], []
for i in range(0, n_chars - seq_len, 2):
    seq_in, seq_out = raw_text[i: i + seq_len], raw_text[i + seq_len]
    dataX.append(list(map(char_to_int.__getitem__, seq_in)))
    dataY.append([char_to_int[seq_out]])

print(len(dataX))

300427


# Задание 3

## Вспомогательные функции

In [None]:
# Dataset construction

class Dataset(torch.utils.data.Dataset):
    """Word-level next-token dataset built from the Nietzsche corpus.

    Item ``i`` is a pair of index tensors of length ``seq_len``: the window
    that starts at word ``i`` and the same window shifted one word to the right.
    """

    # Corpus location used when no ``url`` is passed to the constructor.
    CORPUS_URL = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'

    def __init__(self, seq_len, url=None):
        """Download the corpus, tokenize it and build the vocabulary.

        Args:
            seq_len: number of tokens in every input/target window.
            url: optional corpus location; defaults to ``CORPUS_URL``.
        """
        self.seq_len = seq_len
        self.url = url or self.CORPUS_URL
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.word_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Fetch the corpus, lower-case it and return its NLTK word tokens."""
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(self.url) as response:
            raw_text = response.read().decode('utf-8').lower()
        return list(nltk.word_tokenize(raw_text))

    def get_uniq_words(self):
        """Return the distinct words ordered from most to least frequent."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Number of complete windows; 0 when the corpus is shorter than ``seq_len``."""
        return max(0, len(self.word_indexes) - self.seq_len)

    def __getitem__(self, index):
        """Return ``(input, target)`` index tensors for window ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f'window index out of range for dataset of size {size}')
        # One slice of seq_len + 1 tokens yields both the input and the shifted target.
        window = self.word_indexes[index:index + self.seq_len + 1]
        return torch.tensor(window[:-1]), torch.tensor(window[1:])


In [None]:
class Model(torch.nn.Module):
    """Language model: embedding -> stacked LSTM -> linear projection onto the vocabulary."""

    def __init__(self, dataset, lstm_size=128, embedding_dim=256, num_layers=3, dropout=0.2):
        """Build the layers.

        Args:
            dataset: object exposing ``seq_len`` and ``uniq_words``; the
                vocabulary size is ``len(dataset.uniq_words)``.
            lstm_size: hidden size of every LSTM layer.
            embedding_dim: size of the token embeddings.
            num_layers: number of stacked LSTM layers.
            dropout: dropout probability passed to ``nn.LSTM``.
        """
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.seq_len = dataset.seq_len

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=dropout,
            batch_first=True
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x):
        """Map token indices to per-position vocabulary logits.

        The LSTM is created with ``batch_first=True``, so ``x`` is expected as
        ``(batch, seq)`` and the result is ``(batch, seq, n_vocab)``. The LSTM
        always starts from its default (zero) state; the final state is dropped.
        """
        x = self.embedding(x)
        x, _ = self.lstm(x)
        return self.fc(x)

    def init_state(self, batch_size=None):
        """Return a zero ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: size of the middle (batch) dimension of the state.
                When omitted it falls back to ``self.seq_len``, which is what
                this method historically returned.

        Returns:
            Two tensors of shape ``(num_layers, batch_size, lstm_size)`` created
            on the same device and with the same dtype as the model weights, so
            they can be passed to ``self.lstm`` directly.
        """
        if batch_size is None:
            batch_size = self.seq_len
        shape = (self.num_layers, batch_size, self.lstm_size)
        reference = self.fc.weight
        return (reference.new_zeros(shape), reference.new_zeros(shape))



In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module mapping an input batch to logits.
        dataloader: iterable of ``(inputs, targets)`` tensor pairs.
        criterion: loss module called as ``criterion(logits, targets)``.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        Mean of the per-batch loss values, or ``nan`` when the dataloader
        yields no batches.
    """
    model.train()

    batch_losses = []
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        # Call the modules themselves rather than .forward(): only __call__
        # runs the hooks registered on a module.
        logits = model(inputs)
        # Swap dims 1 and 2 before the loss; assumes logits arrive as
        # (batch, seq, vocab) and the criterion wants the class dim second.
        loss = criterion(logits.transpose(1, 2), targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    if not batch_losses:
        # np.mean([]) would emit a RuntimeWarning; return nan explicitly instead.
        return float('nan')
    return np.mean(batch_losses)

In [None]:
def predict(dataset, model, text, device, next_words=20):
    """Extend ``text`` by sampling ``next_words`` words from the model.

    Args:
        dataset: object exposing ``word_to_index`` and ``index_to_word`` mappings.
        model: module returning logits indexed as ``[batch][position][vocab]``.
        text: prompt; it is split on single spaces and every piece must be in
            the vocabulary.
        device: device the input indices are moved to.
        next_words: how many words to append.

    Returns:
        The prompt words followed by the sampled words, joined with spaces.

    Raises:
        KeyError: if the prompt contains words missing from ``dataset.word_to_index``.
    """
    model.eval()

    words = text.split(' ')
    unknown = [w for w in words if w not in dataset.word_to_index]
    if unknown:
        raise KeyError(f'words not in vocabulary: {unknown}')

    # Sampling needs no gradients, whether or not the caller disabled them.
    with torch.no_grad():
        for i in range(next_words):
            # words[i:] keeps the context at the length of the original prompt:
            # one word is appended per step and the window start moves by one.
            x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]).to(device)
            y_pred = model(x)

            last_word_logits = y_pred[0][-1]
            # Softmax in float64 and renormalise: np.random.choice rejects
            # probabilities whose sum is not close enough to 1, which float32
            # rounding over a large vocabulary can trigger.
            p = torch.softmax(last_word_logits.double(), dim=0).cpu().numpy()
            p /= p.sum()
            word_index = np.random.choice(len(p), p=p)
            words.append(dataset.index_to_word[word_index])
    return " ".join(words)

## init

In [None]:
# Training hyper-parameters.
lr = 0.001
batch_size = 300
num_epochs = 30
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every RNG the notebook draws from (Python, NumPy, PyTorch) so that a
# re-run on the same setup repeats the shuffling, weight init and sampled text.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Release cached GPU memory left over from earlier cells; skipped on CPU.
if device == 'cuda':
    torch.cuda.empty_cache()

In [None]:
# Build the word-level dataset with windows of 8 tokens.
dataset = Dataset(8)

In [None]:
# Shuffled mini-batch iterator over the dataset; num_workers=0 loads in the main process.
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

In [None]:
# Instantiate the model for this dataset's vocabulary and move it to the selected device.
model = Model(dataset).to(device)

In [None]:
# Count the trainable parameters (those with requires_grad set).
print(f'кол-во параметров в модели = {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

кол-во параметров в модели = 4701444


In [None]:
# Check whether the model's first parameter tensor lives on a CUDA device.
print('is cuda? -',next(model.parameters()).is_cuda)

is cuda? - True


In [None]:
# Cross-entropy objective, optimised with Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## train & pred

In [None]:
# Train for num_epochs epochs, keeping the mean loss of each epoch and printing
# a sample continuation of a fixed prompt after every epoch.
text = 'the truth is'
train_loss = []
for epoch in range(num_epochs):
    epoch_train_loss = train_one_epoch(model, data_loader, criterion, optimizer, device)
    train_loss.append(epoch_train_loss)

    # Sampling is inference only, so gradient tracking is switched off.
    with torch.no_grad():
        print(f'epochN={epoch+1}, pred text = {predict(dataset, model, text, device)}')

# Hand cached GPU memory back once training is over.
if device == 'cuda':
    torch.cuda.empty_cache()

epochN=1, pred text = the truth is in by maturity satisfaction begins , more too himself would , estimations free in , an and good , truth
epochN=2, pred text = the truth is suffering to , that almost , according are breath already the =the so , necessary . my no of reproach
epochN=3, pred text = the truth is the whole ( himself that is was who without nothing , c'est-a-dire to such farcical the history of those ,
epochN=4, pred text = the truth is they its first pity -- if no utmost envy as in the purpose , now in matter , in the
epochN=5, pred text = the truth is always things with the invented ramification by moral acts in rank -- which packed instantly , '' '' '' are
epochN=6, pred text = the truth is thus eloquence fast to satisfactory disgust ) . as `` the else over learn to separates the objective individual ,
epochN=7, pred text = the truth is rarely blessed as a thing ? and he did one who , or already stare and self fascinatingly towards effaced
epochN=8, pred text = the truth

# Задание 4

In [2]:
# Install the Kaggle CLI used below to download the dataset (quiet mode).
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install -q kaggle

In [3]:
# Upload kaggle.json (the Kaggle API credentials) into the working directory.
# NOTE(review): `files` is not imported in the visible notebook — presumably
# google.colab.files; confirm. Clear this cell's output before sharing, because
# it echoes the uploaded credentials.
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [4]:
# Create the Kaggle config directory; -p makes the cell safe to re-run when it already exists.
! mkdir -p ~/.kaggle

In [5]:
# Copy the uploaded credentials file into the Kaggle config directory.
! cp kaggle.json ~/.kaggle/

In [6]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
# Download the wikibooks dataset archive with the Kaggle CLI.
!kaggle datasets download -d dhruvildave/wikibooks-dataset/

Downloading wikibooks-dataset.zip to /content
100% 1.82G/1.82G [00:19<00:00, 47.8MB/s]
100% 1.82G/1.82G [00:19<00:00, 99.4MB/s]


In [6]:
# Unpack the downloaded archive into the data directory. The handle is called
# `archive` so that the built-in zip() is not shadowed for the rest of the notebook.
archive_path = r'C:\Users\user\Desktop\dz\nlp\data\wikibooks-dataset.zip'
extract_dir = r'C:\Users\user\Desktop\dz\nlp\data'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(extract_dir)

## Вспомогательные функции

In [2]:
# Dataset construction

class Dataset(torch.utils.data.Dataset):
    """Word-level next-token dataset built from the titles in the ``ru`` table.

    Item ``i`` is a pair of index tensors of length ``seq_len``: the window
    that starts at word ``i`` and the same window shifted one word to the right.
    """

    # SQLite file used when no ``db_path`` is passed to the constructor.
    DB_PATH = r'C:\Users\user\Desktop\dz\nlp\data\wikibooks.sqlite'

    def __init__(self, seq_len, db_path=None):
        """Load the titles, tokenize them and build the vocabulary.

        Args:
            seq_len: number of tokens in every input/target window.
            db_path: optional path to the SQLite file; defaults to ``DB_PATH``.
        """
        self.seq_len = seq_len
        self.db_path = db_path or self.DB_PATH
        self.words = self.load_df()

        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.word_indexes = [self.word_to_index[w] for w in self.words]

    def load_df(self):
        """Return the NLTK word tokens of every lower-cased title, in table order."""
        conn = sqlite3.connect(self.db_path)
        try:
            # Only the title column is used, so only that column is fetched.
            rows = conn.execute("SELECT title FROM ru;").fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()

        # Rows whose title is NULL or not text cannot be tokenized and are skipped.
        titles = (title.lower() for (title,) in rows if isinstance(title, str))
        return list(chain.from_iterable(nltk.word_tokenize(title) for title in titles))

    def get_uniq_words(self):
        """Return the distinct words ordered from most to least frequent."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Number of complete windows; 0 when there are fewer words than ``seq_len``."""
        return max(0, len(self.word_indexes) - self.seq_len)

    def __getitem__(self, index):
        """Return ``(input, target)`` index tensors for window ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f'window index out of range for dataset of size {size}')
        # One slice of seq_len + 1 tokens yields both the input and the shifted target.
        window = self.word_indexes[index:index + self.seq_len + 1]
        return torch.tensor(window[:-1]), torch.tensor(window[1:])


In [3]:
class Model(torch.nn.Module):
    """Language model: token embedding, three stacked LSTM layers, linear output layer."""

    def __init__(self, dataset):
        """Size the layers from ``dataset`` (reads ``seq_len`` and ``uniq_words``)."""
        super().__init__()
        vocab_size = len(dataset.uniq_words)

        # Fixed architecture hyper-parameters.
        self.lstm_size, self.embedding_dim, self.num_layers = 128, 256, 3
        self.seq_len = dataset.seq_len

        # Layers are created in the order embedding -> lstm -> fc.
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.lstm_size,
            self.num_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every position of the index batch ``x``."""
        embedded = self.embedding(x)
        # The final (h, c) state returned by the LSTM is not used.
        hidden_seq, _ = self.lstm(embedded)
        return self.fc(hidden_seq)

In [4]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module mapping an input batch to logits.
        dataloader: iterable of ``(inputs, targets)`` tensor pairs.
        criterion: loss module called as ``criterion(logits, targets)``.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        Mean of the per-batch loss values, or ``nan`` when the dataloader
        yields no batches.
    """
    model.train()

    batch_losses = []
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        # Call the modules themselves rather than .forward(): only __call__
        # runs the hooks registered on a module.
        logits = model(inputs)
        # Swap dims 1 and 2 before the loss; assumes logits arrive as
        # (batch, seq, vocab) and the criterion wants the class dim second.
        loss = criterion(logits.transpose(1, 2), targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    if not batch_losses:
        # np.mean([]) would emit a RuntimeWarning; return nan explicitly instead.
        return float('nan')
    return np.mean(batch_losses)

In [5]:
def predict(dataset, model, text, device, next_words=14):
    """Extend ``text`` by sampling ``next_words`` words from the model.

    Args:
        dataset: object exposing ``word_to_index`` and ``index_to_word`` mappings.
        model: module returning logits indexed as ``[batch][position][vocab]``.
        text: prompt; it is split on single spaces and every piece must be in
            the vocabulary.
        device: device the input indices are moved to.
        next_words: how many words to append.

    Returns:
        The prompt words followed by the sampled words, joined with spaces.

    Raises:
        KeyError: if the prompt contains words missing from ``dataset.word_to_index``.
    """
    model.eval()

    words = text.split(' ')
    unknown = [w for w in words if w not in dataset.word_to_index]
    if unknown:
        raise KeyError(f'words not in vocabulary: {unknown}')

    # Sampling needs no gradients, whether or not the caller disabled them.
    with torch.no_grad():
        for i in range(next_words):
            # words[i:] keeps the context at the length of the original prompt:
            # one word is appended per step and the window start moves by one.
            x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]).to(device)
            y_pred = model(x)

            last_word_logits = y_pred[0][-1]
            # Softmax in float64 and renormalise: np.random.choice rejects
            # probabilities whose sum is not close enough to 1, which float32
            # rounding over a large vocabulary can trigger.
            p = torch.softmax(last_word_logits.double(), dim=0).cpu().numpy()
            p /= p.sum()
            word_index = np.random.choice(len(p), p=p)
            words.append(dataset.index_to_word[word_index])
    return " ".join(words)

## init

In [11]:
# Training hyper-parameters.
lr = 0.001
batch_size = 40
num_epochs = 30
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every RNG the notebook draws from (Python, NumPy, PyTorch) so that a
# re-run on the same setup repeats the shuffling, weight init and sampled text.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Release cached GPU memory left over from earlier cells; skipped on CPU.
if device == 'cuda':
    torch.cuda.empty_cache()

In [12]:
# Build the word-level dataset with windows of 13 tokens.
dataset = Dataset(13)

In [13]:
# Shuffled mini-batch iterator over the dataset; num_workers=0 loads in the main process.
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

In [14]:
# Instantiate the model for this dataset's vocabulary and move it to the selected device.
model = Model(dataset).to(device)

In [15]:
# Count the trainable parameters (those with requires_grad set).
print(f'кол-во параметров в модели = {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

кол-во параметров в модели = 5013294


In [16]:
# Check whether the model's first parameter tensor lives on a CUDA device.
print('is cuda? -',next(model.parameters()).is_cuda)

is cuda? - True


In [17]:
# Cross-entropy objective, optimised with Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
# Decode the input half of the first dataset item back into words as a sanity
# check; indices missing from the mapping become empty strings.
pred = " ".join(
    dataset.index_to_word.get(token_id, "")
    for token_id in dataset[0][0].tolist()
)

In [19]:
# Display the decoded first window.
pred

'викиучебник : техника и технология средств массовой информации/интернет/техника викиучебник : аон/пилотское свидетельство викиучебник'

In [20]:
# Display the model's layer summary.
model

Model(
  (embedding): Embedding(11822, 256)
  (lstm): LSTM(256, 128, num_layers=3, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=11822, bias=True)
)

## train & pred

In [21]:
# Train for num_epochs epochs, keeping the mean loss of each epoch and printing
# a sample continuation of a fixed (lower-cased) prompt after every epoch.
text = 'Викиучебник : Техника и'.lower()
train_loss = []
for epoch in range(num_epochs):
    epoch_train_loss = train_one_epoch(model, data_loader, criterion, optimizer, device)
    train_loss.append(epoch_train_loss)

    # Sampling is inference only, so gradient tracking is switched off.
    with torch.no_grad():
        print(f'epochN={epoch+1}, pred text = {predict(dataset, model, text, device)}')

# Hand cached GPU memory back once training is over.
if device == 'cuda':
    torch.cuda.empty_cache()

epochN=1, pred text = викиучебник : техника и справочник/kodak 1998 викиучебник : коктейли/смородиновая разработок викиучебник : коктейли/колавайцен викиучебник : техника викиучебник :
epochN=2, pred text = викиучебник : техника и примерах/управляющие многомерных . рекомендациям викиучебник : аон/пилотское диз викиучебник : интересное 24/стадии экспертиза по
epochN=3, pred text = викиучебник : техника и технология средств массовой информации/телевидение/технология викиучебник : коктейли/тайлер регулирование урок на социальные миндалин викиучебник :
epochN=4, pred text = викиучебник : техника и технология средств массовой информации/печатная школы/теорема комплекса викиучебник : русско-сербский психология/пролонгированные викиучебник : коктейли/рашин мохито
epochN=5, pred text = викиучебник : техника и технология средств массовой информации/радио/история радио викиучебник : коктейли/дейзи викиучебник : интеллектуальное право/примеры рунический 09.12.2016
epochN=6, pred text = викиучебни