In [1]:
import torch
import torch.nn as nn
import random
import numpy as np
import Tokens as tk
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = 1201

разбивка текста и создание словаря

In [2]:
import os

samples = []
for sample in os.listdir('samples'):
    with open("samples/" + sample, encoding="utf-8") as text:
        samples.append(text.read().lower())


class QAPair:
    def __init__(self, question, answer):
        self.question = question
        self.answer = answer

dataset = []

for sample in samples:
    lines = sample.splitlines()
    questions = lines[::2]
    answers = lines[1::2]
    for q, a in zip(questions, answers):
        dataset.append(QAPair(q, a))


In [3]:
tdg = tk.TokenDictionaryGenerator(vocabulary_size = VOCAB_SIZE-1)
tokens = tdg.generate_tokens(samples)
tokenizer = tk.Tokenizer(tokens)
tokens.save("tokens.json")

создание модели 
предложение -> hidden
последние слово + hidden -> слово(1)...слово(n)

In [79]:
class RnnTextGen(nn.Module):

    def __init__(self, vocab_size, input_size, hid_size, n_layers, dropout=0.2) -> None:
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hid_size
        self.Encoder = nn.Embedding(vocab_size, input_size)
        self.lstm = nn.LSTM(input_size, hid_size, n_layers, dropout = dropout)
        self.l1 = nn.Linear(hid_size, vocab_size+1)

    def forward(self, x, hidden=None):
        x = self.Encoder(x)
        x, hidden = self.lstm(x, hidden)
        x = x[-1]
        x = self.l1(x)
        return x, hidden
    
    def init_hidden(self, batch_size=1):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device),
               torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device))

обучение модели

In [75]:
def evaluate(model, tokenizer, Endt, text: str, prediction_lim: int = 15, device='cpu'):
    text_tensor = torch.LongTensor(list(tokenizer.tokenize(text))).to(device)
    hidden = model.init_hidden()
    predicted_text = ""
    Endt = Endt.to(device)
    for i in range(prediction_lim):
        next_w, hidden = model(text_tensor.to(device), hidden)
        text_tensor = torch.cat([text_tensor, next_w.argmax().view(-1)])
        if next_w.argmax() == Endt:
            break
        word = tokenizer.decode_token(int(next_w.argmax()))
        predicted_text += word
    return predicted_text

In [76]:
def train(epoches: int, model: nn.Module, device: str, tokenizer, dataset) -> None:
    """epoches - number of epoches through all dataset
    model - model required to teach
    batch_size - n/a"""
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=5,
        verbose=True,
        factor=0.5
    )

    def get_batch(dataset: list):
        for qa in dataset:
            question_idx = list(tokenizer.tokenize(qa.question))
            target = list(tokenizer.tokenize(qa.answer)) + [tokenizer.count_tokens()]
            test = question_idx+target[:-1]

            target = torch.LongTensor(target).to(device)
            test = torch.LongTensor(test).to(device)
            yield target, test

    loss_avg = []
    model.train()
    for epoch in range(epoches):
        for target, test in get_batch(dataset):

            hidden = model.init_hidden()
            for ttarget, ttrain in zip(target, test):
                output, hidden = model(ttrain, hidden)

                loss = loss_func(output, ttarget)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                loss_avg.append(loss.item())
            if len(loss_avg) >= 50:
                mean_loss = np.mean(loss_avg)
                print(f'Loss: {mean_loss}')
                scheduler.step(mean_loss)
                loss_avg = []


In [81]:
model = RnnTextGen(1200,1000,500,3).to(device)
train(12,model,device,tokenizer,dataset)

AssertionError: LSTM: Expected input to be 2-D or 3-D but received 1-D tensor