In [None]:
!pip install unidecode



In [None]:
import random
import string
import traceback

import pandas as pd
import torch
import torch.nn as nn
import unidecode
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Vocabulary: every character of string.printable; a character's position in this
# string is the integer id used for it by the model code below.
# Czech diacritics are not added because the review text is transliterated with unidecode.
all_characters = string.printable
n_characters = len(all_characters)
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive/ so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Location of the reviews CSV on the mounted Drive; the following cells use its 'review_text' column.
PATH_DATA = "/content/gdrive/MyDrive/reviews.csv"
df = pd.read_csv(PATH_DATA)

In [None]:
# Normalise the review text and keep only short reviews.
#   1. drop rows without text (unidecode cannot process NaN),
#   2. transliterate to ASCII, lower-case, and strip ASCII punctuation,
#   3. keep reviews of at most MAX_REVIEW_CHARS characters.
# `df` keeps the original row labels; `reviews` is the same data with a fresh 0..n-1 index.
MAX_REVIEW_CHARS = 50

# Built once instead of once per row.
_punctuation_table = str.maketrans('', '', string.punctuation)

df = df.dropna(subset=['review_text']).assign(
    review_text=lambda frame: frame['review_text'].astype(str).map(
        lambda text: unidecode.unidecode(text).lower().translate(_punctuation_table)
    )
)
df = df[df['review_text'].str.len() <= MAX_REVIEW_CHARS]

# reset_index returns a new frame, so no in-place mutation of a slice of `df` is needed.
reviews = df.reset_index(drop=True)

In [None]:
# Display the cleaned, length-filtered reviews (the original row labels are kept, so they have gaps).
df

Unnamed: 0,review_text
0,snad nejlepsi objektiv sveho druhu
1,kvalita zpracovani i uziti
2,rychlost osetreni
3,pevna svetelnost
4,kresba
...,...
40728,super zoom
40729,hodil by se vyklopny displej
40733,pomale a hlucne ostreni
40734,vyssi cena


In [None]:
class RNN(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> linear head.

    The model is driven one time step at a time; the caller carries the
    recurrent state between calls.

    Args:
        input_size: number of distinct input symbols (rows of the embedding table).
        hidden_size: width of the embedding vectors and of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of scores produced per time step.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # input_size is the size of the embedding dictionary,
        # hidden_size is the size of each embedding vector.
        self.embed = nn.Embedding(input_size, hidden_size)

        # First hidden_size: number of expected features in the LSTM input.
        # Second hidden_size: number of features in the hidden state h.
        # num_layers: number of recurrent layers (e.g. 2 stacks two LSTMs).
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, bidirectional=False)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the model by a single time step.

        Args:
            x: 1-D tensor of symbol indices, one per batch element.
            hidden: LSTM hidden state, as returned by `init_hidden` or a previous call.
            cell: LSTM cell state, as returned by `init_hidden` or a previous call.

        Returns:
            `(out, (hidden, cell))`, where `out` has one row of `output_size`
            scores per batch element and `(hidden, cell)` is the updated state.
        """
        out = self.embed(x)
        # The LSTM is batch_first, so add a sequence axis of length 1.
        out, (hidden, cell) = self.lstm(out.unsqueeze(1), (hidden, cell))
        # Drop the length-1 sequence axis again before the linear head.
        out = self.fc(out.reshape(out.shape[0], -1))

        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-filled `(hidden, cell)` states for `batch_size` sequences.

        The states are allocated with the device and dtype of the model's own
        parameters, so they always match the weights no matter where the
        model has been moved.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        cell = torch.zeros(shape, device=reference.device, dtype=reference.dtype)

        return hidden, cell

In [None]:
# Module-level hyperparameters.
# NOTE(review): nothing in the visible notebook reads these names; Generator.__init__
# sets its own attributes with different values (chunk_length=100, num_epochs=150000,
# print_every=50). Confirm whether this cell is still needed.
chunk_length = 50
num_epochs = 15000
batch_size = 1
print_every = 500
hidden_size = 512
num_layers = 4
lr = 0.003

In [None]:
class Generator():
    """Trains a character-level RNN on the review texts and samples new text from it.

    Relies on notebook-level names: `all_characters` / `n_characters`
    (the vocabulary), `reviews` (DataFrame with a 'review_text' column),
    `device`, `RNN` and `SummaryWriter`.
    """

    def __init__(self):
        self.chunk_length = 100    # number of characters fed to the model per training sample
        self.num_epochs = 150000   # number of optimisation steps (one random batch each)
        self.batch_size = 1
        self.print_every = 50      # print a generated sample every N steps
        self.hidden_size = 512
        self.num_layers = 4
        self.lr = 0.003

    def char_tensor(self, string):
        """Encode `string` as a 1-D long tensor of positions in `all_characters`.

        Raises:
            ValueError: if a character is not part of `all_characters`.
        """
        return torch.tensor([all_characters.index(ch) for ch in string], dtype=torch.long)

    def get_random_batch(self):
        """Build one training batch of randomly chosen reviews.

        Every batch row is an independently sampled review, right-padded with
        '<' to `chunk_length + 1` characters (longer texts are truncated).
        The input is the first `chunk_length` characters and the target is the
        same text shifted left by one character.

        Returns:
            `(text_input, text_target)`, both long tensors of shape
            `(batch_size, chunk_length)`.
        """
        width = self.chunk_length + 1
        text_input = torch.zeros(self.batch_size, self.chunk_length, dtype=torch.long)
        text_target = torch.zeros(self.batch_size, self.chunk_length, dtype=torch.long)

        for i in range(self.batch_size):
            # Draw a fresh review per row; a single draw would fill the batch with copies.
            row = random.randint(0, len(reviews) - 1)
            # '<' is the padding symbol; punctuation was stripped from the reviews,
            # so it cannot occur inside a real text.
            text_str = reviews.iloc[row]['review_text'][:width].ljust(width, '<')
            encoded = self.char_tensor(text_str)
            text_input[i, :] = encoded[:-1]
            text_target[i, :] = encoded[1:]

        return text_input, text_target

    def generate(self, initial_str="Ab", predict_len=100, temperature=0.85):
        """Sample `predict_len` characters that continue `initial_str`.

        Args:
            initial_str: non-empty seed text; all but its last character only
                warm up the recurrent state.
            predict_len: number of characters to sample.
            temperature: divisor applied to the scores before sampling; lower
                values make the output more conservative.

        Returns:
            `initial_str` followed by the sampled characters.

        Raises:
            ValueError: if `initial_str` is empty.
        """
        if not initial_str:
            raise ValueError("initial_str must contain at least one character")

        model_device = next(self.rnn.parameters()).device
        # Sampling always feeds one character at a time, i.e. a batch of one,
        # independently of the batch size used for training.
        hidden, cell = self.rnn.init_hidden(batch_size=1)
        initial_input = self.char_tensor(initial_str).to(model_device)
        predicted = initial_str

        # No gradients are needed while sampling.
        with torch.no_grad():
            for p in range(len(initial_str) - 1):
                _, (hidden, cell) = self.rnn(initial_input[p].view(1), hidden, cell)

            last_char = initial_input[-1]

            for _ in range(predict_len):
                output, (hidden, cell) = self.rnn(last_char.view(1), hidden, cell)
                # softmax gives the same sampling distribution as exp() of the
                # scaled scores, but cannot overflow for large scores.
                output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
                top_char = torch.multinomial(output_dist, 1)[0]
                predicted += all_characters[top_char.item()]
                # The sampled index is already the encoding of the sampled character.
                last_char = top_char

        return predicted

    def train(self):
        """Create a fresh model in `self.rnn` and train it for `num_epochs` steps.

        Each step draws one random batch, accumulates the cross-entropy over
        all `chunk_length` positions, and applies one Adam update. The mean
        per-character loss is logged to TensorBoard under "runs/names0", and a
        generated sample is printed every `print_every` steps.
        """
        self.rnn = RNN(n_characters, self.hidden_size, self.num_layers, n_characters).to(device)
        optimizer = torch.optim.Adam(self.rnn.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        writer = SummaryWriter("runs/names0")

        print("=> Starting training")

        try:
            for epoch in range(1, self.num_epochs + 1):
                inp, target = self.get_random_batch()
                inp = inp.to(device)
                target = target.to(device)
                hidden, cell = self.rnn.init_hidden(batch_size=self.batch_size)

                optimizer.zero_grad()
                loss = 0

                for c in range(self.chunk_length):
                    output, (hidden, cell) = self.rnn(inp[:, c], hidden, cell)
                    loss += criterion(output, target[:, c])

                loss.backward()
                optimizer.step()
                mean_loss = loss.item() / self.chunk_length

                if epoch % self.print_every == 0:
                    initial_letter = random.choice("abcdefghijklmnopqrstuvwxyz")
                    print(self.generate(initial_str=initial_letter))

                writer.add_scalar("Training loss", mean_loss, global_step=epoch)
        finally:
            # Flush pending events even when training is interrupted or fails.
            writer.close()

In [None]:
# Train the generator. Training is long-running, so it is expected to be stopped
# manually; `gen` stays available afterwards for sampling.
gen = Generator()
try:
    gen.train()
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to end the training loop early.
    print("=> Training interrupted by user")
except Exception:
    # Keep the notebook alive, but show the exception type and the full
    # traceback instead of only the bare message.
    traceback.print_exc()

=> Starting training
g<<<<<<<<<d<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
lnjraonlevrdn gzm7rlaeasoopothj<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
a<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
laiircjea auk opot ra<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
modohu udrza ren< llele<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
epotokane<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
yv nro rbepti ine tlevdltetyzkhlon malitaktauesse na kmostvresapkinhy tacina ty swtvoomaa mnhttycy<<<
zazity h sse heci 5norici vilohnakna<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
x<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
tec  ka ostely sdeji zreni kteu re o sti honi fes<<<<<<<<<<<<

KeyboardInterrupt: ignored