<a href="https://colab.research.google.com/github/Martin-Sokolov-Sokolov/ESPMesh/blob/main/my.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import files
uploaded = files.upload()

Saving train.py to train.py


In [4]:
import torch

startChar = 'ш'
endChar = 'щ'
unkChar = 'ь'
padChar = 'ъ'

import random

corpusSplitString = ';)\n'
maxProgramLength = 10000
symbolCountThreshold = 100

def splitSentCorpus(fullSentCorpus, testFraction = 0.1):
    random.seed(42)
    random.shuffle(fullSentCorpus)
    testCount = int(len(fullSentCorpus) * testFraction)
    testSentCorpus = fullSentCorpus[:testCount]
    trainSentCorpus = fullSentCorpus[testCount:]
    return testSentCorpus, trainSentCorpus

def getAlphabet(corpus):
    symbols={}
    for s in corpus:
        for c in s:
            if c in symbols: symbols[c] += 1
            else: symbols[c]=1
    return symbols

def prepareData(corpusFileName, startChar, endChar, unkChar, padChar):
    file = open(corpusFileName,'r', encoding="utf")
    poems = file.read().split(corpusSplitString)
    symbols = getAlphabet(poems)
    
    assert startChar not in symbols and endChar not in symbols and unkChar not in symbols and padChar not in symbols
    charset = [startChar,endChar,unkChar,padChar] + [c for c in sorted(symbols) if symbols[c] > symbolCountThreshold]
    char2id = { c:i for i,c in enumerate(charset)}
    
    corpus = []
    for i,s in enumerate(poems):
        if len(s) > 0:
            corpus.append( [startChar] + [ s[i] for i in range(min(len(s),maxProgramLength)) ] + [endChar] )

    testCorpus, trainCorpus  = splitSentCorpus(corpus, testFraction = 0.01)
    print('Corpus loading completed.')
    return testCorpus, trainCorpus, char2id

# New Section

In [136]:
#############################################################################
### Търсене и извличане на информация. Приложение на дълбоко машинно обучение
### Стоян Михов
### Зимен семестър 2020/2021
#############################################################################
###
### Домашно задание 3
###
#############################################################################

import torch

#################################################################
####  LSTM с пакетиране на партида
#################################################################

class LSTMLanguageModelPack(torch.nn.Module):
    def preparePaddedBatch(self, source):
        device = next(self.parameters()).device
        m = max(len(s) for s in source)
        sents = [[self.word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device))
    
    def save(self,fileName):
        torch.save(self.state_dict(), fileName)
        
    def load(self,fileName):
        self.load_state_dict(torch.load(fileName))

    def __init__(self, embed_size, hidden_size, word2ind, unkToken, padToken, endToken, lstm_layers, dropout):
        super(LSTMLanguageModelPack, self).__init__()
        #############################################################################
        ###  Тук следва да се имплементира инициализацията на обекта
        ###  За целта може да копирате съответния метод от програмата за упр. 13
        ###  като направите добавки за повече слоеве на РНН и dropout
        #############################################################################
        #### Начало на Вашия код.

        self.word2ind = word2ind
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.endTokenIdx = word2ind[endToken]
        
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, num_layers=lstm_layers)
        self.dropout = torch.nn.Dropout(p=dropout, inplace=False)
        self.embed = torch.nn.Embedding(len(word2ind), embed_size)
        self.projection = torch.nn.Linear(hidden_size,len(word2ind))

        #### Край на Вашия код
        #############################################################################

    def forward(self, source):
        #############################################################################
        ###  Тук следва да се имплементира forward метода на обекта
        ###  За целта може да копирате съответния метод от програмата за упр. 13
        ###  като направите добавка за dropout
        #############################################################################
        #### Начало на Вашия код.

        X = self.preparePaddedBatch(source) # (w,s)
        E = self.embed(X)
        source_lengths = [len(s) for s in source]
        outputPacked, _ = self.lstm(torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths,enforce_sorted=False))
        output,_ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)
        D = self.dropout(output)
        Z = self.projection(D.flatten(0, 1)) # (w*s,h)
        return Z

        #### Край на Вашия код
        #############################################################################


In [7]:
import pickle
from parameters import *


testCorpus, trainCorpus, char2id =  prepareData('corpusFunctions.txt', startChar, endChar, unkChar, padChar)
pickle.dump(testCorpus, open(testDataFileName, 'wb'))
pickle.dump(trainCorpus, open(trainDataFileName, 'wb'))
pickle.dump(char2id, open(char2idFileName, 'wb'))
print('Data prepared.')

Corpus loading completed.
Data prepared.


In [191]:
import numpy as np

def generateCode(model, char2id, startSentence, limit=1000, temperature=1.):
    # model е инстанция на обучен LSTMLanguageModelPack обект
    # char2id е речник за символите, връщащ съответните индекси
    # startSentence е началния низ стартиращ със символа за начало '{'
    # limit е горна граница за дължината на поемата
    # temperature е температурата за промяна на разпределението за следващ символ
    
    result = startSentence[0:]
    softmax = torch.nn.Softmax()

    #############################################################################
    ###  Тук следва да се имплементира генерацията на текста
    #############################################################################
    #### Начало на Вашия код.

    #rnd = np.random.choice(len(result), size=1)

    result = softmax(model.forward(result))

    sum = 0
    for i in range(len(result)):
      for j in range(107):
        sum += torch.Tensor.item(result[i][j])

    print(sum)
    print(torch.Tensor.size(result))

    #result = data[rnd]

    #result = [char2id[i] for i in result]
    
    #### Край на Вашия код
    #############################################################################

    return result


In [14]:
import train

testCorpus = pickle.load(open(testDataFileName, 'rb'))
char2id = pickle.load(open(char2idFileName, 'rb'))
print('Model perplexity: ',train.perplexity(lm, testCorpus, batchSize))

Model perplexity:  4.40980754541837


In [148]:
char2id = pickle.load(open(char2idFileName, 'rb'))
lm = LSTMLanguageModelPack(char_emb_size, hid_size, char2id, unkChar, padChar, endChar, lstm_layers=lstm_layers, dropout=dropout).to(device)
lm.load(modelFileName)

In [192]:
generateCode(lm, char2id, startChar, 1000, 1.0)

1.000000039105089
torch.Size([1, 107])




tensor([[1.1802e-05, 3.0190e-05, 1.2916e-03, 8.4763e-06, 5.5835e-06, 8.3459e-05,
         5.5392e-04, 8.0156e-06, 4.0735e-04, 3.8413e-04, 1.0797e-04, 1.9476e-04,
         7.7275e-05, 1.2647e-03, 2.9246e-03, 1.5336e-05, 2.5417e-05, 7.8793e-05,
         6.0266e-05, 2.4226e-03, 6.5597e-04, 1.8432e-04, 4.4040e-04, 1.5358e-04,
         1.1846e-04, 9.9550e-05, 6.6051e-05, 9.4099e-05, 6.0298e-05, 3.2746e-05,
         2.9066e-05, 4.9850e-05, 8.5693e-03, 3.7776e-05, 4.5627e-05, 1.0682e-04,
         2.4605e-04, 3.0884e-05, 6.4833e-04, 3.6123e-02, 6.6504e-03, 1.1244e-01,
         3.6679e-02, 1.2075e-02, 6.4187e-02, 6.0786e-02, 7.9161e-03, 1.9561e-02,
         1.9173e-03, 1.9583e-03, 1.9500e-02, 1.7521e-02, 1.1916e-02, 3.3685e-03,
         2.9232e-02, 1.3378e-03, 2.8008e-01, 6.3184e-02, 1.1889e-01, 1.9421e-02,
         8.1673e-03, 1.6678e-02, 2.3660e-03, 2.2726e-03, 7.1750e-04, 2.6895e-04,
         3.6840e-04, 5.7256e-04, 8.9862e-05, 3.3731e-04, 6.0967e-04, 4.4285e-03,
         2.5636e-04, 1.1766e