# N-gramski jezični model

In [2]:
import re

import nltk

In [21]:
# Toy corpus: a list of sentences, each already segmented into words
text = [
    ['a', 'b', 'c'],
    ['a', 'c', 'd', 'c', 'f'],
]

In [22]:
from nltk.util import bigrams, ngrams

# Bigrams of the first sentence. The result is discarded: a cell only
# displays the value of its last expression.
list(bigrams(text[0]))
# Trigrams of the first sentence (also not displayed).
list(ngrams(text[0], n = 3))
# Trigrams of the second sentence; this is the value shown as the cell output.
list(ngrams(text[1], n = 3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'f')]

In [23]:
from nltk.util import pad_sequence
# Pad the first sentence on both sides with explicit boundary markers:
# "<s>" on the left and "</s>" on the right. With n = 2 the output below
# shows one marker added per side.
list(
    pad_sequence(
        text[0],
        pad_left = True,
        left_pad_symbol = "<s>",
        pad_right = True,
        right_pad_symbol = "</s>",
        n = 2
    )
)

['<s>', 'a', 'b', 'c', '</s>']

In [24]:
from nltk.lm.preprocessing import pad_both_ends

# Pad the first sentence at both ends, then list the bigrams of the padded
# sequence, so the sentence-boundary markers take part in the bigrams.
list(
    bigrams(pad_both_ends(text[0], n = 2))
)

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [33]:
from nltk.lm.preprocessing import padded_everygram_pipeline # helper that pads the sentences and produces their n-grams

# Build the training n-grams and the vocabulary stream for order 2
train, vocab = padded_everygram_pipeline(2, text)

# Materialise only the n-grams of the FIRST sentence (index 0); the printed
# result contains both its unigrams and its bigrams.
train_data = list(list(train)[0])
# The vocabulary stream is one flat sequence of the padded words of all sentences.
vocab_data = list(vocab)

print(f'text: {text}')
print(f'train: {train_data}')
print(f'vocab: {vocab_data}')


text: [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'f']]
train: [('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]
vocab: ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'f', '</s>']


In [51]:
# Now build an n-gram model for a small example.
# NOTE: this rebinds `text`, replacing the two-sentence corpus defined in an
# earlier cell; re-running earlier cells afterwards will use this new value.
text = [['a', 'b', 'c'], ['b', 'c', 'd', 'c', 'e', 'f'],['a','f','e']]
# Bigrams on which the fitted model is evaluated at the end of the cell
test = [('a', 'b'), ('c', 'd')]

from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)

# define the model (order 2, i.e. bigrams)
from nltk.lm import MLE
lm = MLE(2)
# fit the model on the training n-grams and the vocabulary
lm.fit(train, vocab)

# The intermediate expressions below are evaluated but not displayed;
# a cell only shows the value of its last expression.

# how many times the word 'e' occurred
lm.counts['e']

# how many times the bigram 'a b' occurred
lm.counts[['a']]['b']

# probability of the word 'a'
lm.score('a')

# probability of 'b' at the start of a sentence (given the context '<s>')
lm.score('b', ['<s>'])

# perplexity of the model on the test bigrams (the value displayed by the cell)
lm.perplexity(test)

2.449489742783178

# N-gramski model za „Patnje mladog Werthera”

In [107]:
from nltk.tokenize import sent_tokenize
import re
import random

def prepare(corpora, split = 0.7):
    """
    funkcija učitava tekst i priprema za definiranje modela
    """
    # čitanje iz datoteke
    dataset = open(corpora, 'r', encoding='utf8').read()
    # izbacimo gluposti
    dataset = re.sub('\n+', ' ', dataset)
        
    # rečenice iz teksta
    sents = sent_tokenize(dataset)
    random.shuffle(sents)
    
    # podjela
    index = int(split * len(sents))
    trainset, testset = sents[:index], sents[index:]
    
    with open('/home/isiljic/Desktop/CL/V05/data/train.txt', 'w', encoding='utf8') as train:
        for sent in trainset:
            train.write(sent)
            train.write('\n')
    
    with open('/home/isiljic/Desktop/CL/V05/data/test.txt', 'w', encoding='utf8') as test:
        for sent in testset:
            test.write(sent)
            test.write('\n')


In [108]:
# Split the Werther text into train/test sentence files (default split = 0.7)
prepare('/home/isiljic/Desktop/CL/V05/data/werther.txt')

1708 733


In [109]:
# load the training file as an NLTK corpus
from nltk.corpus import PlaintextCorpusReader

# corpus reader over train.txt in the data directory
werther_train = PlaintextCorpusReader('/home/isiljic/Desktop/CL/V05/data/', 'train.txt')

# sentences of the corpus, each as a list of tokens
werther_train.sents()

[['Da', ',', 'imaš', 'pravo', ':', 'bilo', 'bi', 'mi', 'bolje', 'da', 'odem', '.'], ['Pogledao', 'sam', 'u', 'Lotu', 'i', 'osjetio', 'sve', 'što', 'je', 'ona', 'meni', '.'], ...]