# Text to N-gram

In [2]:
def n_gram(text, n_grams=1):
    """Split ``text`` on whitespace and return its word n-grams.

    Args:
        text: Input string; tokens are its whitespace-separated pieces.
        n_grams: Number of consecutive tokens per n-gram (must be >= 1).

    Returns:
        List of n-grams in order of appearance, each one being its tokens
        joined by a single space. Empty when the text has fewer than
        ``n_grams`` tokens.

    Raises:
        ValueError: If ``n_grams`` is smaller than 1.
    """
    if n_grams < 1:
        # A non-positive size would otherwise produce empty or wrongly sized slices.
        raise ValueError(f"n_grams must be >= 1, got {n_grams}")
    sequence = text.split()
    return [
        ' '.join(sequence[i:i + n_grams])
        for i in range(len(sequence) - n_grams + 1)
    ]

# Example: with n_grams=1 every whitespace-separated word is its own n-gram.
n_gram("I am an IA student.", 1)

['I', 'am', 'an', 'IA', 'student.']

### Train an n-gram model

In [3]:
!pip install nltk



In [4]:
import contextlib
import os
import re
import string
import nltk
from nltk.corpus import stopwords, gutenberg
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by this notebook.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data behind word_tokenize from
# 'punkt_tab' rather than from the pickled 'punkt' package.
nltk.download('punkt_tab')
nltk.download('gutenberg')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/matheus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [5]:
from enum import Enum

class SpecialToken(Enum):
    """Reserved marker tokens used by the n-gram model's tokenizer and vocabulary."""
    START_TOKEN = '<SOT>'  # prepended to a token sequence as left padding
    END_TOKEN = '<EOT>'  # appended after the last token of a text
    UNK_TOKEN = '<UNK>'  # seeded into the model vocabulary alongside the two markers above


In [6]:
# English stop-word list from NLTK, stored as a set.
# NOTE(review): not referenced anywhere else in the visible notebook.
stop_words_en = set(stopwords.words('english'))

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped

def normalize_text(text):
    """Lowercase ``text``, tokenize it with ``word_tokenize`` and rejoin the tokens.

    Returns:
        A single string in which the tokens are separated by one space each.
    """
    lowered = text.lower()
    return " ".join(word_tokenize(lowered))

# Load the raw text of the Macbeth file from the NLTK Gutenberg corpus.
fileid = 'shakespeare-macbeth.txt'
raw_text = gutenberg.raw(fileid)
# Show the same 100-character excerpt before and after normalization.
print("Raw Text:", raw_text[500:600])
normalized_text = normalize_text(raw_text[500:600])
print("Normalized Text:", normalized_text)

Raw Text: the fogge and filthie ayre.

Exeunt.


Scena Secunda.

Alarum within. Enter King Malcome, Donalbaine
Normalized Text: the fogge and filthie ayre . exeunt . scena secunda . alarum within . enter king malcome , donalbaine


In [13]:
from collections import Counter

class NGramModel:
    """Count-based n-gram language model with greedy next-token prediction.

    ``fit`` tokenizes a text, extends the vocabulary and counts every n-gram
    together with its (n-1)-token context. Predictions pick the vocabulary
    word with the highest relative frequency after the current context.
    """

    def __init__(self, n):
        """Create an untrained model.

        Args:
            n: Order of the model (number of tokens per n-gram), at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n!r}")
        self.n = n
        # The special tokens are always part of the vocabulary.
        self.vocab = {
            SpecialToken.UNK_TOKEN.value,
            SpecialToken.START_TOKEN.value,
            SpecialToken.END_TOKEN.value,
        }
        self.tokenized_corpus = []
        self.ngram_counts = Counter()    # n-gram tuple -> occurrences
        self.context_counts = Counter()  # (n-1)-token context tuple -> occurrences

    def tokenizer(self, text, padding=True):
        """Tokenize ``text`` and wrap it in start/end markers.

        Args:
            text: Text to split with ``word_tokenize``.
            padding: When true, ``n - 1`` start tokens are prepended (none for
                ``n == 1``); otherwise exactly one start token is prepended.

        Returns:
            List of tokens: start marker(s), the text's tokens, one end marker.
        """
        tokens = word_tokenize(text)
        start_tokens = [SpecialToken.START_TOKEN.value]
        if padding:
            start_tokens = start_tokens * (self.n - 1)
        return start_tokens + tokens + [SpecialToken.END_TOKEN.value]

    def create_vocab(self):
        """Add every token of the current tokenized corpus to the vocabulary."""
        self.vocab.update(self.tokenized_corpus)

    def compute_probabilities(self, candidates):
        """Estimate the relative frequency of each candidate n-gram.

        All candidates are assumed to share the context of the first one
        (every token but the last); only that context's count is looked up.

        Args:
            candidates: Iterable of token sequences, each ``context + [word]``.

        Returns:
            Dict mapping each candidate's last token to
            ``count(ngram) / count(context)``, or to 0 when the context was
            never observed. Empty dict when no candidates are given.
        """
        candidates = tuple(candidates)
        if not candidates:
            return {}

        context = tuple(candidates[0][:-1])
        context_count = self.context_counts[context]
        probabilities = {}
        for candidate in candidates:
            count = self.ngram_counts[tuple(candidate)]
            probabilities[candidate[-1]] = count / context_count if context_count > 0 else 0
        return probabilities

    def collect_ngram_counts(self):
        """Count every n-gram of the tokenized corpus and its context.

        Counts are added to the existing counters, so repeated calls accumulate.
        """
        for i in range(self.n - 1, len(self.tokenized_corpus)):
            ngram = tuple(self.tokenized_corpus[i - self.n + 1 : i + 1])
            context = ngram[:-1]
            self.ngram_counts[ngram] += 1
            self.context_counts[context] += 1

    def predict_next_token(self, token_sequence):
        """Return the most frequent token following the end of ``token_sequence``.

        Only the last ``n - 1`` tokens are used as context; shorter sequences
        are left-padded with start tokens.

        Args:
            token_sequence: Sequence (list or tuple) of already known tokens.

        Returns:
            The vocabulary word with the highest probability after the context.
            Ties are resolved in favour of the alphabetically smallest word so
            the result does not depend on set iteration order. When no word
            was ever observed after the context, the unknown token is returned.
        """
        start_context_index = max(0, len(token_sequence) - self.n + 1)
        # Copy into a list so tuples and other sequences can be concatenated below.
        context = list(token_sequence[start_context_index:])
        if len(context) < self.n - 1:
            pad = [SpecialToken.START_TOKEN.value] * (self.n - 1 - len(context))
            context = pad + context
        # Sorted iteration makes tie-breaking in max() deterministic.
        candidates = [context + [word] for word in sorted(self.vocab)]
        probabilities = self.compute_probabilities(candidates)
        best_token = max(probabilities, key=probabilities.get)
        if probabilities[best_token] <= 0:
            # Unseen context: every candidate scored 0, so there is no real prediction.
            return SpecialToken.UNK_TOKEN.value
        return best_token

    def generate_text(self, seed_text, max_length=20):
        """Greedily extend ``seed_text`` one token at a time.

        Args:
            seed_text: Text whose tokens start the generated sequence.
            max_length: Maximum number of tokens to append to the seed.

        Returns:
            The seed tokens plus the generated tokens, joined by single spaces.
            Generation stops early when the end token is predicted or when the
            model has no continuation for the current context (unknown token).
        """
        stop_tokens = (SpecialToken.END_TOKEN.value, SpecialToken.UNK_TOKEN.value)
        generated_tokens = list(word_tokenize(seed_text))
        for _ in range(max_length):
            next_token = self.predict_next_token(generated_tokens)
            if next_token in stop_tokens:
                break
            generated_tokens.append(next_token)
        return ' '.join(generated_tokens)

    def fit(self, text):
        """Train on ``text``: tokenize it, extend the vocabulary, count n-grams.

        The stored tokenized corpus is replaced, while vocabulary and counts
        are added to whatever earlier ``fit`` calls collected.
        """
        self.tokenized_corpus = self.tokenizer(text, padding=True)
        self.create_vocab()
        self.collect_ngram_counts()
        

In [15]:
# Fit a trigram model (n=3) on the normalized full text.
model = NGramModel(3)
normalized_text = normalize_text(raw_text)
model.fit(text=normalized_text)
# Predicted token after the two-word context "the tragedie".
print(model.predict_next_token(word_tokenize("the tragedie")))
# Continuation of the same seed, capped at 50 generated tokens.
model.generate_text("the tragedie", max_length=50)

of


'the tragedie of macbeth . macb . i , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead , and the dead'