# Text to N-gram

In [1]:
def n_gram(text, n_grams=1):
    """Split ``text`` on whitespace and return its contiguous word n-grams.

    Args:
        text: Input string; words are whatever ``str.split()`` yields.
        n_grams: Number of words in each n-gram.

    Returns:
        A list of space-joined n-grams in left-to-right order. The list is
        empty when the text has fewer than ``n_grams`` words.
    """
    words = text.split()
    # One window per valid starting position; a non-positive count yields no windows.
    window_count = len(words) - n_grams + 1
    return [' '.join(words[start:start + n_grams]) for start in range(window_count)]

# Unigram example: with n_grams=1 every whitespace-separated word is its own n-gram.
n_gram("I am an IA student.", 1)

['I', 'am', 'an', 'IA', 'student.']

### Train an n-gram model

In [2]:
!pip install nltk



In [3]:
import contextlib
import os
import re
import string
import nltk
from nltk.corpus import stopwords, gutenberg
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('gutenberg')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/matheus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [4]:
# English stop words held in a set, so the membership test in normalize_text is a hash lookup.
stop_words_en = set(stopwords.words('english'))

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table removes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def normalize_text(text):
    """Lower-case ``text`` and reduce it to space-separated alphabetic, non-stop-word tokens.

    Steps, in order: lower-casing, punctuation removal, deletion of every
    character that is neither an ASCII letter nor whitespace, whitespace
    collapsing, tokenization with ``word_tokenize``, and filtering against
    ``stop_words_en``.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = remove_punctuation(text.lower())
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    collapsed = " ".join(letters_only.split())

    kept_tokens = []
    for token in word_tokenize(collapsed):
        # Punctuation is stripped before this check, so stop words that contain
        # an apostrophe are compared in their stripped form.
        if token.isalpha() and token not in stop_words_en:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Read the raw text of the selected file from the NLTK Gutenberg corpus.
fileid = 'shakespeare-macbeth.txt'
raw_text = gutenberg.raw(fileid)
# Preview the same 100-character slice before and after normalization.
print("Raw Text:", raw_text[500:600])
normalized_text = normalize_text(raw_text[500:600])
print("Normalized Text:", normalized_text)

Raw Text: the fogge and filthie ayre.

Exeunt.


Scena Secunda.

Alarum within. Enter King Malcome, Donalbaine
Normalized Text: fogge filthie ayre exeunt scena secunda alarum within enter king malcome donalbaine


In [88]:
from collections import Counter

class NGramModel:
    """Count-based n-gram language model with greedy next-token prediction.

    The probability of a token given the previous ``n - 1`` tokens is the
    maximum-likelihood estimate ``count(context + token) / count(context)``.
    N-grams and contexts are stored as space-joined strings.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        ngrams: Every n-gram of the current corpus, in corpus order.
        context: The ``n - 1``-token prefix of the n-gram at the same index
            in ``ngrams`` (the empty string for a unigram model).
        vocab: Set of known tokens; always contains '<SOT>' and '<EOT>'.
        tokenized_corpus: Padded token list produced by the last ``train``.
    """

    def __init__(self, n):
        """Create an untrained model of order ``n``."""
        self.n = n
        self.ngrams = []
        self.context = []
        self.vocab = set(['<SOT>', '<EOT>'])
        self.tokenized_corpus = []
        # Frequency tables filled by create_ngrams(); they replace repeated
        # list.count() scans with constant-time lookups.
        self._ngram_counts = Counter()
        self._context_counts = Counter()

    def tokenizer(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def add_special_tokens(self, tokens):
        """Return ``tokens`` wrapped in a single '<SOT>' and a single '<EOT>'."""
        return ['<SOT>'] + tokens + ['<EOT>']

    def add_padding(self, tokens):
        """Return ``tokens`` preceded by ``n - 1`` '<SOT>' markers and followed by '<EOT>'.

        For a bigram model this equals ``add_special_tokens``.
        """
        return ['<SOT>'] * (self.n - 1) + tokens + ['<EOT>']

    def create_ngrams(self):
        """Rebuild ``ngrams``, ``context`` and the frequency tables from ``tokenized_corpus``.

        The lists are rebuilt rather than extended, so calling this method
        (or ``train``) more than once does not duplicate counts.
        """
        corpus = self.tokenized_corpus
        self.ngrams = []
        self.context = []
        for end in range(self.n, len(corpus) + 1):
            start = end - self.n
            self.ngrams.append(' '.join(corpus[start:end]))
            self.context.append(' '.join(corpus[start:end - 1]))
        self._ngram_counts = Counter(self.ngrams)
        self._context_counts = Counter(self.context)

    def create_vocab(self):
        """Add every token of ``tokenized_corpus`` to ``vocab``."""
        self.vocab.update(self.tokenized_corpus)

    def _context_key(self, token_sequence):
        """Return the space-joined context for the last ``n - 1`` tokens.

        Sequences shorter than ``n - 1`` are left-padded with '<SOT>', which
        mirrors the padding applied to the corpus by ``add_padding``. A
        unigram model has no context, represented by the empty string.
        """
        width = self.n - 1
        if width <= 0:
            # Guard needed because seq[-0:] would select the whole sequence.
            return ''
        tail = list(token_sequence[-width:])
        return ' '.join(['<SOT>'] * (width - len(tail)) + tail)

    def calculate_probabilities_of_vocab(self, token_sequence):
        """Return P(word | context) for every word in ``vocab``.

        Args:
            token_sequence: List of tokens generated so far; only the last
                ``n - 1`` are used.

        Returns:
            A list of probabilities in the iteration order of ``self.vocab``.
            Every entry is 0 when the context was never seen in training.
        """
        context = self._context_key(token_sequence)
        # Count the context among n-gram prefixes: a multi-token context can
        # never equal a single corpus token.
        context_frequency = self._context_counts[context]
        probabilities = []
        for word in self.vocab:
            if context_frequency == 0:
                probabilities.append(0)
                continue
            # Unigram n-grams are bare words, so no "context " prefix is added.
            candidate = f"{context} {word}" if self.n > 1 else word
            probabilities.append(self._ngram_counts[candidate] / context_frequency)
        return probabilities

    def predict_next_token(self, token_sequence):
        """Return the most probable next token and its probability.

        Ties are broken by choosing the lexicographically smallest token, so
        the result does not depend on set iteration order.
        """
        probabilities = self.calculate_probabilities_of_vocab(token_sequence)
        best_token, best_probability = min(
            zip(self.vocab, probabilities),
            key=lambda pair: (-pair[1], pair[0]),
        )
        return best_token, best_probability

    def generate_text(self, seed_text, max_length=20):
        """Greedily extend ``seed_text`` by at most ``max_length`` tokens.

        Generation stops early after '<EOT>' is produced, or when the current
        context was never seen in training (no token has non-zero probability).

        Returns:
            The seed tokens plus the generated tokens, joined by spaces.
        """
        generated_tokens = list(self.tokenizer(seed_text))
        for _ in range(max_length):
            next_token, probability = self.predict_next_token(generated_tokens)
            if probability == 0:
                # Nothing is known about this context; stop instead of
                # emitting an arbitrary token.
                break
            generated_tokens.append(next_token)
            if next_token == '<EOT>':
                break
        return ' '.join(generated_tokens)

    def train(self, text):
        """Tokenize ``text``, pad it, and rebuild the n-gram statistics and vocabulary."""
        # add_padding supplies the n - 1 start markers needed so the first
        # real token has a full-width context.
        self.tokenized_corpus = self.add_padding(self.tokenizer(text))
        self.create_ngrams()
        self.create_vocab()


In [89]:
# Bigram model: each prediction is conditioned on the single preceding token.
model = NGramModel(2)

# Normalize the whole text and fit the model on it.
normalized_text = normalize_text(raw_text)
model.train(text=normalized_text)
# Not the last expression of the cell, so this result is not displayed.
model.predict_next_token(model.tokenizer("tragedie macbeth william"))
# Greedy continuation of the seed, adding at most 10 tokens.
model.generate_text("tragedie macbeth william", max_length=10)

'tragedie macbeth william shakespeare actus primus scoena prima enter macbeth macb haue done'