## NLP: Lab 5 (bag-of-words)

### Task 0. Build an N-gram language model based on some corpus.

In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter, defaultdict

# Sample corpus: three short sentences used to train the toy n-gram models.
corpus = [
    "this is a sample sentence",
    "this is another example sentence",
    "this is yet another example",
]

# Tokenize the corpus: one list of word tokens per sentence.
# NOTE(review): nltk.word_tokenize needs NLTK's Punkt tokenizer data; confirm it
# is installed (e.g. via nltk.download) before a fresh "Restart & Run All".
tokenized_corpus = list(map(nltk.word_tokenize, corpus))

# Build an N-gram frequency model.
def build_ngram_model(tokenized_corpus, n):
    """Count how often each n-gram occurs across all sentences.

    Args:
        tokenized_corpus: Iterable of token sequences, one per sentence.
        n: Size of the n-grams to extract (e.g. 2 for bigrams, 3 for trigrams).

    Returns:
        A Counter mapping each n-gram produced by ``ngrams`` to its number of
        occurrences in the corpus.
    """
    ngram_freq = Counter()
    for tokens in tokenized_corpus:
        # n-grams are extracted sentence by sentence, so none of them spans
        # a sentence boundary.
        ngram_freq.update(ngrams(tokens, n))
    return ngram_freq

# Build the bigram (2-gram) model and print its counts.
bigram_model = build_ngram_model(tokenized_corpus, 2)
print(bigram_model)

Counter({('this', 'is'): 3, ('another', 'example'): 2, ('is', 'a'): 1, ('a', 'sample'): 1, ('sample', 'sentence'): 1, ('is', 'another'): 1, ('example', 'sentence'): 1, ('is', 'yet'): 1, ('yet', 'another'): 1})


### Task 1. Compare bi- and tri-gram models

In [2]:
# Build a trigram model (3-gram) from the same tokenized corpus and print it.
trigram_model = build_ngram_model(tokenized_corpus, 3)
print("Trigram Model:", trigram_model)

# Compare bigram and trigram models side by side.
# NOTE: the trigram model was already printed above, so it appears twice in the output.
print("Bigram Model:", bigram_model)
print("Trigram Model:", trigram_model)

Trigram Model: Counter({('this', 'is', 'a'): 1, ('is', 'a', 'sample'): 1, ('a', 'sample', 'sentence'): 1, ('this', 'is', 'another'): 1, ('is', 'another', 'example'): 1, ('another', 'example', 'sentence'): 1, ('this', 'is', 'yet'): 1, ('is', 'yet', 'another'): 1, ('yet', 'another', 'example'): 1})
Bigram Model: Counter({('this', 'is'): 3, ('another', 'example'): 2, ('is', 'a'): 1, ('a', 'sample'): 1, ('sample', 'sentence'): 1, ('is', 'another'): 1, ('example', 'sentence'): 1, ('is', 'yet'): 1, ('yet', 'another'): 1})
Trigram Model: Counter({('this', 'is', 'a'): 1, ('is', 'a', 'sample'): 1, ('a', 'sample', 'sentence'): 1, ('this', 'is', 'another'): 1, ('is', 'another', 'example'): 1, ('another', 'example', 'sentence'): 1, ('this', 'is', 'yet'): 1, ('is', 'yet', 'another'): 1, ('yet', 'another', 'example'): 1})


### Task 2. Apply interpolation/backoff to your model so that it can better handle unknown words/prompts.

In [3]:
# Build a unigram (single-token) frequency model.
def build_unigram_model(tokenized_corpus):
    """Count how often each token occurs across all sentences.

    Args:
        tokenized_corpus: Iterable of token sequences, one per sentence.

    Returns:
        A Counter mapping each token to its number of occurrences.
    """
    unigram_freq = Counter()
    for tokens in tokenized_corpus:
        for token in tokens:
            unigram_freq[token] += 1
    return unigram_freq

# Build the unigram model (token -> count) from the tokenized corpus; it is the
# lowest-order model passed to the interpolation and generation functions below.
unigram_model = build_unigram_model(tokenized_corpus)

# Function to calculate interpolated probability
def interpolated_prob(word_sequence, unigram_model, bigram_model, trigram_model, lambda1=0.1, lambda2=0.3, lambda3=0.6):
    """Score the end of ``word_sequence`` by mixing n-gram relative frequencies.

    Each model contributes the relative frequency of the n-gram that ends the
    sequence, i.e. its count divided by the total count stored in that model.
    These are joint relative frequencies, not conditional probabilities
    P(word | context), and the lambda weights are applied exactly as given
    (they are not renormalised).

    * 1 token:   the unigram relative frequency, unweighted.
    * 2 tokens:  ``lambda1 * unigram + lambda2 * bigram``.
    * 3+ tokens: ``lambda1 * unigram + lambda2 * bigram + lambda3 * trigram``
      computed over the last three tokens; earlier tokens are ignored because
      no higher-order model is available.

    Args:
        word_sequence: Non-empty sequence of tokens (tuple, list, ...).
        unigram_model: Mapping token -> count.
        bigram_model: Mapping 2-tuple of tokens -> count.
        trigram_model: Mapping 3-tuple of tokens -> count.
        lambda1: Weight of the unigram term.
        lambda2: Weight of the bigram term.
        lambda3: Weight of the trigram term.

    Returns:
        The interpolated score as a float. Unseen n-grams and empty models
        contribute 0.

    Raises:
        ValueError: If ``word_sequence`` is empty.
    """
    # Normalise to a tuple so list input can be used as a dictionary key.
    sequence = tuple(word_sequence)
    if not sequence:
        raise ValueError("word_sequence must contain at least one token")
    # Only the trailing trigram can be scored with the models at hand.
    sequence = sequence[-3:]

    def relative_frequency(model, key):
        # An empty model yields 0 instead of raising ZeroDivisionError.
        total = sum(model.values())
        return model.get(key, 0) / total if total else 0.0

    unigram_prob = relative_frequency(unigram_model, sequence[-1])
    if len(sequence) == 1:
        return unigram_prob

    bigram_prob = relative_frequency(bigram_model, sequence[-2:])
    if len(sequence) == 2:
        return lambda1 * unigram_prob + lambda2 * bigram_prob

    trigram_prob = relative_frequency(trigram_model, sequence)
    return lambda1 * unigram_prob + lambda2 * bigram_prob + lambda3 * trigram_prob

# Example usage: score a trigram with the default lambda weights and print the result.
word_sequence = ('this', 'is', 'a')
prob = interpolated_prob(word_sequence, unigram_model, bigram_model, trigram_model)
print(f"Interpolated probability for {word_sequence}: {prob}")

Interpolated probability for ('this', 'is', 'a'): 0.09833333333333333


### Task 3. Use this model to build sentences: given a prompt consisting of words $p_1, \dots, p_n$, it should produce a continuation $w_1, \dots, w_k$.

In [5]:
import random

def generate_sentence(prompt, unigram_model, bigram_model, trigram_model, max_length=10):
    """Greedily extend ``prompt`` using trigram -> bigram -> unigram backoff.

    At every step the next word is taken from the highest-order model that
    has an n-gram matching the current context:

    1. the most frequent trigram starting with the last two words;
    2. otherwise the most frequent bigram starting with the last word;
    3. otherwise the most frequent unigram.

    Ties are resolved in favour of the entry that comes first in the model's
    iteration order. Generation is deterministic (no sampling) and there is no
    end-of-sentence marker, so words are appended until ``max_length`` is
    reached; the output may therefore repeat itself.

    Args:
        prompt: Sequence of starting tokens (may be empty).
        unigram_model: Mapping token -> count.
        bigram_model: Mapping 2-tuple of tokens -> count.
        trigram_model: Mapping 3-tuple of tokens -> count.
        max_length: Total number of tokens (prompt included) at which
            generation stops. A prompt that is already this long is returned
            unchanged.

    Returns:
        The prompt plus its continuation, joined by single spaces. If no model
        can supply a next word (empty unigram model), generation stops early
        instead of raising.
    """
    not_found = object()  # Sentinel, so any token value is a valid result.

    def most_frequent_continuation(model, context):
        # Last word of the most frequent n-gram that starts with `context`.
        width = len(context)
        matches = (
            (ngram, count)
            for ngram, count in model.items()
            if ngram[:width] == context
        )
        best = max(matches, key=lambda pair: pair[1], default=None)
        return not_found if best is None else best[0][width]

    sentence = list(prompt)
    while len(sentence) < max_length:
        next_word = not_found

        if len(sentence) >= 2:
            next_word = most_frequent_continuation(trigram_model, tuple(sentence[-2:]))

        # Back off to the bigram model when no trigram matches the context.
        if next_word is not_found and sentence:
            next_word = most_frequent_continuation(bigram_model, (sentence[-1],))

        # Final fallback: the globally most frequent token.
        if next_word is not_found:
            if not unigram_model:
                break  # Nothing left to generate from.
            next_word = max(unigram_model, key=unigram_model.get)

        sentence.append(next_word)

    return ' '.join(sentence)

# Example usage: continue a two-word prompt up to the default max_length of 10 tokens.
prompt = ('this', 'is')
generated_sentence = generate_sentence(prompt, unigram_model, bigram_model, trigram_model)
print(f"Generated sentence: {generated_sentence}")

Generated sentence: this is a sample sentence this is a sample sentence
