## Steps to build the next word recommender system

1. Loading and exploring the dataset
2. Creating N-grams of the dialogue
3. Building the N-gram Language Model
4. Predicting the next word using N-gram Language Model

## 1. Loading and exploring the dataset

In [101]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [156]:
import pandas as pd
import numpy as np
import re
import random
from collections import defaultdict
from tqdm import tqdm

# Load Reuters dataset
dialogs = pd.read_csv("sample_reuters_dataset.csv")

# Clean the dataset
# def clean_text(text):
#     text = re.sub("[^a-zA-Z' ]", "", text)
#     return text.lower()
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9$%,.' ]", "", text)  # Keep numbers, $, %, ., , and '
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text



# Clean all sentence_text in the dialogs dataframe
dialogs['cleaned_text'] = dialogs['sentence_text'].apply(clean_text)

# creating the vocabulary
# get list of all the words
all_words = " ".join(dialogs['cleaned_text']).split()
words_dict = {}
# add word-count pair to the dictionary
for word in all_words:
    # check if the word is already in dictionary
    if word in words_dict:
        # increment count of word by 1
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1
        words_dict[word] = 1


# Create N-grams functions
def create_ngram(sentence, n):
    tokens = sentence.split()
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

# Create dataframe with cleaned sentences and N-grams
dataset = pd.DataFrame({'Sentences': dialogs['cleaned_text']})

# Build N-gram model (trigrams in this case)
model = defaultdict(lambda: defaultdict(lambda: 0))
# for trigrams in dataset['trigram']:
#     for w1, w2, w3 in trigrams:
#         model[(w1, w2)][w3] += 1

# Convert counts to probabilities
# for w1_w2 in model:
#     total_count = float(sum(model[w1_w2].values()))
#     for w3 in model[w1_w2]:
#         model[w1_w2][w3] /= total_count

n_values = [2, 3, 4, 5]  # Bigrams to 5-grams
ngram_models = {}

for n in n_values:
    dataset[f'{n}-gram'] = dataset['Sentences'].apply(lambda x: create_ngram(x, n))
    ngram_models[n] = defaultdict(lambda: defaultdict(lambda: 0))

    # Build the model for each n-gram size
    for ngrams in dataset[f'{n}-gram']:
        for words in ngrams:
            context = tuple(words[:-1])  # Convert list to tuple
            next_word = words[-1]
            ngram_models[n][context][next_word] += 1



# Convert counts to probabilities
for n in ngram_models:
    for context in ngram_models[n]:
        total_count = float(sum(ngram_models[n][context].values()))
        for next_word in ngram_models[n][context]:
            ngram_models[n][context][next_word] /= total_count

# Function to predict the next word based on the trigram model
# def predict_next_word(w1, w2, model):
#     if (w1, w2) in model:
#         return max(model[(w1, w2)], key=model[(w1, w2)].get)
#     else:
#         return "No prediction available"

# Tries the longest n-gram first, then falls back to smaller ones.
def predict_next_word(context, models):
    # Tokenize the context if it's not already tokenized
    if isinstance(context, str):
        context = context.split()

    context_length = len(context)

    for n in sorted(models.keys(), reverse=True):  # Start with largest n-gram
        if context_length >= n - 1:
            ngram_context = tuple(context[-(n-1):])  # Get the correct context size
            print(f"Trying {n}-gram context: {ngram_context}")

            if ngram_context in models[n]:
                print(f"Possible next words: {models[n][ngram_context]}")
                return max(models[n][ngram_context], key=models[n][ngram_context].get)
    
    return "No prediction available"

# Example predictions
# print(predict_next_word("stock", "market", model))
print(predict_next_word("you have a", ngram_models))

# print(dict(model["but", "some"]))


Trying 4-gram context: ('you', 'have', 'a')
Trying 3-gram context: ('have', 'a')
Possible next words: defaultdict(<function <lambda>.<locals>.<lambda> at 0x000002476FD5F1C0>, {'response': 0.034482758620689655, 'lot': 0.034482758620689655, 'material': 0.034482758620689655, 'small': 0.034482758620689655, 'turnover': 0.034482758620689655, '3': 0.06896551724137931, 'third': 0.034482758620689655, 'substantial': 0.034482758620689655, 'credible': 0.034482758620689655, 'production': 0.034482758620689655, 'chance': 0.034482758620689655, 'maturity': 0.034482758620689655, 'more': 0.06896551724137931, 'major': 0.06896551724137931, '25': 0.034482758620689655, 'registered': 0.034482758620689655, 'loss': 0.034482758620689655, 'positive': 0.06896551724137931, 'forward': 0.034482758620689655, 'hard': 0.034482758620689655, 'fairly': 0.034482758620689655, 'draw': 0.034482758620689655, 'different': 0.06896551724137931, 'relationship': 0.034482758620689655})
3


In [157]:
print(predict_next_word("are concerned about you and have", ngram_models))

Trying 5-gram context: ('about', 'you', 'and', 'have')
Trying 4-gram context: ('you', 'and', 'have')
Trying 3-gram context: ('and', 'have')
Possible next words: defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000247542F1CF0>, {'settled': 0.25, 'improved': 0.5, 'believed': 0.25})
improved


In [103]:
# Create vocabulary
vocabulary = set(" ".join(dialogs['cleaned_text']).split())
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Vocabulary: {vocabulary}")


Vocabulary size: 13881


In [104]:
dataset.head()

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from u . s .- japa...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u . s . has said it will impose 300 mln dl...,"[[the], [u], [.], [s], [.], [has], [said], [it...","[[the, u], [u, .], [., s], [s, .], [., has], [...","[[the, u, .], [u, ., s], [., s, .], [s, ., has..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."


In [105]:
# another example
dict(model["how", "are"])

{'european': 1.0}

In [106]:
# another example
dict(model["good", "to"])

{'go': 0.3333333333333333,
 'have': 0.3333333333333333,
 'very': 0.3333333333333333}

In [107]:
dict(model["to", "very"])

{'good': 1.0}

In [108]:
dict(model["i", "have"])

{'nothing': 0.16666666666666666,
 'been': 0.3333333333333333,
 'mixed': 0.16666666666666666,
 'to': 0.16666666666666666,
 'talked': 0.16666666666666666}

In [None]:
# import pickle

# # Convert defaultdict to normal dict
# model_dict = {k: dict(v) for k, v in model.items()}

# # Save model
# with open("trigram_model.pkl", "wb") as f:
#     pickle.dump(model_dict, f)

# print("Model saved successfully!")


Model saved successfully!


In [158]:
import pickle

# Convert defaultdicts to regular dicts
ngram_models_cleaned = {
    n: {k: dict(v) for k, v in model.items()}
    for n, model in ngram_models.items()
}

# Now save it
with open('ngram_model.pkl', 'wb') as f:
    pickle.dump(ngram_models_cleaned, f)

print("N-gram models saved successfully!")


N-gram models saved successfully!


In [115]:
from collections import defaultdict

def build_smoothed_ngram_model(dataset):
    model = defaultdict(lambda: defaultdict(lambda: 1))  # Laplace smoothing (start with count=1)
    
    for trigrams in dataset['trigram']:
        for w1, w2, w3 in trigrams:
            model[(w1, w2)][w3] += 1
    
    # Convert counts to probabilities
    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count

    return model

smoothed_model = build_smoothed_ngram_model(dataset)


In [117]:
import math

def calculate_perplexity(test_sentences, model):
    total_log_prob = 0
    word_count = 0

    for sentence in test_sentences:
        trigrams = create_ngram(sentence, 3)
        for w1, w2, w3 in trigrams:
            prob = model[(w1, w2)].get(w3, 1e-10)
            total_log_prob += math.log(prob)
            word_count += 1

    perplexity = math.exp(-total_log_prob / word_count)
    return perplexity

# Example usage:
test_sentences = ["But some", "they told"]
test_sentences = [clean_text(s) for s in test_sentences]
print(calculate_perplexity(test_sentences, model))


ZeroDivisionError: division by zero

In [112]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(reference_sentences, generated_sentence):
    reference_tokens = [s.split() for s in reference_sentences]
    generated_tokens = generated_sentence.split()
    return sentence_bleu(reference_tokens, generated_tokens)

# Example usage:
reference = ["the stock market is volatile", "the stock market is unpredictable"]
generated = "the stock market is falling"
print(calculate_bleu(reference, generated))


0.668740304976422
