## Steps to build the next word recommender system

1. Loading and exploring the dataset
2. Creating N-grams of the sentences
3. Building the N-gram Language Model
4. Predicting the next word using the N-gram Language Model

## 1. Loading and exploring the dataset

In [10]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [None]:
import pandas as pd
import numpy as np
import re
import random
from collections import defaultdict
from tqdm import tqdm

# Read the Reuters sample sentences from the CSV file into a DataFrame
dialogs = pd.read_csv(filepath_or_buffer="sample_reuters_dataset.csv")

# Clean the dataset
def clean_text(text):
    """Normalize raw text for N-gram tokenization.

    Lower-cases the text and keeps only ASCII letters, spaces and
    apostrophes. Other whitespace (tabs, newlines) is turned into a space
    first so the words on either side of it are not glued together.

    Args:
        text: The raw sentence. Non-string values (e.g. the float NaN that
            pandas uses for a missing CSV cell, or None) are treated as
            empty text.

    Returns:
        The cleaned, lower-cased string ("" for non-string input).
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None coming from a missing cell.
        return ""
    # Map every whitespace character to a plain space before stripping, so
    # "stock\nmarket" stays two tokens instead of becoming "stockmarket".
    text = re.sub(r"\s", " ", text)
    text = re.sub(r"[^a-zA-Z' ]", "", text)  # Keep only letters, spaces, and apostrophes
    return text.lower()

# Add a normalized copy of every sentence to the dialogs dataframe
dialogs['cleaned_text'] = dialogs['sentence_text'].apply(clean_text)

# Build the vocabulary: flatten all cleaned sentences into one token list
all_words = " ".join(dialogs['cleaned_text']).split()

# Tally how often each token occurs (word -> count)
words_dict = {}
for word in all_words:
    # .get supplies 0 the first time a word is seen
    words_dict[word] = words_dict.get(word, 0) + 1


# Create N-grams functions
def create_ngram(sentence, n):
    """Return every run of ``n`` consecutive words in ``sentence``.

    The sentence is split on whitespace; each N-gram is a list of ``n``
    tokens, listed in order of appearance. A sentence with fewer than
    ``n`` words yields an empty list.
    """
    words = sentence.split()
    window_count = len(words) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(words[start:start + n])
    return ngrams

# Collect the cleaned sentences together with their 1-, 2- and 3-grams
dataset = pd.DataFrame({'Sentences': dialogs['cleaned_text']})
dataset['unigram'] = dataset['Sentences'].apply(create_ngram, args=(1,))
dataset['bigram'] = dataset['Sentences'].apply(create_ngram, args=(2,))
dataset['trigram'] = dataset['Sentences'].apply(create_ngram, args=(3,))

# Trigram model: (w1, w2) -> {w3: number of times w3 followed "w1 w2"}
model = defaultdict(lambda: defaultdict(int))
for trigrams in dataset['trigram']:
    for w1, w2, w3 in trigrams:
        model[(w1, w2)][w3] += 1

# Normalize each context's counts so they become probabilities summing to 1
for w1_w2, next_words in model.items():
    total_count = float(sum(next_words.values()))
    for w3 in next_words:
        next_words[w3] /= total_count

# Function to predict the next word based on the trigram model
def predict_next_word(w1, w2, model):
    """Return the most probable word to follow the pair ``(w1, w2)``.

    Args:
        w1: First word of the two-word context.
        w2: Second word of the two-word context.
        model: Mapping from ``(w1, w2)`` tuples to a mapping of
            ``next_word -> probability``.

    Returns:
        The next word with the highest probability, or the string
        "No prediction available" when the context is unknown or has no
        recorded followers.
    """
    # .get() never inserts, so looking up an unseen context leaves a
    # defaultdict model untouched.
    candidates = model.get((w1, w2))
    if not candidates:
        # Covers both a missing context and an empty one (e.g. an entry
        # auto-created by indexing a defaultdict); max() would raise
        # ValueError on an empty mapping.
        return "No prediction available"
    return max(candidates, key=candidates.get)

# Example predictions
# print(predict_next_word("stock", "market", model))
# print(predict_next_word("global", "economy", model))

# Inspect the next-word distribution for one context. .get() is used instead
# of model[...] because indexing a defaultdict with an unseen key inserts an
# empty entry into the model as a side effect.
print(dict(model.get(("am", "concerned"), {})))


{'and': 0.5, 'that': 0.5}


In [None]:
# Create vocabulary: the set of distinct tokens across all cleaned sentences
vocabulary = {
    token
    for sentence in dialogs['cleaned_text']
    for token in sentence.split()
}
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Vocabulary: {vocabulary}")


In [None]:
# Preview the first five rows of the N-gram dataframe
dataset.head(n=5)

In [80]:
# another example; .get() avoids inserting an empty entry into the
# defaultdict model when the context has never been seen
dict(model.get(("how", "are"), {}))

{'european': 1.0}

In [81]:
# another example; .get() avoids inserting an empty entry into the
# defaultdict model when the context has never been seen
dict(model.get(("good", "to"), {}))

{'go': 0.3333333333333333,
 'have': 0.3333333333333333,
 'very': 0.3333333333333333}

In [82]:
# .get() avoids inserting an empty entry into the defaultdict model when
# the context has never been seen
dict(model.get(("to", "very"), {}))

{'good': 1.0}

In [84]:
# .get() avoids inserting an empty entry into the defaultdict model when
# the context has never been seen
dict(model.get(("i", "have"), {}))

{'nothing': 0.16666666666666666,
 'been': 0.3333333333333333,
 'mixed': 0.16666666666666666,
 'to': 0.16666666666666666,
 'talked': 0.16666666666666666}

In [85]:
import pickle

# Convert the nested defaultdicts to plain dicts before pickling (a
# defaultdict whose default factory is a lambda cannot be pickled).
# Contexts with no followers are skipped: such entries appear when a
# defaultdict lookup of an unseen key inserts an empty mapping, and they
# carry no information for prediction.
model_dict = {k: dict(v) for k, v in model.items() if v}

# Save model
with open("trigram_model.pkl", "wb") as f:
    pickle.dump(model_dict, f)

print("Model saved successfully!")


Model saved successfully!
