# Model

In [3]:
import pandas as pd
import numpy as np
import re
import random
from collections import defaultdict
from tqdm import tqdm

# Load Reuters dataset
dialogs = pd.read_csv("sample_reuters_dataset.csv")

# Clean the dataset
def clean_text(text):
    text = re.sub("[^a-zA-Z' ]", "", text)  # Keep only letters, spaces, and apostrophes
    return text.lower()

# Clean all sentence_text in the dialogs dataframe
dialogs['cleaned_text'] = dialogs['sentence_text'].apply(clean_text)

# creating the vocabulary
# get list of all the words
all_words = " ".join(dialogs['cleaned_text']).split()
words_dict = {}
# add word-count pair to the dictionary
for word in all_words:
    # check if the word is already in dictionary
    if word in words_dict:
        # increment count of word by 1
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1
        words_dict[word] = 1


# Create N-grams functions
def create_ngram(sentence, n):
    tokens = sentence.split()
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

# Create dataframe with cleaned sentences and N-grams
dataset = pd.DataFrame({'Sentences': dialogs['cleaned_text']})
dataset['unigram'] = dataset['Sentences'].apply(lambda x: create_ngram(x, 1))
dataset['bigram'] = dataset['Sentences'].apply(lambda x: create_ngram(x, 2))
dataset['trigram'] = dataset['Sentences'].apply(lambda x: create_ngram(x, 3))

# Build N-gram model (trigrams in this case)
model = defaultdict(lambda: defaultdict(lambda: 0))
for trigrams in dataset['trigram']:
    for w1, w2, w3 in trigrams:
        model[(w1, w2)][w3] += 1

# Convert counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

# Function to predict the next word based on the trigram model
def predict_next_word(w1, w2, model):
    if (w1, w2) in model:
        return max(model[(w1, w2)], key=model[(w1, w2)].get)
    else:
        return "No prediction available"

print(dict(model["am", "concerned"]))


{'and': 0.5, 'that': 0.5}


In [4]:
# Create vocabulary
vocabulary = set(" ".join(dialogs['cleaned_text']).split())
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Vocabulary: {vocabulary}")


Vocabulary size: 12580


In [5]:
dataset.head()

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from u s japan r...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u s has said it will impose mln dlrs of...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."


# Evaluation

In [6]:
def evaluate_accuracy(model, test_sentences):
    correct = 0
    total = 0

    for sentence in test_sentences:
        words = sentence.split()
        for i in range(len(words) - 2):
            w1, w2, actual_next = words[i], words[i+1], words[i+2]
            predicted_next = predict_next_word(w1, w2, model)
            if predicted_next == actual_next:
                correct += 1
            total += 1

    return correct / total if total > 0 else 0

# Example test sentences
test_sentences = ["global economy is strong","i have been","we are still","they will be"]

print("Prediction Accuracy:", evaluate_accuracy(model, test_sentences))

Prediction Accuracy: 0.8


In [7]:
import numpy as np

def calculate_perplexity(model, test_sentences):
    total_log_prob = 0
    total_words = 0

    for sentence in test_sentences:
        words = sentence.split()
        trigrams = create_ngram(sentence, 3)
        sentence_prob = 1

        for w1, w2, w3 in trigrams:
            if (w1, w2) in model and w3 in model[(w1, w2)]:
                sentence_prob *= model[(w1, w2)][w3]
            else:
                sentence_prob *= 1e-6  # Smoothing for unseen words

        total_log_prob += np.log(sentence_prob)
        total_words += len(words)

    perplexity = np.exp(-total_log_prob / total_words)
    return perplexity

# Example test sentences
test_data = ["stock market is rising", "global economy is improving"]
print("Perplexity:", calculate_perplexity(model, test_data))

Perplexity: 177.82794100389225


In [8]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

# Function to predict the next word (replace with your actual model logic)
def predict_next_word(w1, w2, model):
    # Check if the bigram (w1, w2) exists in the model
    if (w1, w2) in model:
        # Get the most likely next word based on probabilities
        return max(model[(w1, w2)], key=model[(w1, w2)].get)
    else:
        # Return "unknown" if the bigram is not found
        return "unknown"

def evaluate_bleu(model, reference_sentences):
    scores = []
    smoothie = SmoothingFunction().method1  # Apply smoothing

    for sentence in reference_sentences:
        words = sentence.lower().split()

        for i in range(len(words) - 2):  # Loop through bigrams
            w1, w2, actual_next_word = words[i], words[i + 1], words[i + 2]
            predicted_word = predict_next_word(w1, w2, model)  # Use the actual predict_next_word function

            if predicted_word == "unknown":
                continue  # Skip if model fails to predict

            generated_tokens = [w1, w2, predicted_word]  # Model's output
            reference_tokens = [[w1, w2, actual_next_word]]  # Expected

            # Calculate BLEU for this prediction (unigrams & bigrams only)
            score = sentence_bleu(reference_tokens, generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
            scores.append(score)

    return np.mean(scores) if scores else 0  # Avoid division by zero

# Example reference sentences
reference_data = ["Market volatility impacts investor confidence", "Economic growth remains a key priority"]

# Convert defaultdict to a normal dictionary before passing to evaluation
model_dict = {tuple(k): dict(v) for k, v in model.items()}  # Convert keys to tuples

# Run evaluation using the converted model
print("BLEU Score:", evaluate_bleu(model_dict, reference_data))

BLEU Score: 0.5773502691896257


# Testing

In [24]:
# another example
dict(model["how", "are"])

{'european': 1.0}

In [25]:
# another example
dict(model["good", "to"])

{'go': 0.3333333333333333,
 'have': 0.3333333333333333,
 'very': 0.3333333333333333}

In [26]:
dict(model["to", "very"])

{'good': 1.0}

In [27]:
dict(model["i", "have"])

{'nothing': 0.16666666666666666,
 'been': 0.3333333333333333,
 'mixed': 0.16666666666666666,
 'to': 0.16666666666666666,
 'talked': 0.16666666666666666}

# Saving

In [28]:
import pickle

# Convert defaultdict to normal dict
model_dict = {k: dict(v) for k, v in model.items()}

# Save model
with open("trigram_model.pkl", "wb") as f:
    pickle.dump(model_dict, f)

print("Model saved successfully!")


Model saved successfully!
