### Install libraries

In [1]:
from IPython.display import clear_output

!pip install datasets
!pip install nltk
clear_output(wait=False)

### Dataset

I chose dataset [karpathy/tiny_shakespeare](https://huggingface.co/datasets/karpathy/tiny_shakespeare)

Dataset loading

In [2]:
import datasets
from datasets import load_dataset

ds = load_dataset("karpathy/tiny_shakespeare", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [4]:
data = ds['train'][0]['text']
len(data)

1003854

Preprocessing and tokenization

In [5]:
import nltk
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\milya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
tokens = preprocess_text(data)
print(tokens[:20])

['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', 'all', 'speak', 'speak', 'first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather']


Generating N-grams

In [7]:
def generate_ngrams(tokens, n):
    tokens = ['<s>'] * (n-1) + tokens + ['</s>']
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return ngrams

trigrams = generate_ngrams(tokens, 3)
print(trigrams[:5])

[('<s>', '<s>', 'first'), ('<s>', 'first', 'citizen'), ('first', 'citizen', 'before'), ('citizen', 'before', 'we'), ('before', 'we', 'proceed')]


Training N-gram model

In [8]:
from collections import defaultdict, Counter

def train_ngram_model(text, n):
    ngram_counts = defaultdict(Counter)
    tokens = preprocess_text(text)
    ngrams = generate_ngrams(tokens, n)
    
    for ngram in ngrams:
        prefix = tuple(ngram[:-1])
        word = ngram[-1]
        ngram_counts[prefix][word] += 1

    ngram_probs = {
        prefix: {word: count / sum(counts.values())
                 for word, count in counts.items()}
        for prefix, counts in ngram_counts.items()
    }
    return ngram_probs

In [9]:
trigram_model = train_ngram_model(data, 3)

print(trigram_model.get(("the", "great"), {}))

{'toe': 0.1875, 'aufidius': 0.0625, 'king': 0.0625, 'rich': 0.0625, 'chamber': 0.0625, 'lord': 0.0625, 'commanding': 0.0625, 'apollo': 0.125, 'comfort': 0.0625, 'pompey': 0.0625, 'soldier': 0.0625, 'traveller': 0.0625, 'desire': 0.0625}


Handling unseen N-grams with Laplace smoothing

In [10]:
def laplace_smoothing(model, vocab_size, alpha=1):
    smoothed_model = {}
    
    for prefix, words in model.items():
        smoothed_model[prefix] = {
            word: (count + alpha) / (sum(words.values()) + alpha * vocab_size)
            for word, count in words.items()
        }
    
    return smoothed_model

In [11]:
vocab_size = len(set(word for counts in trigram_model.values() for word in counts))
trigram_model_smoothed = laplace_smoothing(trigram_model, vocab_size)

print(trigram_model_smoothed.get(("the", "great"), {}))

{'toe': 9.835997680775283e-05, 'aufidius': 8.80062950385157e-05, 'king': 8.80062950385157e-05, 'rich': 8.80062950385157e-05, 'chamber': 8.80062950385157e-05, 'lord': 8.80062950385157e-05, 'commanding': 8.80062950385157e-05, 'apollo': 9.318313592313427e-05, 'comfort': 8.80062950385157e-05, 'pompey': 8.80062950385157e-05, 'soldier': 8.80062950385157e-05, 'traveller': 8.80062950385157e-05, 'desire': 8.80062950385157e-05}


Saving the model

In [12]:
import pickle

with open(r"models/trigram_model_shakespeare.pkl", "wb") as f:
    pickle.dump(trigram_model_smoothed, f)
print("model saved")

model saved


Checking the model loading

In [13]:
with open(r"models/trigram_model_shakespeare.pkl", "rb") as f:
    loaded_model = pickle.load(f)

print("model loaded")
print(loaded_model.get(("the", "great"), {}))

model loaded
{'toe': 9.835997680775283e-05, 'aufidius': 8.80062950385157e-05, 'king': 8.80062950385157e-05, 'rich': 8.80062950385157e-05, 'chamber': 8.80062950385157e-05, 'lord': 8.80062950385157e-05, 'commanding': 8.80062950385157e-05, 'apollo': 9.318313592313427e-05, 'comfort': 8.80062950385157e-05, 'pompey': 8.80062950385157e-05, 'soldier': 8.80062950385157e-05, 'traveller': 8.80062950385157e-05, 'desire': 8.80062950385157e-05}
