### Install libraries

In [1]:
from IPython.display import clear_output

!pip install datasets
!pip install nltk
clear_output(wait=False)

### Dataset

I chose subset of dataset [bookcorpus/bookcorpus](https://huggingface.co/datasets/bookcorpus/bookcorpus)

Dataset loading

In [None]:
from datasets import load_dataset

ds = load_dataset("bookcorpus/bookcorpus", trust_remote_code=True)

In [None]:
ds

In [None]:
ds['train'][0]['text']

In [None]:
from tqdm import tqdm

size=50*10**6
data = "".join(ds['train'][i]['text'] for i in tqdm(range(size)))
print(data[:100])

In [None]:
with open(r"..\data\bookcorpus.txt", "w", encoding="utf-8") as f:
    f.write(data)

In [3]:
with open(r"..\data\bookcorpus.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [4]:
len(data)

301773272

Preprocessing and tokenization

In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

nltk.download('punkt')
tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(text, chunk_size=10**6):
    total_chunks = len(text) // chunk_size + (1 if len(text) % chunk_size else 0)
    for i in tqdm(range(0, len(text), chunk_size), total=total_chunks):
        chunk = text[i:i + chunk_size].lower()
        chunk = chunk.translate(str.maketrans("", "", r"""!@#$%^&*()_+-={}[]:";'<>,.?/|\~`"""))
        yield tokenizer.tokenize(chunk)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\milya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
tokens = []
for token_chunk in preprocess_text(data):
    tokens.extend(token_chunk)

print(tokens[:20])

In [None]:
len(tokens)

Generating N-grams

In [5]:
def generate_ngrams(tokens, n):
    tokens = ['<s>'] * (n-1) + tokens + ['</s>']
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return ngrams

In [None]:
trigrams = generate_ngrams(tokens, 3)
print(trigrams[:5])

In [None]:
import pickle

with open(r"..\data\trigrams.pkl", "wb") as f:
    pickle.dump(trigrams, f)

In [None]:
len(trigrams)

In [None]:
import pickle

with open(r"..\data\trigrams.pkl", "rb") as f:
    loaded_trigrams = pickle.load(f)

print(loaded_trigrams[:5])

[('<s>', '<s>', 'usually'), ('<s>', 'usually', 'he'), ('usually', 'he', 'would'), ('he', 'would', 'be'), ('would', 'be', 'tearing')]


In [7]:
len(loaded_trigrams)

55484815

### N-gram model

Training

In [8]:
from collections import defaultdict, Counter

def train_ngram_model(text, n):
    ngram_counts = defaultdict(Counter)

    tokens = []
    for token_chunk in preprocess_text(text):
        tokens.extend(token_chunk)
    ngrams = generate_ngrams(tokens, n)
    
    for ngram in tqdm(ngrams):
        prefix = tuple(ngram[:-1])
        word = ngram[-1]
        ngram_counts[prefix][word] += 1

    ngram_probs = {
        prefix: {word: count / sum(counts.values())
                 for word, count in counts.items()}
        for prefix, counts in ngram_counts.items()
    }
    return ngram_probs

In [9]:
trigram_model = train_ngram_model(data, 3)

print(trigram_model.get(("the", "great"), {}))

100%|██████████| 302/302 [00:16<00:00, 18.07it/s]
100%|██████████| 55484815/55484815 [38:26<00:00, 24057.55it/s]  


{'philosophers': 0.0009165902841429881, 'room': 0.17415215398716774, 'prince': 0.0018331805682859762, 'and': 0.012832263978001834, 'man': 0.01008249312557287, 'bonus': 0.0009165902841429881, 'wall': 0.01008249312557287, 'distractor': 0.0009165902841429881, 'seducer': 0.0009165902841429881, 'success': 0.0018331805682859762, 'breakfast': 0.0009165902841429881, 'shoe': 0.0009165902841429881, 'unknown': 0.01008249312557287, 'halls': 0.0009165902841429881, 'thing': 0.012832263978001834, 'granddaughter': 0.0009165902841429881, 'part': 0.002749770852428964, 'reed': 0.0009165902841429881, 'dane': 0.0009165902841429881, 'lebron': 0.0009165902841429881, 'news': 0.008249312557286892, 'state': 0.00458295142071494, 'view': 0.0018331805682859762, 'francesco': 0.0018331805682859762, 'qualities': 0.0009165902841429881, 'times': 0.002749770852428964, 'sam': 0.0009165902841429881, 'hall': 0.14298808432630614, 'forbidding': 0.0009165902841429881, 'white': 0.002749770852428964, 'black': 0.0018331805682859

Handling unseen n-grams with Laplace smoothing

In [10]:
def laplace_smoothing(model, vocab_size, alpha=1):
    smoothed_model = {}
    
    for prefix, words in model.items():
        smoothed_model[prefix] = {
            word: (count + alpha) / (sum(words.values()) + alpha * vocab_size)
            for word, count in words.items()
        }
    
    return smoothed_model

In [11]:
vocab_size = len(set(word for counts in trigram_model.values() for word in counts))
trigram_model_smoothed = laplace_smoothing(trigram_model, vocab_size)

print(trigram_model_smoothed.get(("the", "great"), {}))

{'philosophers': 5.432117781406298e-06, 'room': 6.372292012803541e-06, 'prince': 5.437092248239088e-06, 'and': 5.496785850232563e-06, 'man': 5.481862449734194e-06, 'bonus': 5.432117781406298e-06, 'wall': 5.481862449734194e-06, 'distractor': 5.432117781406298e-06, 'seducer': 5.432117781406298e-06, 'success': 5.437092248239088e-06, 'breakfast': 5.432117781406298e-06, 'shoe': 5.432117781406298e-06, 'unknown': 5.481862449734194e-06, 'halls': 5.432117781406298e-06, 'thing': 5.496785850232563e-06, 'granddaughter': 5.432117781406298e-06, 'part': 5.442066715071876e-06, 'reed': 5.432117781406298e-06, 'dane': 5.432117781406298e-06, 'lebron': 5.432117781406298e-06, 'news': 5.471913516068615e-06, 'state': 5.452015648737456e-06, 'view': 5.437092248239088e-06, 'francesco': 5.437092248239088e-06, 'qualities': 5.432117781406298e-06, 'times': 5.442066715071876e-06, 'sam': 5.432117781406298e-06, 'hall': 6.203160140488693e-06, 'forbidding': 5.432117781406298e-06, 'white': 5.442066715071876e-06, 'black': 

### Saving the model

In [None]:
import pickle

with open(r"..\models\trigram_model_bookcorpus.pkl", "wb") as f:
    pickle.dump(trigram_model_smoothed, f)
print("model saved")

model saved


Checking the model loading

In [None]:
with open(r"..\models\trigram_model_bookcorpus.pkl", "rb") as f:
    loaded_model = pickle.load(f)

print("model loaded")
print(loaded_model.get(("the", "great"), {}))

model loaded
{'philosophers': 5.432117781406298e-06, 'room': 6.372292012803541e-06, 'prince': 5.437092248239088e-06, 'and': 5.496785850232563e-06, 'man': 5.481862449734194e-06, 'bonus': 5.432117781406298e-06, 'wall': 5.481862449734194e-06, 'distractor': 5.432117781406298e-06, 'seducer': 5.432117781406298e-06, 'success': 5.437092248239088e-06, 'breakfast': 5.432117781406298e-06, 'shoe': 5.432117781406298e-06, 'unknown': 5.481862449734194e-06, 'halls': 5.432117781406298e-06, 'thing': 5.496785850232563e-06, 'granddaughter': 5.432117781406298e-06, 'part': 5.442066715071876e-06, 'reed': 5.432117781406298e-06, 'dane': 5.432117781406298e-06, 'lebron': 5.432117781406298e-06, 'news': 5.471913516068615e-06, 'state': 5.452015648737456e-06, 'view': 5.437092248239088e-06, 'francesco': 5.437092248239088e-06, 'qualities': 5.432117781406298e-06, 'times': 5.442066715071876e-06, 'sam': 5.432117781406298e-06, 'hall': 6.203160140488693e-06, 'forbidding': 5.432117781406298e-06, 'white': 5.442066715071876e-