<a href="https://colab.research.google.com/github/MapariPrajwal/NLP/blob/main/language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
corpus = [
    "I love natural language processing",
    "Natural language processing is fun",
    "Language models are powerful tools",
    "Tools like ChatGPT assist in various tasks",
    "Tasks such as next word prediction"
]


In [2]:
import string

def preprocess(corpus):
    processed_corpus = []
    for sentence in corpus:
        sentence = sentence.lower()
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        words = sentence.split()
        processed_corpus.append(words)
    return processed_corpus

processed_corpus = preprocess(corpus)


In [3]:
def generate_ngrams(corpus, n):
    ngrams = []
    for sentence in corpus:
        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i+n])
            ngrams.append(ngram)
    return ngrams

unigrams = [word for sentence in processed_corpus for word in sentence]
bigrams = generate_ngrams(processed_corpus, 2)
trigrams = generate_ngrams(processed_corpus, 3)

print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

Unigrams: ['i', 'love', 'natural', 'language', 'processing', 'natural', 'language', 'processing', 'is', 'fun', 'language', 'models', 'are', 'powerful', 'tools', 'tools', 'like', 'chatgpt', 'assist', 'in', 'various', 'tasks', 'tasks', 'such', 'as', 'next', 'word', 'prediction']
Bigrams: [('i', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing'), ('natural', 'language'), ('language', 'processing'), ('processing', 'is'), ('is', 'fun'), ('language', 'models'), ('models', 'are'), ('are', 'powerful'), ('powerful', 'tools'), ('tools', 'like'), ('like', 'chatgpt'), ('chatgpt', 'assist'), ('assist', 'in'), ('in', 'various'), ('various', 'tasks'), ('tasks', 'such'), ('such', 'as'), ('as', 'next'), ('next', 'word'), ('word', 'prediction')]
Trigrams: [('i', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing'), ('natural', 'language', 'processing'), ('language', 'processing', 'is'), ('processing', 'is', 'fun'), ('language', 'models

In [4]:
from collections import Counter

def add_one_smoothing(ngrams):
    ngram_counts = Counter(ngrams)
    vocabulary_size = len(set(ngrams))
    smoothed_counts = {}
    for ngram, count in ngram_counts.items():
        smoothed_counts[ngram] = (count + 1) / (ngram_counts[ngram[:-1]] + vocabulary_size)
    return smoothed_counts

smoothed_bigrams = add_one_smoothing(bigrams)
smoothed_trigrams = add_one_smoothing(trigrams)

print("Smoothed Bigrams:", smoothed_bigrams)
print("Smoothed Trigrams:", smoothed_trigrams)

Smoothed Bigrams: {('i', 'love'): 0.09523809523809523, ('love', 'natural'): 0.09523809523809523, ('natural', 'language'): 0.14285714285714285, ('language', 'processing'): 0.14285714285714285, ('processing', 'is'): 0.09523809523809523, ('is', 'fun'): 0.09523809523809523, ('language', 'models'): 0.09523809523809523, ('models', 'are'): 0.09523809523809523, ('are', 'powerful'): 0.09523809523809523, ('powerful', 'tools'): 0.09523809523809523, ('tools', 'like'): 0.09523809523809523, ('like', 'chatgpt'): 0.09523809523809523, ('chatgpt', 'assist'): 0.09523809523809523, ('assist', 'in'): 0.09523809523809523, ('in', 'various'): 0.09523809523809523, ('various', 'tasks'): 0.09523809523809523, ('tasks', 'such'): 0.09523809523809523, ('such', 'as'): 0.09523809523809523, ('as', 'next'): 0.09523809523809523, ('next', 'word'): 0.09523809523809523, ('word', 'prediction'): 0.09523809523809523}
Smoothed Trigrams: {('i', 'love', 'natural'): 0.11764705882352941, ('love', 'natural', 'language'): 0.1176470588

In [5]:
def predict_next_word(prefix, smoothed_ngrams):
    candidates = [(ngram[-1], prob) for ngram, prob in smoothed_ngrams.items() if ngram[:-1] == tuple(prefix)]
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[0][0] if candidates else None

prefix = ["natural", "language"]
next_word = predict_next_word(prefix, smoothed_trigrams) or predict_next_word(prefix[-1:], smoothed_bigrams)
print("Next word prediction:", next_word)

Next word prediction: processing
