In [23]:
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from nltk.corpus import brown
import nltk
from sklearn.model_selection import KFold
import random
# Download the Brown Corpus if not already available
nltk.download('brown')



[nltk_data] Downloading package brown to C:\Users\HARSHIT
[nltk_data]     JAIN\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [32]:

class TrigramLanguageModel:
    def __init__(self):
        self.unigram_counts = {}
        self.bigram_counts = {}
        self.trigram_counts = {}
        self.vocab = set()
        self.total_unigrams = 0

    def preprocess(self, corpus):
        processed_corpus = []
        for sentence in corpus:
            processed_sentence = [word.lower() for word in sentence]
            processed_corpus.append(processed_sentence)
        return processed_corpus

    def train(self, corpus):
        for sentence in corpus:
            tokens = ['<s>', '<s>'] + sentence + ['</s>']
            for i in range(len(tokens)):
                if tokens[i] not in self.unigram_counts:
                    self.unigram_counts[tokens[i]] = 1
                self.unigram_counts[tokens[i]] += 1
                self.vocab.add(tokens[i])
                if i > 0:
                    bigram = (tokens[i - 1], tokens[i])
                    if bigram not in self.bigram_counts:
                        self.bigram_counts[bigram] = 1
                    self.bigram_counts[bigram] += 1
                if i > 1:
                    trigram = (tokens[i - 2], tokens[i - 1], tokens[i])
                    if trigram not in self.trigram_counts:
                        self.trigram_counts[trigram] = 0
                    self.trigram_counts[trigram] += 1

            self.total_unigrams += (len(tokens) - 3)

    def laplace_smoothing(self, w1, w2, w3):
        trigram = (w1, w2, w3)
        bigram = (w1, w2)
        trigram_count = 0
        if trigram in self.trigram_counts :
              trigram_count = self.trigram_counts[trigram]
        bigram_count = 0
        if bigram in self.bigram_counts :
              bigram_count = self.bigram_counts[bigram]
        vocab_size = len(self.vocab)
        return (trigram_count + 1) / (bigram_count + vocab_size)

    def interpolation(self, w1, w2, w3, lambdas=(0.1, 0.3, 0.6)):
        """Calculates probability using simple interpolation."""
        lambda1, lambda2, lambda3 = lambdas
        unigram_prob = self.unigram_counts.get(w3, 0) / self.total_unigrams if w3 in self.unigram_counts else 1 / len(self.vocab)
        bigram_prob = self.bigram_counts.get((w2, w3), 0) / self.unigram_counts.get(w2, 1)
        trigram_prob = self.trigram_counts.get((w1, w2, w3), 0) / self.bigram_counts.get((w1, w2), 1)
        return lambda1 * unigram_prob + lambda2 * bigram_prob + lambda3 * trigram_prob

    def sentence_probability(self, sentence, method='laplace',lambdas=(0.1, 0.3, 0.6)):
        tokens = ['<s>', '<s>'] + sentence + ['</s>']
        log_prob = 0
        for i in range(2, len(tokens)):
            if method == 'laplace':
                prob = self.laplace_smoothing(tokens[i - 2], tokens[i - 1], tokens[i])
            elif method == 'interpolation':
                prob = self.interpolation(tokens[i - 2], tokens[i - 1], tokens[i],lambdas)
            log_prob += math.log(prob)
        return log_prob

    def calculate_perplexity(self, test_corpus, method='laplace',lambdas=(0.1, 0.3, 0.6)):
        total_log_prob = 0
        total_words = 0
        for sentence in test_corpus:
            total_log_prob += self.sentence_probability(sentence, method,lambdas=(0.1, 0.3, 0.6))
            total_words += len(sentence)
        avg_log_prob = total_log_prob / total_words
        perplexity = math.exp(-avg_log_prob)
        return perplexity
        
    def cross_validate_grid_search(self, corpus, lambdas_grid, num_folds=5):
        """Grid search to find the best interpolation parameters."""
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
        best_lambdas = None
        best_perplexity = float('inf')

        # Prepare for cross-validation
        for lambdas in lambdas_grid:
            total_perplexity = 0
            for train_index, val_index in kf.split(corpus):
                train_data = [corpus[i] for i in train_index]
                val_data = [corpus[i] for i in val_index]
                
                # Train the model on the training fold
                self.train(train_data)
                
                # Calculate the perplexity on the validation fold
                perplexity = self.calculate_perplexity(val_data, method='interpolation', lambdas=lambdas)
                total_perplexity += perplexity
            
            avg_perplexity = total_perplexity / num_folds
            print(f"Lambdas: {lambdas}, Perplexity: {avg_perplexity}")
            
            # Update best parameters if the current perplexity is better
            if avg_perplexity < best_perplexity:
                best_perplexity = avg_perplexity
                best_lambdas = lambdas

        print(f"Best lambda parameters: {best_lambdas} with Perplexity: {best_perplexity}")
        return best_lambdas
        
    def generate_sentence(self, method='laplace', max_length=20, lambdas = (1.0, 0.0, 0.0)):
        sentence = ['<s>', '<s>']
        for _ in range(max_length):
            candidates = list(self.vocab)
            probabilities = []
            for word in candidates:
                if method == 'laplace':
                    prob = self.laplace_smoothing(sentence[-2], sentence[-1], word)
                elif method == 'interpolation':
                    prob = self.interpolation(sentence[-2], sentence[-1], word, lambdas)
                probabilities.append(prob)
            # Normalize probabilities
            probabilities_sum = sum(probabilities)
            probabilities = [p / probabilities_sum for p in probabilities]
            next_word = random.choices(candidates, probabilities)[0]
            if next_word == '</s>':
                break
            sentence.append(next_word)
        return ' '.join(sentence[2:])




In [40]:

sentences = brown.sents(categories='news') 
corpus = [list(map(str.lower, sentence)) for sentence in sentences]
model = TrigramLanguageModel()
processed_corpus = model.preprocess(corpus)
train_corpus, test_corpus = train_test_split(processed_corpus, test_size=0.2, random_state=42)

model.train(train_corpus)

# Calculate perplexity using both methods
perplexity_laplace = model.calculate_perplexity(test_corpus, method='laplace')
perplexity_interpolation = model.calculate_perplexity(test_corpus, method='interpolation',lambdas=(1.0, 0.0, 0.0))

print("Perplexity (Laplace smoothing):", perplexity_laplace)
print("Perplexity (Interpolation):", perplexity_interpolation)

# Generate 20 sentences using both methods
print("\nGenerated sentences using Laplace smoothing:")
for _ in range(20):
    print(model.generate_sentence(method='laplace'))

print("\nGenerated sentences using Interpolation:")
for _ in range(20):
    print(model.generate_sentence(method='interpolation',lambdas = (1.0, 0.0, 0.0)))

Perplexity (Laplace smoothing): 12765.118789527742
Perplexity (Interpolation): 1243.8422400852337

Generated sentences using Laplace smoothing:
kaiser hillsboro frothier noses belanger stop investigation greens mingle broadcast dichotomy optimism offers u. barker banshees dumont unlikely cox mercy
protocol livermore paying oust finan defendants marines journalism fullback aerials bound plead jacqueline phases ryne werner 3rd $15,000,000 renew fiercest
$43,000 flower schenk book-review sports r-bergen notable beaten $16 choking unlikely probing gil leonard building clearing twenty dallas-headquartered bearing makes
broadway emphasized 1947-49 1,700 smelts excessive phoenix turner suspect deliberation simpson's join dealt diplomats stabilization $4,177.37 dress rotelli distance $4,800
1896 bootle's minneapolis eva 100-yard piety stages letting sexton among hits river downed drexel's ponce grasp 1688 wages la scoreboards
the something extraordinarily 70,000 tragedies proven school's 281 2

In [None]:
lambda_values = np.arange(0.0, 1.05, 0.05) 
lambdas_grid = [(lambda1, lambda2, 1 - lambda1 - lambda2) 
                    for lambda1 in lambda_values 
                    for lambda2 in lambda_values if lambda1 + lambda2 <= 1]
best_lambdas = model.cross_validate_grid_search(train_corpus, lambdas_grid)