In [87]:
import re
import random
def test_train_split(corpus, n):
    # remove new line
    corpus = corpus.replace('\n', ' ')
    # split into sentences
    sentences = re.split(r'(?<=[.!?]) +', corpus)
    test_sentences = random.sample(sentences, n)
    train_sentences = [sentence for sentence in sentences if sentence not in test_sentences]
    return test_sentences, train_sentences


def tokenize(text):
    url_pattern1 = r"(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"
    url_pattern2 = r'www\.[^\s\.]+(?:\.[^\s\.]+)*(?:[\s\.]|$)'
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    mention_pattern = "@\w+"
    hastag_pattern = "#[a-z0-9_]+"
    normal_pattern = "[a-zA-Z]+"
    number_pattern = "[0-9]+"
    tokens = []
    text = text.lower()
    text = re.sub(url_pattern1, '<URL> ', text)
    text = re.sub(url_pattern2, '<URL> ', text)
    text = re.sub(email_pattern, '<MAILID> ', text)
    text = re.sub(hastag_pattern, '<HASHTAG> ', text)
    text = re.sub(mention_pattern, '<MENTION> ', text)
    text = re.sub(number_pattern, '<NUM> ', text)
    tokens = re.findall(r'\b\w+|[^\s\w<>]+|<\w+>', text)
    return tokens


coupus_path = './corpus'
corpus1 = "./corpus/Pride and Prejudice - Jane Austen.txt"
corpus2 = "./corpus/Ulysses  James Joyce.txt"
with open(corpus1, 'r', encoding='utf-8') as f:
    text1 = f.read()
test_sentences, train_sentences = test_train_split(text1, 1000)

In [88]:
def perform_cleaning(text):
    # remove comma, extra spaces, and punctuations
    text = re.sub(r'[,!?;-]+', '', text)
    if text.endswith('.'):
            text = text[:-1]#removing last dot also
    return text
def PerformNgram(corpus, n):
    pattern = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    list_sentences = re.split(pattern, corpus)
    ngrams = {}
    for sentence in list_sentences:
        tokens = tokenize(sentence)
        # sentence = (n-1)*"<START> "+ sentence
        for i in range(len(tokens)-n+1):
            temp = tuple(tokens[j] for j in range(i, i+n))  # Convert list to tuple
            if temp in ngrams:
                ngrams[temp] += 1
            else:
                ngrams[temp] = 1
            
    return ngrams

In [89]:
unigram_count = PerformNgram(" ".join(train_sentences), 1)
bigram_count = PerformNgram(" ".join(train_sentences), 2)
trigram_count = PerformNgram(" ".join(train_sentences), 3)
# # saving the ngrams
# with open('unigram_count.txt', 'w') as f:
#     for key, value in unigram_count.items():
#         f.write('%s:%s\n' % (key, value))
# with open('bigram_count.txt', 'w') as f:
#     for key, value in bigram_count.items():
#         f.write('%s:%s\n' % (key, value))
# with open('trigram_count.txt', 'w') as f:
#     for key, value in trigram_count.items():
#         f.write('%s:%s\n' % (key, value))

In [90]:
# Now I have to find all probabilities
unigram_prob = {}
bigram_prob = {}
trigram_prob = {}
for key, value in unigram_count.items():
    unigram_prob[key] = value / len(unigram_count)
# print(unigram_prob)

# probabilty of bigram
# p(w2|w1) = count(w1, w2)/count(w1)
for bigram, count in bigram_count.items():
    w1 = bigram[0]
    w1_token = (w1,)
    bigram_prob[bigram] = count / unigram_count[w1_token]
# with open('bigram_prob.txt', 'w') as f:
#     for key, value in bigram_prob.items():
#         f.write('%s:%s\n' % (key, value))

# similarly for trigram
for trigram, w1_w2_w3 in trigram_count.items():
    w1_w2_token = (trigram[0], trigram[1],)
    trigram_prob[trigram] = w1_w2_w3 / bigram_count[w1_w2_token]
# with open('trigram_prob.txt', 'w') as f:
#     for key, value in trigram_prob.items():
#         f.write('%s:%s\n' % (key, value))

In [91]:
def linear_interpolation(trigram, unigram_prob, bigram_prob, trigram_prob):
    lambda1 = 0.4
    lambda2 = 0.3
    lambda3 = 0.3
    unigram_probability = unigram_prob.get(trigram[-1], 0)
    bigram_probability = bigram_prob.get(tuple(trigram[-2:]), 0)
    trigram_probability = trigram_prob.get(tuple(trigram[-3:]), 0)

    unigram_probability = lambda3 * unigram_probability  
    if unigram_probability == 0:
        unigram_probability = 0.00001
    
    trigram_probability = lambda1 * trigram_probability
    if trigram_probability == 0:
        trigram_probability = 1/len(trigram_prob)

    bigram_probability = lambda2 * bigram_probability
    if bigram_probability == 0:
        bigram_probability = 1/len(bigram_prob)

    interpolated_prob = trigram_probability + bigram_probability + unigram_probability
    return interpolated_prob


test_trigram_count = PerformNgram(" ".join(test_sentences), 3)
with open('test_trigram_cnt.txt', 'w') as f:
    for key, value in test_trigram_count.items():
        f.write('%s:%s\n' % (key, value))

test_trigram_prob = {}
for trigram, count in test_trigram_count.items():
    test_trigram_prob[trigram] = linear_interpolation(trigram, unigram_prob, bigram_prob, trigram_prob)
    # break
with open('test_trigram_prob.txt', 'w') as f:
    for key, value in test_trigram_prob.items():
        f.write('%s:%s\n' % (key, value))   

In [92]:
# <!-- Now finding perplexity -->
import math
def perplexity(sentence, unigram_prob, bigram_prob, trigram_prob):
    tokens = tokenize(sentence)
        # in this tuple add <start> <start> at the start of the sentence and <end> at the end of the sentence
    tokens = ('<START>', '<START>',) + tuple(tokens) + ('<END>',)
    # break  
    log_probability_sum = 0.0
    trigram_count = 1
    for i in range(len(tokens)-3):
        trigram = tuple(tokens[i:i+3])
        trigram_count += 1
        temp_prob = math.log(linear_interpolation(trigram, unigram_prob, bigram_prob, trigram_prob))
        # log_probability_sum += math.log(linear_interpolation(trigram, unigram_prob, bigram_prob, trigram_prob))
        log_probability_sum += temp_prob

    sentence_perplexity = math.exp(-(log_probability_sum / trigram_count))
    return sentence_perplexity

Perplexity for train

In [93]:
# Now I have to find the perplexity
train_perplexity = {}#perplexity for each sentence
for sentence in train_sentences:
    token = tokenize(sentence)
    sentence = ' '.join(token)
    train_perplexity[sentence] = perplexity(sentence, unigram_prob, bigram_prob, trigram_prob)
# Find avg perplexity
avg_perplexity = sum(train_perplexity.values()) / len(train_perplexity)
with open('./2022201041_LM4_train-perplexity.txt', 'w', encoding='utf-8') as f:
    f.write('Average Perplexity: %s\n\n' % avg_perplexity)
    for key, value in train_perplexity.items():
        f.write('%s : %s\n' % (key, value))



For each sentence in test

Perplexity for test

In [94]:
# Now I have to find the perplexity
test_perplexity = {}#perplexity for each sentence
for sentence in test_sentences:
    token = tokenize(sentence)
    sentence = ' '.join(token)
    test_perplexity[sentence] = perplexity(sentence, unigram_prob, bigram_prob, trigram_prob)
avg_perplexity = sum(test_perplexity.values())/ len(test_perplexity)
with open('2022201041_LM4_test-perplexity.txt', 'w') as f:
    f.write('Average Perplexity: %s\n\n' % avg_perplexity)
    for key, value in test_perplexity.items():
        f.write('%s : %s\n' % (key, value))

In [96]:
st = 'An apple a day keeps the doctor'
token = tokenize(st)

In [97]:
w1 = token[-2:]
w1 = tuple(w1,)
w1

('the', 'doctor')

In [119]:
import heapq
st = 'a carat character is'
token = tokenize(st)
w1 = token[-2:]
w1 = tuple(w1,)
k = 3
# Initialize a dictionary to store word probabilities
word_probabilities = {}
for sentence in train_sentences:
    # Split the sentence into words
    words = tokenize(sentence)
    
    # Extract the last two words and convert them into a tuple
    for eachword in words:
        w = (w1,eachword)
        w = w[0] + (w[1],)
        prob = linear_interpolation(w,unigram_prob, bigram_prob, trigram_prob)
        word_probabilities[eachword] = prob
# Print the list of tuples
sorted_word_probabilities = sorted(word_probabilities.items(), key=lambda x: x[1], reverse=True)
for word, prob in sorted_word_probabilities[:k]:
    print(f"Probability: {prob}, Word: {word}")

Probability: 0.1350146425255339, Word: more
Probability: 0.1337611606313835, Word: used
Probability: 0.1337611606313835, Word: thereby


In [99]:
k = 3

In [100]:
import heapq
min_heap = []

# Iterate through each sentence in train_sentences
for sentence in train_sentences:
    # Split the sentence into words
    words = tokenize(sentence)
    
    # Iterate through each word in the sentence
    for i in range(len(words)):
        # Extract the last two words and current word
        if i >= 2:
            w = tuple(words[i-2:i+1])
            # Calculate probability using linear interpolation
            prob = linear_interpolation(w, unigram_prob, bigram_prob, trigram_prob)
            # Push the probability and word tuple into the min heap
            # Negate the probability to use min heap as max heap
            heapq.heappush(min_heap, (-prob, w))
            # If the heap size exceeds k, pop the smallest element
            if len(min_heap) > k:
                heapq.heappop(min_heap)

# Now min_heap contains the k tuples with the highest probabilities
# You can iterate through the heap to access the top k tuples
# To access the probability and word tuple, you can negate the probability again
# to restore the original positive value

# Example of accessing the top k tuples
for neg_prob, word_tuple in min_heap:
    prob = -neg_prob
    print(f"Probability: {prob}, Word Tuple: {word_tuple}")

Probability: 0.00040028696419796574, Word Tuple: (',', 'and', 'wondered')
Probability: 0.00040028696419796574, Word Tuple: (',', 'and', 'wretched')
Probability: 0.00040028696419796574, Word Tuple: (',', 'and', 'written')
