In [1]:
import numpy as np
import pandas as pd
import nltk 
import re
import os
from nltk.tokenize import word_tokenize
from collections import defaultdict
import random

In [2]:
folder_path = "dataset"
content_list = []

# os.listdir() returns names in arbitrary order; sorting keeps the corpus (and
# therefore every n-gram count built from it) identical from run to run.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            content = file.read()
            content_list.append(content)

# Join with a newline rather than a space: the preprocessing below strips
# "<number>." only at the start of a line and treats newlines as sentence
# boundaries, so each file's first line must begin on its own line.
content_string = "\n".join(content_list)


def preprocess_text_first(text):
    """Lowercase ``text``, strip line-leading enumerators and mark line breaks.

    Every newline is replaced by ``'_ '`` so that, after tokenisation, a token
    containing ``'_'`` indicates where a line of the original text ended.

    Args:
        text: Raw text.

    Returns:
        The transformed string.
    """
    lowered = text.lower()
    # Drop a "12. "-style enumerator at the beginning of each line.
    without_numbers = re.sub(r"^\d+\.\s*", "", lowered, flags=re.MULTILINE)
    # Re-join the lines with the '_ ' end-of-line marker.
    return '_ '.join(without_numbers.split('\n'))

def preprocess_text_second(text):
    """Return ``text`` lowercased, without enumerators or punctuation.

    Steps, in order:
      1. lowercase;
      2. remove a "12. "-style enumerator at the start of each line;
      3. remove every character that is neither a word character nor whitespace;
      4. collapse all whitespace runs (including newlines) to one space and
         trim the ends.

    Punctuation is removed *before* whitespace is normalised so that deleting a
    free-standing symbol (e.g. " - ") cannot leave double or trailing spaces.

    Args:
        text: Raw text.

    Returns:
        A single-line string of space-separated words.
    """
    text = text.lower()
    text = re.sub(r"^\d+\.\s*", "", text, flags=re.MULTILINE)  # needs the original line starts
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Two views of the same corpus:
#   processedtext  - newlines replaced by '_ ' so line boundaries survive tokenisation
#   processedtext2 - punctuation removed and whitespace collapsed, with no '_' markers
processedtext = preprocess_text_first(content_string)
processedtext2 = preprocess_text_second(content_string)

In [None]:
tokens = word_tokenize(processedtext) # '_'-marked tokens, used to separate starting words and ending words
tokens2 = word_tokenize(processedtext2)  # tokens of the punctuation-free text, used for the n-gram counts

### Get Starting Words List

In [6]:
# A token containing '_' marks the end of a line, so the alphabetic token that
# follows it is taken as a sentence-starting word.
# Pairing each token with its successor via zip() stops one element early, so a
# '_' token in the very last position no longer raises IndexError.
starting_word = [
    following
    for marker, following in zip(tokens, tokens[1:])
    if '_' in marker and following.isalpha()
]

### Get Ending Words List

In [21]:
# Every token that carries the '_' end-of-line marker, in corpus order.
ending_words = [token for token in tokens if '_' in token]

In [93]:
# Remove the '_' marker and any '.' from each ending token, then keep only the
# tokens that still contain something.
stripped_ending_words = (
    token.replace('_', '').replace('.', '') for token in ending_words
)
cleaned_ending_words = [token for token in stripped_ending_words if token]

### Generating Counts of Unigrams, Bigrams, and Trigrams



In [None]:
def build_unigram(tokens2):
    """Return an ``nltk.FreqDist`` built from the tokens in ``tokens2``."""
    return nltk.FreqDist(tokens2)

In [27]:
def build_bigram(tokens2):
    """Return an ``nltk.FreqDist`` built from ``nltk.ngrams(tokens2, 2)``."""
    return nltk.FreqDist(nltk.ngrams(tokens2, 2))

In [28]:
def build_trigram(tokens2):
    """Return an ``nltk.FreqDist`` built from ``nltk.ngrams(tokens2, 3)``."""
    return nltk.FreqDist(nltk.ngrams(tokens2, 3))

### Training of Unigrams

In [29]:
# Frequency of every single token in the punctuation-free token list.
unigrams = build_unigram(tokens2)
unigrams

FreqDist({'aur': 472, 'ki': 393, 'ke': 328, 'phir': 324, 'ka': 243, 'baad': 221, 'se': 213, 'k': 202, 'or': 201, 'mein': 181, ...})

Generate Random Sentences using Unigrams

In [None]:
def generate_random_sentences_unigram(noofsentence = 5):
    """Generate sentences by sampling words independently from the unigram counts.

    Each sentence begins with a random entry of the global ``starting_word``
    list and is then filled up to a random length of 7-12 words with words
    drawn from the global ``unigrams`` mapping, weighted by their counts.

    Args:
        noofsentence: Number of sentences to generate.

    Returns:
        The sentences joined by newlines ("" when ``noofsentence`` is 0).
    """
    # The vocabulary and its weights never change, so build them once instead
    # of once per generated word.
    vocabulary = list(unigrams.keys())
    weights = list(unigrams.values())

    diary = []
    for _ in range(noofsentence):
        first_word = random.choice(starting_word)
        sentence_length = random.randint(7, 12)
        # One weighted draw per remaining position.
        following_words = random.choices(vocabulary, weights=weights, k=sentence_length - 1)
        diary.append(" ".join([first_word] + following_words))
    return "\n".join(diary)


In [31]:
print(generate_random_sentences_unigram(5))  # Prints 5 generated sentences, one per line

phir keen baare university kaam lunch university cafe gya me dostoun
raat kisi se main mujhe unhon or se khaya plans
kaam kaam nashtay wapas 11 se lene subah tak coffee
us kya parha ma gi torhi 20 kiya ke kae main futsal
ajj dip hm tha mausam karnay gayi deir gaye


### Training of Bigrams

In [32]:
# Frequency of every pair of adjacent tokens in the punctuation-free token list.
bigrams = build_bigram(tokens2)
bigrams

FreqDist({('ke', 'baad'): 63, ('aur', 'phir'): 55, ('kiya', 'aur'): 49, ('ki', 'namaz'): 49, ('k', 'baad'): 46, ('chala', 'gaya'): 45, ('ki', 'aur'): 44, ('ke', 'liye'): 44, ('or', 'phir'): 42, ('khana', 'khaya'): 39, ...})

Generate Random Sentences using Bigrams

In [None]:
def generate_random_sentences_bigram(noofsentence=5):
    """Generate sentences with a forward bigram model.

    Each sentence begins with a random entry of the global ``starting_word``
    list. The next word is then drawn repeatedly from the words that follow
    the current last word in the global ``bigrams`` mapping, weighted by the
    bigram counts, until a random target length of 7-12 words is reached.

    If the current word has no recorded successor, the sentence is ended early
    (previously this raised from ``random.choices`` on an empty population).

    Args:
        noofsentence: Number of sentences to generate.

    Returns:
        The sentences joined by newlines ("" when ``noofsentence`` is 0).
    """
    # Index successors once instead of scanning every bigram for every word.
    successors = defaultdict(list)
    for (previous, following), freq in bigrams.items():
        successors[previous].append((following, freq))

    diary = []
    for _ in range(noofsentence):
        sentence = [random.choice(starting_word)]  # Choose a starting word
        target_length = random.randint(7, 12)

        while len(sentence) < target_length:
            # .get() avoids inserting empty entries into the defaultdict.
            candidates = successors.get(sentence[-1])
            if not candidates:
                break  # dead end: nothing ever followed this word
            words, weights = zip(*candidates)
            # Raw counts are valid weights; random.choices normalises them.
            sentence.append(random.choices(words, weights=weights)[0])

        diary.append(" ".join(sentence))

    return "\n".join(diary)

In [34]:
print(generate_random_sentences_bigram(5))  # Prints 5 generated sentences, one per line

nashta kia 730 utha or phir a
uskai baad hum sary dost mil kr lia nikli
phir ghar aakr khana khaya lunch break thi
uske baad lagataar classes attend ki class cloud computing ki aur soogaau
ghar walon ke ooper thori discussion huwi


### Training of Trigrams

In [35]:
# Frequency of every run of three adjacent tokens in the punctuation-free token list.
trigrams = build_trigram(tokens2)
trigrams

FreqDist({('ki', 'namaz', 'parhi'): 16, ('so', 'gaya', '1'): 13, ('namaz', 'ada', 'ki'): 13, ('raat', 'ka', 'khana'): 12, ('university', 'ke', 'liye'): 12, ('khana', 'khaya', 'aur'): 11, ('ki', 'namaz', 'ada'): 11, ('nashta', 'kiya', 'aur'): 10, ('namaz', 'parhi', 'aur'): 10, ('ka', 'khana', 'khaya'): 10, ...})

Generate Random Sentences using Trigrams

In [None]:
def generate_random_sentences_trigram(noofsentence=5):
    """Generate sentences with a trigram model.

    For each sentence:
      1. the first word is a random entry of the global ``starting_word`` list;
      2. the second word is drawn from the global ``bigrams`` counts for that
         first word, or is another random starting word when the first word
         has no recorded successor;
      3. further words are drawn from the global ``trigrams`` counts for the
         last two words until a random target length of 7-12 words is reached.

    When the last two words have no trigram continuation, the model backs off
    to the bigram successors of the last word; if there are none either, the
    sentence is ended early. (Previously this case raised from
    ``random.choices`` on an empty population.)

    The target length counts the two seed words, so sentences have at most 12
    words like the other generators in this notebook.

    Args:
        noofsentence: Number of sentences to generate.

    Returns:
        The sentences joined by newlines ("" when ``noofsentence`` is 0).
    """
    # Index continuations once instead of scanning all n-grams for every word.
    bigram_next = defaultdict(list)
    for (first, second), freq in bigrams.items():
        bigram_next[first].append((second, freq))
    trigram_next = defaultdict(list)
    for (first, second, third), freq in trigrams.items():
        trigram_next[(first, second)].append((third, freq))

    diary = []
    for _ in range(noofsentence):
        # Seed the sentence with two words so the trigram model has a context.
        first_word = random.choice(starting_word)
        candidates = bigram_next.get(first_word)
        if candidates:
            words, weights = zip(*candidates)
            second_word = random.choices(words, weights=weights)[0]
        else:
            second_word = random.choice(starting_word)
        sentence = [first_word, second_word]

        target_length = random.randint(7, 12)
        while len(sentence) < target_length:
            # Prefer the trigram context; back off to the bigram context.
            candidates = (
                trigram_next.get((sentence[-2], sentence[-1]))
                or bigram_next.get(sentence[-1])
            )
            if not candidates:
                break  # dead end in both models
            words, weights = zip(*candidates)
            sentence.append(random.choices(words, weights=weights)[0])

        diary.append(" ".join(sentence))
    return "\n".join(diary)

In [37]:
print(generate_random_sentences_trigram(5))  # Prints 5 generated sentences, one per line

gaming keh baad dawai li aur sogia so
phir bhai ko school chora or phir baaqi classes li
ke bd mein ne project ko mukl krna tha
ghar per nahi tha toh raat ka khana khanay
us k bad neurologist ko refer ker dia unho


### Backward Bigram Model

In [158]:
def generate_random_sentence_backward_bigram(noofsentences = 5):
    """Generate sentences right-to-left with a backward bigram model.

    Each sentence starts from a random entry of the global
    ``cleaned_ending_words`` list, which becomes its last word. Words are then
    prepended one at a time, each drawn from the words that precede the current
    first word in the global ``bigrams`` mapping, weighted by the bigram
    counts. Generation stops at a random target length of 7-12 words, or
    earlier when the current first word has no recorded predecessor.

    Args:
        noofsentences: Number of sentences to generate.

    Returns:
        The sentences joined by newlines ("" when ``noofsentences`` is 0).
    """
    # Index predecessors once instead of scanning every bigram for every word.
    predecessors = defaultdict(list)
    for (previous, following), freq in bigrams.items():
        predecessors[following].append((previous, freq))

    diary = []
    for _ in range(noofsentences):
        sentence = [random.choice(cleaned_ending_words)]
        target_length = random.randint(7, 12)

        while len(sentence) < target_length:
            # .get() avoids inserting empty entries into the defaultdict.
            candidates = predecessors.get(sentence[0])
            if not candidates:
                break  # nothing ever preceded this word
            words, weights = zip(*candidates)
            # Insert at the beginning: the sentence grows backwards.
            sentence.insert(0, random.choices(words, weights=weights)[0])

        diary.append(" ".join(sentence))
    return "\n".join(diary)

In [159]:
print(generate_random_sentence_backward_bigram(5))  # Prints 5 generated sentences, one per line

agaya aur halka dard tha aur shower le
ki koshish ki dieting wali class li
730 pae main jaldi jaldi utha or salar ko alwida kaha
so gayi 5 beje nikal kar diya tha
ghar jaake picture khichwayi or phir se le movie laga


### Bidirectional Bigram Model

In [152]:
def generate_random_sentence_bidirectional_bigram(noofsentences=5):
    """Generate sentences by growing a seed word in both directions.

    Each sentence starts from a random token of the global ``tokens2`` list.
    At every step the "current word" is the word most recently added (or the
    seed). The global ``bigrams`` mapping supplies two candidate sets for it:
    words that follow it (forward) and words that precede it (backward). The
    direction whose most likely candidate has the higher conditional
    probability wins (ties go forward); a word is then drawn from that
    direction, weighted by the bigram counts, and appended or prepended.

    A direction with no candidates counts as probability 0, so the other
    direction is used (previously ``max()`` of an empty list raised
    ValueError). Generation stops at a random target length of 7-12 words, or
    earlier when the current word has candidates in neither direction.

    Args:
        noofsentences: Number of sentences to generate (default 5, matching the
            other generators in this notebook).

    Returns:
        The sentences joined by newlines ("" when ``noofsentences`` is 0).
    """
    # Index both directions once instead of scanning every bigram per step.
    successors = defaultdict(list)
    predecessors = defaultdict(list)
    for (previous, following), freq in bigrams.items():
        successors[previous].append((following, freq))
        predecessors[following].append((previous, freq))

    diary = []
    for _ in range(noofsentences):
        sentence = [random.choice(tokens2)]
        target_length = random.randint(7, 12)
        current_word = sentence[-1]

        while len(sentence) < target_length:
            forward = successors.get(current_word, [])
            backward = predecessors.get(current_word, [])
            if not forward and not backward:
                break

            # Highest conditional probability in each direction; 0 if empty.
            max_forward_prob = 0.0
            if forward:
                forward_freqs = [freq for _word, freq in forward]
                max_forward_prob = max(forward_freqs) / sum(forward_freqs)
            max_backward_prob = 0.0
            if backward:
                backward_freqs = [freq for _word, freq in backward]
                max_backward_prob = max(backward_freqs) / sum(backward_freqs)

            if max_backward_prob > max_forward_prob:
                words, weights = zip(*backward)
                sentence.insert(0, random.choices(words, weights=weights)[0])
                current_word = sentence[0]
            else:
                words, weights = zip(*forward)
                sentence.append(random.choices(words, weights=weights)[0])
                current_word = sentence[-1]

        diary.append(" ".join(sentence))
    return "\n".join(diary)

In [153]:
print(generate_random_sentence_bidirectional_bigram(5))  # Prints 5 generated sentences, one per line

neend mein ne 12 ya nahi aa farmaye 11 baje uth
kei karta raha dopahar ka sath azkaar
ki ho university ki namaz or me gya raha phir wajah se
kiya aur casual scrolling ki us k client ke
mil poohnch kr diya raha raha nahi tha tha tha kr
