In [None]:
import zipfile
# Unpack the uploaded dataset archive into "extracted_files", resolved against
# the current working directory. The archive is closed when the block exits.
with zipfile.ZipFile('/content/BPE_dataset.zip', mode='r') as zip_ref:
    zip_ref.extractall(path="extracted_files")


In [None]:
# Directory that load_data() scans for .txt files.
DATASET_DIR = "/content/extracted_files/dataset"

# Dictionary for Roman Urdu normalization: maps a token (key) to the spelling
# that should appear in the cleaned corpus (value). clean_data() looks up each
# whitespace-separated token here; tokens without an entry are left unchanged.
# Entries that map a word to itself are no-ops and are kept only so the
# table's contents stay exactly as they were.
# Every key is listed once: the earlier literal repeated "tha" and "acha",
# which Python accepts silently (the last occurrence wins).
normalization_dict = {
    "mein": "main", "me": "main", "mera": "mera", "mujhe": "mujhe", "kr": "kar",
    "kiya": "kiya", "ke": "ke", "ka": "ka", "tha": "tha", "hua": "hua",
    "nahi": "nahi", "nai": "nahi", "unke": "unke", "bht": "bohot", "bhut": "bohot",
    "bohat": "bohot", "acha": "acha", "achi": "achi", "aap": "aap",
    "tum": "tum", "uske": "uske", "uska": "uska", "sb": "sab", "kyunki": "kyunki",
    "koi": "koi", "aise": "aise", "wahan": "wahan", "udhar": "udhar", "yahan": "yahan",
    "se": "se", "ja": "ja", "gaya": "gaya", "gayi": "gayi", "liye": "liye",
    "pohunch": "pohunch", "beth": "baith", "bhai": "bhai", "doston": "dost", "kaam": "kaam"
}



In [None]:
import re
def clean_data(text, normalization=None):
    """Clean raw text line by line and return the cleaned text.

    Each line goes through these steps, in order:
      1. a leading run of digits followed by ")", ".", "-" or whitespace is
         removed (list numbering such as "3." or "12)");
      2. every character that is neither a word character nor whitespace is
         deleted (not replaced by a space);
      3. the line is lowercased;
      4. each whitespace-separated token is replaced through the
         normalization mapping (tokens without an entry are kept) and the
         tokens are re-joined with single spaces.

    Args:
        text: Raw text whose lines are separated by newline characters.
        normalization: Optional mapping of token -> replacement. When omitted,
            the module-level ``normalization_dict`` is used, so existing
            one-argument calls behave as before.

    Returns:
        The cleaned lines joined with newline characters; the number of lines
        is the same as in the input.
    """
    if normalization is None:
        normalization = normalization_dict

    cleaned_lines = []
    for raw_line in text.split('\n'):
        # Gotcha: this also strips a genuine leading number that is followed
        # by a space (e.g. "11 baje ..." becomes "baje ...").
        line = re.sub(r'^\d+[\).\-\s]+', '', raw_line)
        # Drop punctuation; \w keeps letters, digits and underscore.
        line = re.sub(r'[^\w\s]', '', line)
        line = line.lower()
        # split() with no argument also collapses runs of whitespace.
        words = [normalization.get(word, word) for word in line.split()]
        cleaned_lines.append(' '.join(words))

    return '\n'.join(cleaned_lines)

In [None]:
import os
def load_data(directory):
    """Read, clean and concatenate every ``.txt`` file directly inside `directory`.

    Files are processed in sorted filename order. os.listdir() returns entries
    in arbitrary order, so without sorting the corpus (and every n-gram that
    spans a file boundary) could differ from one machine or run to the next.

    Args:
        directory: Path of the folder to scan (sub-folders are not descended).

    Returns:
        The cleaned contents of all matching files joined by a single space,
        with leading/trailing whitespace stripped. An empty string when the
        folder holds no ``.txt`` files.
    """
    cleaned_files = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.txt'):
            path = os.path.join(directory, filename)
            # errors="ignore" silently drops bytes that are not valid UTF-8.
            with open(path, "r", encoding="utf-8", errors="ignore") as file:
                cleaned_files.append(clean_data(file.read()))
    # One join instead of repeated string += keeps this linear in corpus size.
    return " ".join(cleaned_files).strip()

In [None]:
# Build the cleaned corpus from the .txt files under DATASET_DIR.
corpus = load_data(DATASET_DIR)

# Print only the beginning of the corpus: dumping the whole text floods the
# notebook output and bloats the saved file. The length is still reported.
print(corpus[:1000])
print(len(corpus))

raat ko itla mili thi ke freelancing ki class cancel hogayi hai lehaza subah qareeb 11 baje main nlp ke lecture ke liye tayyar huwa namaz ada ki or university ke liye rawana huwa
taqreeban 1135 am tak lecture attend kiya or kaaf ikuch seekhne ko mila ke ye jo main abhi diary likh raha is ka asal maqsad kya hona chahiye
khair nlp ke baad deep learning ka lecture attend kiya
lectures ke baad class fellows se mulaqaat huwi or bohot qareebi dost omer se mila
us ne bataya ke uske bhai ka accident huwa tha or uske aik dost ki accident main mout waqe hogayi allah darjaat buland kare
afsoos ke baad mene cafe se pulao or omer ne biryani khayi gupshup ke baad ghar ko rawana huwe
ghar aake namaz ada ki or apni walida se baat ki unhe waqt dene ke baad chaye banayi or biscuits ke saath pi
uske baad office join karne se pehlay maghreb ki namaz ada ki or kal ki tarah office se aaj ke din ka haal bayaan kar raha hoon
thora sa break leke shami kebab fry khaya hai or 11 pm per office se sign out karke e

In [None]:
import math
from collections import defaultdict


# Tokenization: lowercase, then split on any run of whitespace
# (spaces and the newlines that separate corpus lines).
tokens = corpus.lower().split()

# Checking tokenization: show the first 100 tokens only, not the whole list.
print("Tokens:", tokens[:100])


Tokens: ['raat', 'ko', 'itla', 'mili', 'thi', 'ke', 'freelancing', 'ki', 'class', 'cancel', 'hogayi', 'hai', 'lehaza', 'subah', 'qareeb', '11', 'baje', 'main', 'nlp', 'ke', 'lecture', 'ke', 'liye', 'tayyar', 'huwa', 'namaz', 'ada', 'ki', 'or', 'university', 'ke', 'liye', 'rawana', 'huwa', 'taqreeban', '1135', 'am', 'tak', 'lecture', 'attend', 'kiya', 'or', 'kaaf', 'ikuch', 'seekhne', 'ko', 'mila', 'ke', 'ye', 'jo', 'main', 'abhi', 'diary', 'likh', 'raha', 'is', 'ka', 'asal', 'maqsad', 'kya', 'hona', 'chahiye', 'khair', 'nlp', 'ke', 'baad', 'deep', 'learning', 'ka', 'lecture', 'attend', 'kiya', 'lectures', 'ke', 'baad', 'class', 'fellows', 'se', 'mulaqaat', 'huwi', 'or', 'bohot', 'qareebi', 'dost', 'omer', 'se', 'mila', 'us', 'ne', 'bataya', 'ke', 'uske', 'bhai', 'ka', 'accident', 'huwa', 'tha', 'or', 'uske', 'aik', 'dost', 'ki', 'accident', 'main', 'mout', 'waqe', 'hogayi', 'allah', 'darjaat', 'buland', 'kare', 'afsoos', 'ke', 'baad', 'mene', 'cafe', 'se', 'pulao', 'or', 'omer', 'ne', 

In [None]:
# Frequency tables: unigrams are keyed by the word itself, bigrams and
# trigrams by tuples of consecutive words.
unigram_counts = defaultdict(int)
bigram_counts = defaultdict(int)
trigram_counts = defaultdict(int)

# Tally every token, every adjacent pair and every adjacent triple.
for word in tokens:
    unigram_counts[word] += 1
for pair in zip(tokens, tokens[1:]):
    bigram_counts[pair] += 1
for triple in zip(tokens, tokens[1:], tokens[2:]):
    trigram_counts[triple] += 1

# Total number of tokens (sum of all unigram frequencies)
total_unigrams = sum(unigram_counts.values())

# Number of distinct words; the V term in the smoothing denominators
vocab_size = len(unigram_counts)

# Show the first 100 entries of each table as a sanity check
for label, table in (
    ("Unigram counts:", unigram_counts),
    ("Bigram counts:", bigram_counts),
    ("Trigram counts:", trigram_counts),
):
    print(label, dict(list(table.items())[:100]))


Unigram counts: {'raat': 55, 'ko': 118, 'itla': 1, 'mili': 4, 'thi': 93, 'ke': 328, 'freelancing': 2, 'ki': 393, 'class': 89, 'cancel': 4, 'hogayi': 4, 'hai': 75, 'lehaza': 2, 'subah': 45, 'qareeb': 5, '11': 21, 'baje': 81, 'main': 312, 'nlp': 21, 'lecture': 23, 'liye': 113, 'tayyar': 2, 'huwa': 12, 'namaz': 86, 'ada': 18, 'or': 201, 'university': 128, 'rawana': 8, 'taqreeban': 6, '1135': 1, 'am': 4, 'tak': 82, 'attend': 23, 'kiya': 178, 'kaaf': 1, 'ikuch': 1, 'seekhne': 1, 'mila': 14, 'ye': 12, 'jo': 37, 'abhi': 17, 'diary': 5, 'likh': 8, 'raha': 50, 'is': 19, 'ka': 243, 'asal': 1, 'maqsad': 1, 'kya': 22, 'hona': 6, 'chahiye': 3, 'khair': 5, 'baad': 221, 'deep': 4, 'learning': 4, 'lectures': 10, 'fellows': 1, 'se': 213, 'mulaqaat': 2, 'huwi': 7, 'bohot': 34, 'qareebi': 1, 'dost': 87, 'omer': 4, 'us': 38, 'ne': 105, 'bataya': 1, 'uske': 43, 'bhai': 10, 'accident': 2, 'tha': 179, 'aik': 30, 'mout': 1, 'waqe': 1, 'allah': 5, 'darjaat': 1, 'buland': 1, 'kare': 1, 'afsoos': 1, 'mene': 26, 

In [None]:
# Function to compute unigram probability
def unigram_prob(word):
    """Return the add-one (Laplace) smoothed probability of `word`.

    Computes (count(word) + 1) / (total_unigrams + vocab_size) from the
    module-level count tables. The count is read with .get() so that asking
    about an unseen word does not insert a zero entry into the defaultdict.
    """
    return (unigram_counts.get(word, 0) + 1) / (total_unigrams + vocab_size)  # Laplace smoothing

# Function to compute bigram probability
def bigram_prob(w1, w2):
    """Return the add-one (Laplace) smoothed probability of `w2` following `w1`.

    Computes (count(w1, w2) + 1) / (count(w1) + vocab_size) from the
    module-level count tables. Counts are read with .get() so that unseen
    words or pairs are not inserted into the defaultdicts as a side effect.
    """
    pair_count = bigram_counts.get((w1, w2), 0)
    context_count = unigram_counts.get(w1, 0)
    return (pair_count + 1) / (context_count + vocab_size)  # Laplace smoothing

# Function to compute trigram probability
def trigram_prob(w1, w2, w3):
    """Return the add-one (Laplace) smoothed probability of `w3` following `w1 w2`.

    Computes (count(w1, w2, w3) + 1) / (count(w1, w2) + vocab_size) from the
    module-level count tables. Counts are read with .get() so that unseen
    word sequences are not inserted into the defaultdicts as a side effect.
    """
    triple_count = trigram_counts.get((w1, w2, w3), 0)
    context_count = bigram_counts.get((w1, w2), 0)
    return (triple_count + 1) / (context_count + vocab_size)  # Laplace smoothing


In [None]:
import math
import random
from collections import defaultdict

class NGramModel:
    """Count-based n-gram language model with add-one (Laplace) smoothing.

    State filled by train():
      n_gram_counts   -- tuple of n tokens -> number of occurrences
      n_1_gram_counts -- tuple of the first n-1 tokens of each counted n-gram
                         -> number of occurrences (left empty when n == 1)
      vocab           -- set of distinct training tokens
      total_tokens    -- number of training tokens seen so far
    """

    def __init__(self, n):
        self.n = n  # 1 for unigram, 2 for bigram, 3 for trigram
        self.n_gram_counts = defaultdict(int)
        self.n_1_gram_counts = defaultdict(int)
        self.vocab = set()
        self.total_tokens = 0

    def train(self, corpus):
        """Add the n-gram statistics of `corpus` to the model.

        The text is lowercased and split on whitespace. The method may be
        called several times: counts, vocabulary and the token total all
        accumulate (n-grams are not formed across two separate calls).
        """
        tokens = corpus.lower().split()
        self.vocab.update(tokens)
        # Accumulate rather than overwrite, so the total stays consistent with
        # the counts when train() is called more than once.
        self.total_tokens += len(tokens)

        for i in range(len(tokens) - self.n + 1):
            n_gram = tuple(tokens[i:i + self.n])
            self.n_gram_counts[n_gram] += 1

            if self.n > 1:
                # Context count = how often these n-1 tokens start an n-gram.
                self.n_1_gram_counts[n_gram[:-1]] += 1

    def get_probability(self, words):
        """Return the Laplace-smoothed probability of the given n-gram.

        For n == 1 this is (count + 1) / (total_tokens + |vocab|); otherwise
        (count(n-gram) + 1) / (count(context) + |vocab|), where the context is
        every token but the last.

        Counts are read with .get(): indexing a defaultdict would insert a
        zero-count entry for every unseen n-gram that is scored, and
        predict_next_word() iterates over those entries.
        """
        words = tuple(words)
        vocab_size = len(self.vocab)
        count = self.n_gram_counts.get(words, 0)
        if self.n == 1:
            return (count + 1) / (self.total_tokens + vocab_size)
        context_count = self.n_1_gram_counts.get(words[:-1], 0)
        return (count + 1) / (context_count + vocab_size)

    def predict_next_word(self, context):
        """Predict the word that follows `context`.

        Only the last n-1 items of `context` are used (none for a unigram
        model). Returns the most probable continuation seen in training; if
        the context was never seen, a random vocabulary word; if the model
        has not been trained at all, None.
        """
        if not self.vocab:
            return None  # nothing to choose from

        context = tuple(context[-(self.n - 1):]) if self.n > 1 else ()
        candidates = {
            n_gram[-1]: self.get_probability(n_gram)
            for n_gram, count in self.n_gram_counts.items()
            # count > 0 ignores placeholder entries that outside code may have
            # created by indexing the defaultdict directly.
            if count > 0 and n_gram[:-1] == context
        }

        if not candidates:
            return random.choice(list(self.vocab))  # Return random word if context unseen

        return max(candidates, key=candidates.get)  # Return most probable word

# Build one model per n-gram order (1, 2, 3) and fit each on the full corpus.
unigram_model, bigram_model, trigram_model = (NGramModel(order) for order in (1, 2, 3))

for ngram_model in (unigram_model, bigram_model, trigram_model):
    ngram_model.train(corpus)

print("Training completed!")


Training completed!


In [None]:
def perplexity(model, test_sentence):
    """Return the perplexity of `test_sentence` under `model`.

    The sentence is lowercased and split on whitespace, every window of
    ``model.n`` consecutive tokens is scored with ``model.get_probability``,
    and the result is exp(-(sum of log-probabilities) / M), where M is the
    number of windows scored.

    Normalizing by M (not by the token count) matters for n > 1: no padding
    is used, so only ``len(tokens) - n + 1`` predictions are made. Dividing by
    the token count would understate the perplexity of higher-order models.

    Args:
        model: Object exposing an integer ``n`` and ``get_probability(words)``.
        test_sentence: Text to evaluate.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If the sentence has fewer than ``model.n`` tokens, so
            there is nothing to score.
    """
    test_tokens = test_sentence.lower().split()
    n_gram_total = len(test_tokens) - model.n + 1
    if n_gram_total <= 0:
        raise ValueError(
            f"test sentence has {len(test_tokens)} token(s); "
            f"at least {model.n} are needed to score a {model.n}-gram"
        )

    log_prob = sum(
        math.log(model.get_probability(test_tokens[i:i + model.n]))
        for i in range(n_gram_total)
    )
    return math.exp(-log_prob / n_gram_total)

# Sentence used to compare the three models
test_sentence = "wahan apni dost se mili humne saath lunch"

# Report one perplexity per model order, lowest order first
for label, ngram_model in (
    ("Unigram Perplexity:", unigram_model),
    ("Bigram Perplexity:", bigram_model),
    ("Trigram Perplexity:", trigram_model),
):
    print(label, perplexity(ngram_model, test_sentence))


Unigram Perplexity: 689.0109787077564
Bigram Perplexity: 340.695440324013
Trigram Perplexity: 212.59491969939728


In [None]:
# Reference sentence: {wahan apni dost se mili humne saath lunch kiya aur baatein ki}
# The two words below are the context; each model predicts what comes next.
# (The unigram model ignores the context, the bigram model uses the last word.)
context = ["lunch","kiya"]

for label, ngram_model in (
    ("Unigram prediction:", unigram_model),
    ("Bigram prediction:", bigram_model),
    ("Trigram prediction:", trigram_model),
):
    print(label, ngram_model.predict_next_word(context))


Unigram prediction: aur
Bigram prediction: aur
Trigram prediction: aur


In [None]:
import random

class DiaryGenerator:
    """Generate a short diary-style sentence from a bigram and a trigram model.

    Both models must provide ``predict_next_word(context)``. Generation is
    greedy: each step appends whatever word the chosen model predicts.
    """

    def __init__(self, bigram_model, trigram_model, start_words):
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model
        self.start_words = start_words  # List of words that can start a sentence

    @staticmethod
    def _context_seen(model, context):
        """Return True if `model` counted `context` as an n-gram prefix in training.

        Models without an ``n_1_gram_counts`` table cannot be inspected and
        are assumed to handle any context themselves.
        """
        prefix_counts = getattr(model, "n_1_gram_counts", None)
        if prefix_counts is None:
            return True
        # .get() avoids inserting a key when the table is a defaultdict.
        return prefix_counts.get(tuple(context), 0) > 0

    def generate_diary_entry(self, max_words=None):
        """Return one generated sentence, capitalized and ending with a period.

        Args:
            max_words: Upper bound on the number of words. When None, a random
                length between 7 and 12 (inclusive) is drawn. The start word
                is always emitted, so the result has at least one word.

        Returns:
            The generated sentence as a string.

        Raises:
            IndexError: If ``start_words`` is empty (from random.choice).
        """
        if max_words is None:
            max_words = random.randint(7, 12)  # Random length between 7-12

        # Start with a random word from known starting words
        entry = [random.choice(self.start_words)]

        while len(entry) < max_words:
            last_two = entry[-2:]
            if len(entry) >= 2 and self._context_seen(self.trigram_model, last_two):
                # Use trigram model if possible
                next_word = self.trigram_model.predict_next_word(last_two)
            else:
                # Back off to the bigram model: either only one word exists so
                # far, or the trigram model never saw this two-word context
                # and would only return a random vocabulary word.
                next_word = self.bigram_model.predict_next_word(entry[-1:])

            if not next_word:
                break  # Stop if no valid word found

            entry.append(next_word)

        return " ".join(entry).capitalize() + "."

# Candidate opening words for a generated diary entry.
# NOTE(review): the unigram counts printed earlier show the spelling "subah";
# confirm that "subha" also occurs in the corpus, otherwise the first
# prediction after it falls back to a random vocabulary word.
start_words = ["aaj", "raat", "kal", "subha", "shaam", "jab"]

# Fresh bigram and trigram models, fitted on the full corpus
bigram_model, trigram_model = NGramModel(2), NGramModel(3)
for ngram_model in (bigram_model, trigram_model):
    ngram_model.train(corpus)

# Produce and show a single entry
diary_generator = DiaryGenerator(bigram_model, trigram_model, start_words)
print(diary_generator.generate_diary_entry())


Aaj subah 10 baje utha aur nashta kiya aur phir university kai.
