In [1]:
#Question 1

import re

#Phrases searched for (in lower case) around a date when its day/month order
#cannot be settled from the numbers alone
us_terms = [
    "scheduled for",
    "held on",
]
uk_terms = [
    "registration deadline",
    "due on",
    "final report",
]

#Input text to scan for dates, and the file the classification is written to
input_file_path = "date_format_dd_mm_yyyy.txt"
output_file_path = "KushalChandani_kc07535.txt"

#Check context and determining date format
def determine_date_format(date, context):
    """Label a slash-separated date as "DD/MM/YYYY", "MM/DD/YYYY" or "Ambiguous".

    Args:
        date: String of three integer fields separated by '/'.
        context: Text surrounding the date; it is lower-cased before being
            searched for the phrases in ``us_terms`` and ``uk_terms``.

    Returns:
        "DD/MM/YYYY" when the first field exceeds 12, "MM/DD/YYYY" when the
        second field exceeds 12; otherwise the label tied to the first phrase
        list that has a match in the context, or "Ambiguous" if none matches.

    Raises:
        ValueError: If ``date`` does not split into exactly three integers.
    """
    first, second, _year = (int(field) for field in date.split('/'))

    #A value above 12 cannot be a month, so the numbers alone settle the order
    if first > 12:
        return "DD/MM/YYYY"
    if second > 12:
        return "MM/DD/YYYY"

    lowered = context.lower()

    def mentions(terms):
        return any(term in lowered for term in terms)

    #NOTE(review): the list named us_terms maps to DD/MM/YYYY and uk_terms to
    #MM/DD/YYYY, which is the reverse of the usual US/UK conventions - confirm
    #whether the list names or the returned labels are the intended ones
    if mentions(us_terms):
        return "DD/MM/YYYY"
    if mentions(uk_terms):
        return "MM/DD/YYYY"

    #Both fields are <= 12 and no phrase matched
    return "Ambiguous"

#Reading the input file
with open(input_file_path, 'r') as file:
    text = file.read()

#Extracting dates using regex. The match objects are kept (rather than just the
#matched strings) so every date is paired with its own position in the text;
#looking the string up again with text.find() would always land on the first
#occurrence and give repeated dates the wrong context.
date_matches = list(re.finditer(r'\b\d{2}/\d{2}/\d{4}\b', text))
dates = [match.group() for match in date_matches]

#Number of characters of context taken on each side of a date
context_range = 50

#Preparing output list
output = []

#Checking each date and determine its format
for match in date_matches:
    date = match.group()

    #Context window: context_range characters before the date starts and
    #context_range characters after it ends, clamped to the text bounds
    context_start = max(0, match.start() - context_range)
    context_end = min(len(text), match.end() + context_range)
    context = text[context_start:context_end]

    #Determining the date format
    date_format = determine_date_format(date, context)
    output.append(f"{date}: {date_format}")

#Writing the results to the output file
with open(output_file_path, 'w') as file:
    for line in output:
        file.write(line + '\n')

print(f"Results saved in {output_file_path}.")


Results saved in KushalChandani_kc07535.txt.


In [2]:
#Question 2

import re
from collections import defaultdict

#Tokenizing text with punctuation
def tokenize_with_punctuation(text):
    """Split text into word tokens and single punctuation characters.

    A token is either a run of word characters (``\\w+``) or one character
    that is neither a word character nor whitespace. Whitespace is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        List of token strings in the order they occur in ``text``.
    """
    token_pattern = r"\w+|[^\w\s]"
    return re.findall(token_pattern, text, flags=re.UNICODE)

#Reading text from the file and create vocabulary
def reading(file_path):
    """Read a text file and count how often each token occurs in it.

    The file content is tokenized with ``tokenize_with_punctuation``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each token to its number of occurrences.
    """
    with open(file_path, 'r') as source:
        content = source.read()

    token_counts = defaultdict(int)
    for token in tokenize_with_punctuation(content):
        token_counts[token] += 1
    return token_counts

#Computing pair scores based on frequency
def compute_pair_scores(splits, vocab):
    """Score every adjacent pair of pieces found in the current splits.

    Each piece and each adjacent pair is weighted by the frequency of the word
    it occurs in. The score of a pair is
    ``pair_count / (count_of_first_piece * count_of_second_piece)``.

    Args:
        splits: Mapping from word to its current list of pieces.
        vocab: Mapping from word to its frequency.

    Returns:
        Dict mapping ``(first_piece, second_piece)`` tuples to a float score,
        in the order the pairs were first encountered.
    """
    piece_counts = defaultdict(int)
    pair_counts = defaultdict(int)

    for word, count in vocab.items():
        pieces = splits[word]
        for left, right in zip(pieces, pieces[1:]):
            piece_counts[left] += count
            pair_counts[(left, right)] += count
        # The last piece is never the left element of a pair, so count it here;
        # for a single-piece word this is the only count recorded.
        piece_counts[pieces[-1]] += count

    scores = {}
    for (left, right), count in pair_counts.items():
        scores[(left, right)] = count / (piece_counts[left] * piece_counts[right])
    return scores

#Merging the most frequent pair
def merge_pair(a, b, splits, vocab):
    """Replace every adjacent occurrence of pieces ``a``, ``b`` by one merged piece.

    The merged piece is ``a`` followed by ``b`` with a leading "##" removed
    (or by ``b`` unchanged when it has no such prefix).

    Args:
        a: First piece of the pair.
        b: Second piece of the pair.
        splits: Mapping from word to its current list of pieces; entries are
            reassigned in place.
        vocab: Mapping whose keys are the words to process.

    Returns:
        The same ``splits`` mapping, after merging.
    """
    merged_piece = a + (b[2:] if b.startswith("##") else b)

    for word in vocab:
        pieces = splits[word]
        if len(pieces) == 1:
            continue

        pos = 0
        while pos < len(pieces) - 1:
            if (pieces[pos], pieces[pos + 1]) == (a, b):
                # Stay on this position after merging: the new piece is
                # compared against its new right-hand neighbour next.
                pieces = pieces[:pos] + [merged_piece] + pieces[pos + 2:]
                continue
            pos += 1
        splits[word] = pieces

    return splits

#Running the WordPiece algorithm with specified merges
def wordpiece_algorithm(file_path, num_merges=10):
    """Learn WordPiece merges from a text file and print each merge.

    Every word from the file is first split into single characters, with all
    but the first character prefixed by "##". Then, up to ``num_merges`` times,
    the highest-scoring adjacent pair (as computed by ``compute_pair_scores``)
    is merged everywhere and the new token is added to the vocabulary.

    Args:
        file_path: Path of the training text file.
        num_merges: Maximum number of merges to perform.

    Returns:
        List of tokens: the sorted initial symbols followed by the merged
        tokens in the order they were created. Fewer than ``num_merges`` tokens
        are added when no mergeable pair remains.
    """
    vocab = reading(file_path)

    splits = {
        word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
        for word in vocab
    }

    # The initial alphabet is exactly the set of pieces present in the splits;
    # a set avoids a linear membership scan for every character.
    initial_symbols = set()
    for pieces in splits.values():
        initial_symbols.update(pieces)
    alphabet = sorted(initial_symbols)

    for _ in range(num_merges):
        scores = compute_pair_scores(splits, vocab)
        if not scores:
            # Nothing left to merge (empty input, or every word is already a
            # single token); stop instead of unpacking a non-existent pair.
            break

        # max() returns the first pair with the highest score, matching a
        # manual scan that only replaces the best on a strictly greater score.
        best_pair = max(scores, key=scores.get)
        splits = merge_pair(*best_pair, splits, vocab)

        left, right = best_pair
        new_token = left + (right[2:] if right.startswith("##") else right)
        print(f"Merged: {best_pair} -> {new_token}")
        alphabet.append(new_token)

    return alphabet

if __name__ == "__main__":
    #Train on the assignment's input file with ten merge steps
    file_path = "wordpiece_input.txt"
    wordpiece_algorithm(file_path=file_path, num_merges=10)


Merged: ('1', '##0') -> 10
Merged: ('o', '##f') -> of
Merged: ('##f', '##y') -> ##fy
Merged: ('e', '##x') -> ex
Merged: ('##m', '##p') -> ##mp
Merged: ('##q', '##u') -> ##qu
Merged: ('##b', '##u') -> ##bu
Merged: ('##mp', '##l') -> ##mpl
Merged: ('##bu', '##l') -> ##bul
Merged: ('ex', '##a') -> exa


In [3]:
# Question 3

import nltk
import re

#Reading the Urdu text from the file into a single string. The encoding is
#passed explicitly as UTF-8 so decoding does not rely on the platform default.
with open('urdu_text_input.txt', 'r', encoding='utf-8') as file:
    urdu_text = file.read()

#Tokenization using NLTK
def tokenize_nltk(text):
    """Tokenize text with NLTK's ``word_tokenize``.

    The 'punkt' resource is requested (quietly) on every call before
    tokenizing.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text``.
    """
    #NOTE(review): some newer NLTK releases appear to look up 'punkt_tab'
    #rather than 'punkt' for word_tokenize - confirm against the installed version
    nltk.download('punkt', quiet=True)
    tokens = nltk.word_tokenize(text)
    return tokens

#Tokenization using regex
def tokenize_regex(text):
    """Extract runs of characters in the range U+0600 to U+06FF from text.

    Every maximal run of consecutive characters inside that range becomes one
    token; all other characters act as separators and are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        List of matched runs in order of appearance.
    """
    arabic_block_run = re.compile(r'[\u0600-\u06FF]+')
    return arabic_block_run.findall(text)

#Tokenizing the same text with both approaches
tokens_nltk = tokenize_nltk(urdu_text)
tokens_regex = tokenize_regex(urdu_text)

#Writing each token list to its own file, one token per line
for output_name, tokens in (
    ('output_nltk.txt', tokens_nltk),
    ('output_regex.txt', tokens_regex),
):
    with open(output_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(tokens))