In [None]:
import random
import re
import unicodedata
from collections import Counter

import nltk
from nltk.metrics import edit_distance

file_paths = [r"D:\tourism.hi.txt", r"D:\second_dataset.hi.txt", r"D:\third_dataset.hi.txt"]
all_words = []


def _strip_punctuation(text):
    """Return `text` with everything except letters, marks, numbers, '_' and whitespace removed.

    A plain ``re.sub(r"[^\\w\\s]", "", text)`` is not usable for Devanagari:
    ``\\w`` follows ``str.isalnum()``, which rejects combining marks (Unicode
    categories Mn/Mc).  That silently deletes matras, the virama, anusvara and
    nukta, collapsing every Hindi word to its bare consonants.  Filtering on
    the Unicode general category keeps those marks while still dropping
    punctuation such as the danda.
    """
    return "".join(
        ch
        for ch in text
        if ch == "_" or ch.isspace() or unicodedata.category(ch)[0] in "LMN"
    )


# Read every corpus file, strip punctuation and collect the whitespace-separated
# tokens of all files into one flat list (file order is preserved).
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    text = _strip_punctuation(text)
    words = text.split()
    all_words.extend(words)

# Round-robin split: part i holds every `num_parts`-th token starting at
# offset i, so neighbouring tokens of the corpus land in different parts.
num_parts = 3
split_datasets = [all_words[i::num_parts] for i in range(num_parts)]

def generate_ngrams(words, n):
    """Return every run of `n` consecutive items of `words` as a tuple.

    The runs are listed in order of their starting position.  When `words`
    holds fewer than `n` items the result is an empty list.
    """
    ngrams = []
    last_start = len(words) - n
    for start in range(last_start + 1):
        window = words[start:start + n]
        ngrams.append(tuple(window))
    return ngrams

# Frequency of each individual token, and the total number of tokens counted.
unigram_counts = Counter(all_words)
total_unigrams = sum(unigram_counts.values())

# Frequency tables for word sequences of length 2, 3 and 4 over the whole corpus.
bigram_counts, trigram_counts, fourgram_counts = (
    Counter(generate_ngrams(all_words, order)) for order in (2, 3, 4)
)

def calculate_probabilities(ngram_counts, lower_order_counts):
    """Estimate P(last item | preceding items) for every n-gram by relative frequency.

    Args:
        ngram_counts: mapping from n-gram tuples to their counts.
        lower_order_counts: mapping holding the counts of the (n-1)-item
            prefixes.  Keys may be tuples or, for a unigram table, the bare
            items themselves.

    Returns:
        dict mapping each n-gram tuple to ``count(ngram) / count(prefix)``.
        A prefix that cannot be found in `lower_order_counts` is divided by
        the total of all counts in `lower_order_counts` instead; if that
        total is zero as well the probability is 0.0.
    """
    probabilities = {}
    fallback_total = None  # computed lazily, only if some prefix is missing
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]
        prefix_count = lower_order_counts.get(prefix)
        if prefix_count is None and len(prefix) == 1:
            # A unigram table is keyed by the bare item, not by a 1-tuple;
            # without this second lookup every bigram prefix is a miss and
            # all bigrams end up divided by the corpus size.
            prefix_count = lower_order_counts.get(prefix[0])
        if not prefix_count:
            if fallback_total is None:
                fallback_total = sum(lower_order_counts.values())
            prefix_count = fallback_total
        probabilities[ngram] = count / prefix_count if prefix_count else 0.0
    return probabilities

# Conditional probability tables: each order is normalised by the counts of
# the order directly below it (bigrams by unigrams, trigrams by bigrams, ...).
bigram_probs, trigram_probs, fourgram_probs = (
    calculate_probabilities(counts, context_counts)
    for counts, context_counts in (
        (bigram_counts, unigram_counts),
        (trigram_counts, bigram_counts),
        (fourgram_counts, trigram_counts),
    )
)

def introduce_typos(word):
    """Return `word` with one random character-level edit applied.

    Words of three characters or fewer are returned unchanged.  Otherwise an
    edit type (replace, swap, delete or insert) and a position between the
    second and the second-to-last character are drawn at random.  The last
    character is never touched.  A "swap" drawn for a four-character word
    leaves the word unchanged.
    """
    if len(word) <= 3:
        return word

    typo_type = random.choice(["replace", "swap", "delete", "insert"])
    pos = random.randint(1, len(word) - 2)
    before, after = word[:pos], word[pos + 1:]

    if typo_type == "replace":
        # Substitute the character at `pos` with a random Devanagari letter.
        return before + random.choice("अआइईउऊएऐओऔकखगघङचछजझटठडढणतथदधनपफबभमयरलवशषसह") + after
    if typo_type == "swap":
        if len(word) > 4:
            # Exchange the character at `pos` with its left neighbour.
            return word[:pos - 1] + word[pos] + word[pos - 1] + after
        return word
    if typo_type == "delete":
        return before + after
    if typo_type == "insert":
        # Push a random independent vowel in front of the character at `pos`.
        return before + random.choice("अआइईउऊ") + word[pos:]
    return word

def correct_spelling_levenshtein(word, word_list):
    """Return the entry of a random sample of `word_list` closest to `word`.

    At most 5000 entries are drawn (without replacement) from `word_list`, and
    the one with the smallest edit distance to `word` is returned.  Because
    only a sample is searched, the result can differ from call to call.
    """
    sample_size = min(len(word_list), 5000)
    sampled_words = random.sample(word_list, sample_size)

    def distance_to_word(candidate):
        return edit_distance(word, candidate)

    return min(sampled_words, key=distance_to_word)

def check_spelling(word):
    """Look up a correction for `word` in the corpus, print it and return it."""
    corrected_word = correct_spelling_levenshtein(word=word, word_list=all_words)
    print(f"Suggested correction: {corrected_word}")
    return corrected_word

def _levenshtein_hits(sample_size):
    """Count how many of the first `sample_size` corpus words are recovered after a typo."""
    hits = 0
    for index in range(sample_size):
        expected = all_words[index]
        suggestion = correct_spelling_levenshtein(introduce_typos(expected), all_words)
        if suggestion == expected:
            hits += 1
    return hits


# Baseline run on the first 50 corpus words.
correct_count = _levenshtein_hits(50)
accuracy_levenshtein = correct_count / 50
print(f"Spellchecking Accuracy using Levenshtein Distance: {accuracy_levenshtein:.2%}")

# Same measurement repeated for growing prefixes of the corpus; typos and the
# candidate sample are random, so the numbers vary between runs.
test_sizes = [50, 100, 200]
for size in test_sizes:
    correct_count = _levenshtein_hits(size)
    accuracy = correct_count / size
    print(f"Accuracy with {size} words: {accuracy:.2%}")

def hybrid_correction(word, word_list, ngram_probs, n):
    """Pick the vocabulary entry closest to `word`, breaking ties by n-gram score.

    Args:
        word: the (possibly misspelled) word to correct.
        word_list: iterable of known words; duplicates are allowed and are
            scored only once.
        ngram_probs: mapping from tuples to probabilities, used as tie-breaker.
        n: number of trailing characters of a candidate used for the lookup
            in `ngram_probs`.

    Returns:
        The candidate with the smallest edit distance to `word`; among equally
        distant candidates the one with the higher `ngram_probs` score, then
        the one seen first in `word_list`.  If `word_list` holds no candidate
        of comparable length, `word` itself is returned.
    """
    target_len = len(word)
    # A single insertion or deletion changes the length by one, so an exact
    # length match could never recover from those typos.  dict.fromkeys drops
    # duplicate tokens while keeping first-seen order.
    candidates = [
        w for w in dict.fromkeys(word_list)
        if abs(len(w) - target_len) <= 1
    ]
    if not candidates:
        # min() on an empty sequence raises ValueError; report "no correction".
        return word

    # NOTE(review): tuple(w[-n:]) is a tuple of *characters*; if `ngram_probs`
    # is keyed by tuples of words, this lookup can only ever hit for
    # single-character words and the tie-breaker is effectively always 0.
    # Scoring with a real word n-gram needs the preceding words, which this
    # signature does not receive — confirm the intent with the caller.
    return min(
        candidates,
        key=lambda w: (edit_distance(word, w), -ngram_probs.get(tuple(w[-n:]), 0)),
    )

# Evaluate the hybrid corrector on the first 50 corpus words: corrupt each
# word, correct it, and count exact recoveries.
correct_count = 0
for sample_index in range(50):
    expected_word = all_words[sample_index]
    noisy_word = introduce_typos(expected_word)
    if hybrid_correction(noisy_word, all_words, bigram_probs, 2) == expected_word:
        correct_count += 1
accuracy_hybrid = correct_count / 50
print(f"Hybrid Model Accuracy (Bigram + Edit Distance): {accuracy_hybrid:.2%}")

# Interactive check of a single user-supplied word.
user_input = input("Enter a word to check its spelling: ")
check_spelling(user_input)

print(f"The dataset was split into {num_parts} parts.")

Spellchecking Accuracy using Levenshtein Distance: 86.00%
Accuracy with 50 words: 88.00%
Accuracy with 100 words: 83.00%
Accuracy with 200 words: 80.00%
