# Dataset Loading and Initial Exploration

In [11]:
#imports
import os
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import re
import string
import nltk
from nltk.corpus import words

In [12]:
#function for loading the files
def load_files(folder_path):
    
    text_data = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".sgm") or filename.endswith(".xml"):
            file_path = os.path.join(folder_path, filename)
            
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file, 'html.parser')
                text = soup.get_text()
                words = word_tokenize(text)
                text_data.extend(words)

    return list(text_data)

In [13]:
folder_path = r"C:\Users\LENOVO\Documents\Capstone Project\Datasets"
text_data = load_files(folder_path)

# Display a few words from the dataset
print(text_data[200:250])



['akati', ',', '``', 'Mvura', 'iri', 'pasi', 'pedenga', 'ngaiungane', 'pamwe', 'chete', 'kuti', 'pasi', 'pakaoma', 'pagoonekwa', '.', "''", 'Zvikaita', 'saizvozvo', '.', '10', 'Mwari', 'akatumidza', 'pasi', 'pakaoma', 'kuti', '``', 'Nyika', "''", 'uye', 'mvura', 'yakanga', 'yakaungana', ',', 'akaitumidza', 'kuti', '``', 'Makungwa', "''", '.', 'Mwari', 'akaona', 'kuti', 'zvakanaka', '.', '11', 'Mwari', 'akati', ',', '``', 'Nyika']


# Text Preprocessing

In [14]:
nltk.download('words')
english_words = set(words.words())

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
def preprocess(words):

    #removing utf-8 encoding characters and other unnecessary chars
    words = [re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\[\]\\\£\Â\n\r]', ' ', word) for word in words]
    words = [word for word in words if word.isalpha()]

    #removing all numerical values(page numbers, verses, chapters etc)
    words = [re.sub(r'[0123456789]', ' ', word) for word in words]

    #removing all punctuation
    re_punc = re.compile( ' [%s] ' % re.escape(string.punctuation))
    words = [re_punc.sub( '' , word) for word in words]

    #removing Roman numerals
    # first capitalized ones
    words = [re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', word) for word in words]
    # then lowercase
    words = [re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', word) for word in words]

    #removing all strings of capital letters that are more than 2 characters long(Headings)
    words = [re.sub(r'[A-Z]{2,}', ' ', word) for word in words]

    #removing extra white spaces
    words = [re.sub(r'\s+', ' ', word) for word in words]

    #removing English words  from the corpus
    words = [word for word in words if word not in english_words]

    #converting all words to lower case
    clean_words = [word.lower() for word in words]

    #removing empty strings
    for word in clean_words:
        if((len(word)) == 1):
            clean_words.remove(word)

    return clean_words

In [16]:
clean_text = preprocess(text_data)
clean_text[200:210]

['nezviedza',
 'kuti',
 'zvisiyanise',
 'masikati',
 'nousiku',
 'zvigova',
 'zviratidzo',
 'zvenguva',
 'nezvamazuva',
 'ngazvive']

**Removing English Words from the corpus**

In [8]:
eng_dict_path = r"C:\Users\LENOVO\Downloads\Oxford English Dictionary.txt"

with open(eng_dict_path, 'r', encoding='utf-8', errors='ignore') as file:
    eng_dict = [word.strip() for word in file.readlines()]
eng_dict = word_tokenize(str(eng_dict))

In [9]:
clean_shona_text = [word for word in clean_text if word not in eng_dict]
clean_shona_text[:10]

KeyboardInterrupt: 

In [None]:
#saving the clean text to a file, for future reference
def save_to_file(clean_words, output_file_path):
    
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(clean_words))


output_file_path = r"C:\Users\LENOVO\Documents\Capstone Project\clean_text.txt"
save_to_file(clean_shona_text, output_file_path)


# Data Preparation: Noisemaking

In [19]:
import pandas as pd
import random

def introduce_keyboard_errors(word, max_modifications=2):
    keyboard_layout = {
        'a': 'qwsz',
        'b': 'vghn',
        'c': 'xdfv',
        'd': 'erfcxs',
        'e': 'rdsw',
        'f': 'rtgvcxd',
        'g': 'tyhbvf',
        'h': 'yujnbg',
        'i': 'uokj',
        'j': 'uikmnh',
        'k': 'iojlm',
        'l': 'opk',
        'm': 'njk',
        'n': 'bhjm',
        'o': 'iplk',
        'p': 'olo',
        'q': 'wa',
        'r': 'tfe',
        's': 'awedxz',
        't': 'ygr',
        'u': 'ijyh',
        'v': 'cfgb',
        'w': 'qasde',
        'x': 'zsdc',
        'y': 'uhtg',
        'z': 'asx'
    }

    noisy_word = word
    for _ in range(random.randint(1, max_modifications)):
        # Select a random position in the word
        position = random.randint(0, len(word) - 1)
        # Simulate keyboard layout error by randomly selecting a nearby key
        noisy_char = random.choice(keyboard_layout.get(word[position], word[position]))
        # Modify the word at the selected position
        noisy_word = noisy_word[:position] + noisy_char + noisy_word[position + 1:]

    return noisy_word

def create_dataset(correct_words, num_mispellings=3, max_modifications=2):
    dataset = {"Incorrect Spelling": [], "Correct Spelling": []}

    for word in correct_words:
        # Generate multiple misspellings for each word
        for _ in range(num_mispellings):
            mispelled_word = introduce_keyboard_errors(word, max_modifications)
            dataset["Incorrect Spelling"].append(mispelled_word)
            dataset["Correct Spelling"].append(word)

    #create a dataframe
    return pd.DataFrame(dataset)




In [20]:
# Testing the function on a small sample 
correct_words = clean_text[300:306]
spell_checker_dataset = create_dataset(correct_words, num_mispellings=5, max_modifications=2)

# Display the dataset
print(spell_checker_dataset)

   Incorrect Spelling  Correct Spelling
0             muzadae           muzadze
1             muzadzr           muzadze
2             muzadzr           muzadze
3             muaadzs           muzadze
4             muaadze           muzadze
5               mvira             mvura
6               mfura             mvura
7               nvura             mvura
8               mcurz             mvura
9               mvhra             mvura
10         yomugunvwa        yomugungwa
11         ykmugungwa        yomugungwa
12         yomugubgwa        yomugungwa
13         yonugunywa        yomugungwa
14         uomugungwa        yomugungwa
15                yys               uye
16                hyd               uye
17                iyw               uye
18                ute               uye
19                jye               uye
20              shkro             shiri
21              shifj             shiri
22              zhiri             shiri
23              shjri             shiri
