In [1]:
import json
from squad import *
import utils

# Word dictionary supplied by the project's utils module.
all_words_dict = utils.get_all_words_dict()
# One zero-initialised tally slot per dictionary key.
word_counter = [0] * len(all_words_dict.keys())

# Maps each enumeration position to the dictionary key found at that position.
key_index_dict = dict(enumerate(all_words_dict.keys()))

# Helper Functions

- `get_word_indices` tokenizes and stems a sentence and returns, for each word found in `all_words_dict`, the index stored for it there (unknown words are skipped)
- `get_occurrence_dict` returns a dictionary with words as keys and their number of occurrences as values

In [2]:
def get_word_indices(sentence: str) -> List[int]:
    """Return the ``all_words_dict`` entries for the known words of a sentence.

    The sentence is tokenized with ``utils.tokenize`` and every token is
    stemmed with ``utils.stemming``. Each stem is then looked up in
    ``all_words_dict``; stems that are missing from it are skipped.

    Args:
        sentence: Text to tokenize and look up.

    Returns:
        The looked-up values in token order, one per known stem (a word that
        occurs several times yields several entries). They are annotated as
        ``int`` because the notebook uses them to index ``word_counter``.
    """
    stems = [utils.stemming(token) for token in utils.tokenize(sentence)]
    # Look each stem up exactly once. Compare against None rather than relying
    # on truthiness so that a valid index of 0 is kept.
    lookups = (all_words_dict.get(stem) for stem in stems)
    return [index for index in lookups if index is not None]


def get_occurrence_dict() -> Dict[str, int]:
    """Build a word -> occurrence-count mapping from the current tallies.

    The keys of ``all_words_dict`` are paired, in iteration order, with the
    entries of ``word_counter``: the first key gets ``word_counter[0]``, the
    second key gets ``word_counter[1]``, and so on.

    Returns:
        A new dictionary with every paired word as key and its count as value.
    """
    # NOTE(review): this pairs by iteration position, so it presumably relies
    # on all_words_dict storing each word's own position as its value; confirm.
    return dict(zip(all_words_dict.keys(), word_counter))

# Load Squad File

In [3]:
# File name of the SQuAD dataset JSON; it has no directory component, so it is
# resolved relative to the notebook's working directory.
SQUAD_FILE_PATH: str = 'squad_dataset.json'

# Squad and Squad_Transform are not defined in this notebook; presumably they
# come from `from squad import *` (confirm against the squad module).
squad = Squad(SQUAD_FILE_PATH)
# The transform exposes `title_question_answer_dict`, which the counting cell
# iterates to read each question.
squad_transform = Squad_Transform(squad)


# Count the occurrence of each word

In [4]:

# Reset the tallies in place so that re-running this cell recounts from zero
# instead of adding a second pass on top of the previous totals. Slice
# assignment keeps the same list object that other cells reference.
word_counter[:] = [0] * len(word_counter)

# Only the per-title lists are needed; the title keys play no part in counting.
for qas_list in squad_transform.title_question_answer_dict.values():
    for element in qas_list:
        # The f-string coerces the question to text before it is tokenized.
        word_indices = get_word_indices(f'{element.question}')

        # Every occurrence of a known word bumps that word's tally by one.
        for index in word_indices:
            word_counter[index] += 1
        

# Verify occurrence dict positions

Verify that the position and occurrence count of each word match the `word_counter` list.

In [11]:
occurrence_dict = get_occurrence_dict()

# Collect every position whose dictionary value differs from the tally stored
# at the same position in word_counter.
# NOTE(review): get_occurrence_dict builds its values from word_counter itself,
# so this comparison appears to hold by construction; confirm whether a check
# against all_words_dict's stored indices was intended.
mismatched_positions = [
    position
    for position, count in enumerate(occurrence_dict.values())
    if count != word_counter[position]
]
success = not mismatched_positions

print(success)

True
