In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from typing import Dict, List

from resources.constants import punctuation, stop_words

In [2]:
# Load the tab-separated definitions table exactly as stored on disk.
definitions_raw = pd.read_csv('resources/definitions.tsv', sep='\t')

# The first element of every row is just the row index, so drop that
# column and keep the rest. The raw frame stays available under its own
# name instead of being overwritten.
definitions = definitions_raw.iloc[:, 1:]

# Convert the dataframe to a dictionary for easier access:
# column name -> list of that column's values, in row order.
# NOTE(review): empty cells in the TSV would surface here as float NaN,
# not str — confirm whether the file can contain any.
definitions_dict: Dict[str, List[str]] = {
    column: definitions[column].tolist() for column in definitions.columns
}

In [3]:
# Show each word in upper case, followed by the first definition listed for it
for word, word_definitions in definitions_dict.items():
    print(f'- {word.upper()}: \n\t{word_definitions[0]}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


In [None]:
def merge_definitions(definitions: Dict[str, List[str]]) -> Dict[str, str]:
    """Join all the definitions of each word into a single string.

    Args:
        definitions: mapping from a word to the list of its definitions.

    Returns:
        A new dictionary with the same keys, where each value is the word's
        definitions concatenated in their original order and separated by a
        single space. A word with no usable definitions maps to ``''``.

    Entries that are not strings (for example the float NaN that pandas
    produces for an empty cell) are skipped; passing them to ``str.join``
    would raise ``TypeError``.
    """
    return {
        word: ' '.join(
            definition
            for definition in word_definitions
            if isinstance(definition, str)
        )
        for word, word_definitions in definitions.items()
    }

In [None]:
# One lemmatizer instance, created once and reused by every call below.
lemmatizer = WordNetLemmatizer()

def clean_word_list(word_list: List[str]):
    """Filter and lemmatize a list of words.

    Words found in ``punctuation`` or in ``stop_words`` are discarded; each
    remaining word is passed through ``lemmatizer.lemmatize`` with its
    default arguments. The surviving words keep their original order.

    Args:
        word_list: the words to clean.

    Returns:
        A new list holding the lemmatized form of every word that was kept.
    """
    kept_words = []
    for token in word_list:
        # skip punctuation and stop words
        if token in punctuation or token in stop_words:
            continue
        # lemmatize whatever is left
        kept_words.append(lemmatizer.lemmatize(token))
    return kept_words