In [27]:
import pandas as pd
import csv
import pathlib
import os
from string import punctuation, digits

from utility_functions import count_csv_lines, save_as_csv, strip_bounded

In [28]:
# Master Data

master_data_directory = pathlib.Path('master_data')
cleaned_data_directory = pathlib.Path('cleaned_data')

# CMU Dict File
original_cmu_dict = os.path.join(master_data_directory, 'cmudict.dict.txt')

# Animal Names
original_wikipedia_animals = os.path.join(master_data_directory, 'wikipedia_animal_list.csv')

# New Original Adjectives
original_wordnet_adjectives = os.path.join(master_data_directory, 'wordnet_adjectives_index')

In [29]:
# Clean the Wordnet Adjectives File

adjectives_list = []
with open(original_wordnet_adjectives) as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        word = row[0]
        if word and word[0] not in punctuation and word[0] not in digits and word[0] != ' ':
            word = word.split(' ')[0]
            word = word.replace('_', ' ')
            word = word.lower()
            if len(word) > 1:
                adjectives_list.append(word)
            
clean_wordnet_adjectives_path = os.path.join(cleaned_data_directory, 'clean_wordnet_adjectives')
save_as_csv(adjectives_list, clean_wordnet_adjectives_path)

In [30]:
# Clean the CMU Dict File

path = original_cmu_dict
with open(path) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=' ')
    word_list = []
    for row in csv_reader:
        word = row[0]
        if not any(symbol in word for symbol in punctuation):
            word_list.append(row[0])

clean_cmu_dict_path = os.path.join(cleaned_data_directory, 'clean_cmu_dict')
save_as_csv(word_list, clean_cmu_dict_path)

In [31]:
# Clean the Wikipedia Animal List

def parse_animal(animal: str):
    if len(animal) <= 1:
        return False
    if '(' in animal:
        animal = strip_bounded(animal, '(')
    if '[' in animal:
        animal = strip_bounded(animal, '[')

    if 'Also' in animal:
        start = animal.find('Also')
        animal = animal[0:start]
    if 'See' in animal:
        start = animal.find('Also')
        animal = animal[0:start]
    return animal

path = original_wikipedia_animals
with open(path) as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_file) # skip the first line (header)
    animal_list = []
    for row in csv_reader:
        animal = parse_animal(row[0])
        if animal:
            animal = animal.lower()
            if '/' in animal:
                split = animal.split('/')
                animal_list += split 
            else: 
                animal: animal_list.append(animal)

clean_wikipedia_animals_path = os.path.join(cleaned_data_directory, 'clean_wikipedia_animals')
save_as_csv(animal_list, clean_wikipedia_animals_path)

In [32]:
cmu_list = pd.read_csv(clean_cmu_dict_path + '.csv', header=None)
animals = pd.read_csv(clean_wikipedia_animals_path + '.csv', header=None)
wordnet_adjectives = pd.read_csv(clean_wordnet_adjectives_path + '.csv', header=None)

In [33]:
animal_intersection = pd.merge(animals, cmu_list, left_on=0, right_on=0)
wordnet_adjectives_intersection = pd.merge(wordnet_adjectives, cmu_list, left_on=0, right_on=0)

In [34]:
# Vital Statistics

original_animal_names_length = count_csv_lines(original_wikipedia_animals)
original_wordnet_adjectives_length = count_csv_lines(original_wordnet_adjectives)

print(f'Animal names went from {original_animal_names_length} to {len(animal_intersection)}')
print(f'Wordnet Adjectives names went from {original_wordnet_adjectives_length} to {len(wordnet_adjectives_intersection)}')

Animal names went from 312 to 199
Wordnet Adjectives names went from 21528 to 9415


In [35]:
intersection_path = pathlib.Path('intersection_data')

save_as_csv(animal_intersection, f'{intersection_path}/animal_intersection')
save_as_csv(wordnet_adjectives_intersection, f'{intersection_path}/wordnet_adjective_intersection')