# Pre-Process Data
## Unprocessed CSV files located in the './nounbanks' directory.

In [23]:
import csv
import pandas as pd

In [24]:
def preprocess_one_col_csv(filename, col_name):
    '''Read a one-column, newline-delimited CSV and return a pandas dataframe.

    Each record's first field is stripped of surrounding whitespace and
    lowercased; blank entries and duplicates are dropped, and the remaining
    values are sorted alphabetically.

    Parameters:
        filename: path of the CSV file to read (one entry per line).
        col_name: name given to the single column of the result.

    Returns:
        A pandas dataframe with one column, `col_name`, holding the cleaned
        entries in ascending order (empty if the file has no usable entries).
    '''
    # The csv reader already splits records on line endings, and newer Python
    # releases reject '\n' as a delimiter. A delimiter that never occurs in
    # the data (ASCII unit separator) keeps every line as a single field while
    # CSV quoting is still honoured.
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\x1f')
        # Blank lines come back as empty records; skip them instead of
        # failing on row[0].
        nouns = {row[0].strip().lower() for row in reader if row}
    nouns.discard('')  # whitespace-only or empty quoted entries
    # A set removes duplicates; sorted() gives the alphabetical order.
    return pd.DataFrame(sorted(nouns), columns=[col_name])

def preprocess_states_csv(filename):
    '''Format USA_State.csv and return a pandas dataframe.

    The file's own header row is replaced by the column names 'state',
    'area (sq. mi)' and 'pop_2014'.

    Parameters:
        filename: path of the states CSV file; its first row must be a header.

    Returns:
        A pandas dataframe with one row per data line and a 0..n-1 index.
    '''
    # header=0 tells pandas that the first row is a header to be replaced by
    # `names`. Reading that row as data and slicing it off afterwards would
    # leave every column (including the numeric ones) as strings.
    df = pd.read_csv(
        filename,
        header=0,
        names=['state', 'area (sq. mi)', 'pop_2014'],
    )
    # Keep a plain positional index, as callers have always received.
    return df.reset_index(drop=True)

def preprocess_countries_csv(filename):
    '''Format COUNTRIES.csv and return a pandas dataframe.

    The file's own header row is replaced by the column names 'country',
    'area (sq. km)' and 'pop_2021'.

    Parameters:
        filename: path of the countries CSV file; its first row must be a
            header.

    Returns:
        A pandas dataframe with one row per data line and a 0..n-1 index.
    '''
    # header=0 tells pandas that the first row is a header to be replaced by
    # `names`. Reading that row as data and slicing it off afterwards would
    # leave every column (including the numeric ones) as strings.
    df = pd.read_csv(
        filename,
        header=0,
        names=['country', 'area (sq. km)', 'pop_2021'],
    )
    # Keep a plain positional index, as callers have always received.
    return df.reset_index(drop=True)

def preprocess_elements_csv(filename):
    '''Load ELEMENTS.csv and return it as a pandas dataframe.

    The file is parsed with pandas' default settings, so its first row
    supplies the column names and no rows are removed afterwards.
    '''
    return pd.read_csv(filename)

### Driver

In [25]:
# Naming convention: 'all_*' implies an exhaustive set, 'many_*' implies a
# long (growable) list.
# Every path below is relative, so the notebook must be run from the directory
# that contains 'nounbanks/'.

all_usa_states = preprocess_states_csv('nounbanks/USA_State.csv') 
all_world_countries = preprocess_countries_csv('nounbanks/COUNTRIES.csv')
all_elements = preprocess_elements_csv('nounbanks/ELEMENTS.csv')
many_animals = preprocess_one_col_csv('nounbanks/ANIMALS.csv', 'many_animals')
many_greek_gods = preprocess_one_col_csv('nounbanks/GREEKGODS.csv', 'many_greek_gods')
many_roman_gods = preprocess_one_col_csv('nounbanks/ROMANGODS.csv', 'many_roman_gods')
many_cosmetic_items = preprocess_one_col_csv('nounbanks/COSMETICITEMS.csv', 'many_cosmetic_items')
many_office_supplies = preprocess_one_col_csv('nounbanks/OFFICESUPPLIES.csv', 'many_office_supplies')


# Persist each dataframe with IPython's '%store' magic so that it can be
# restored later (e.g. from another notebook) with '%store -r <name>'.

%store all_usa_states
%store all_world_countries
%store all_elements
%store many_animals
%store many_greek_gods
%store many_roman_gods
%store many_cosmetic_items
%store many_office_supplies



Stored 'all_usa_states' (DataFrame)
Stored 'all_world_countries' (DataFrame)
Stored 'all_elements' (DataFrame)
Stored 'many_animals' (DataFrame)
Stored 'many_greek_gods' (DataFrame)
Stored 'many_roman_gods' (DataFrame)
Stored 'many_cosmetic_items' (DataFrame)
Stored 'many_office_supplies' (DataFrame)


### Display Processed Dataframes

In [26]:
from IPython.display import display
# # display noun banks
