# Pre-Process Data
## Unprocessed CSV files located in the './nounbanks' directory.

In [1]:
import csv
import pandas as pd

In [3]:
def preprocess_one_col_csv(filename, col_name):
    '''Read a newline-delimited, one-column file and return a pandas dataframe.

    Each line of the file is treated as one entry. Entries are stripped of
    surrounding whitespace, lowercased, de-duplicated and sorted
    alphabetically; blank lines are ignored. Lines are taken verbatim, so
    CSV-style quoting is not interpreted.

    Args:
        filename: path of the file to read.
        col_name: name given to the single column of the returned dataframe.

    Returns:
        A pandas DataFrame with one column, `col_name`, holding the cleaned
        entries.
    '''
    # Read the lines directly rather than via csv.reader(f, delimiter='\n'):
    # csv.reader yields an empty row for a blank line (so indexing row[0]
    # raised IndexError), and '\n' is not a usable delimiter -- the reader
    # treats it as a line break first, and newer Python versions reject it.
    with open(filename, 'r') as f:
        cleaned = (line.strip().lower() for line in f)
        # a set removes duplicates; the filter drops blank lines
        unique_nouns = {noun for noun in cleaned if noun}
    # sort alphabetically and convert to a pandas dataframe
    return pd.DataFrame(sorted(unique_nouns), columns=[col_name])

def preprocess_states_csv(filename):
    '''Format USA_STATES.csv and return a pandas dataframe.

    Assumes the file has a header row followed by three columns (state name,
    area, population); the header is replaced by the column names
    'all_usa_states', 'area (sq. mi)' and 'pop_2014'.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A pandas DataFrame with the three renamed columns and a fresh
        0-based index.
    '''
    # header=0 tells pandas the first row is a header to be *replaced* by
    # `names`. Reading the header as data and slicing it off afterwards would
    # force every column to object dtype (numbers would come back as strings).
    df = pd.read_csv(
        filename,
        header=0,
        names=['all_usa_states', 'area (sq. mi)', 'pop_2014'],
    )
    return df.reset_index(drop=True)

def preprocess_countries_csv(filename):
    '''Format COUNTRIES.csv and return a pandas dataframe.

    Assumes the file has a header row followed by three columns (country
    name, area, population); the header is replaced by the column names
    'all_world_countries', 'area (sq. km)' and 'pop_2021'.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A pandas DataFrame with the three renamed columns and a fresh
        0-based index.
    '''
    # header=0 tells pandas the first row is a header to be *replaced* by
    # `names`. Reading the header as data and slicing it off afterwards would
    # force every column to object dtype (numbers would come back as strings).
    df = pd.read_csv(
        filename,
        header=0,
        names=['all_world_countries', 'area (sq. km)', 'pop_2021'],
    )
    return df.reset_index(drop=True)

def preprocess_elements_csv(filename):
    '''Load ELEMENTS.csv into a pandas dataframe, renaming 'Element' to 'all_elements'.

    The file's own header row supplies the column names; every column other
    than 'Element' is returned under its original name.
    '''
    column_renames = {'Element': 'all_elements'}
    elements = pd.read_csv(filename)
    return elements.rename(columns=column_renames)

### Driver

In [7]:
# Build one dataframe per noun bank. Paths are relative, so this cell must be
# run from the directory that contains 'nounbanks/'.
# Naming: 'all_*' implies an exhaustive set, 'many_*' implies a long (growable) list.

all_usa_states = preprocess_states_csv('nounbanks/USA_STATES.csv') 
all_world_countries = preprocess_countries_csv('nounbanks/COUNTRIES.csv')
all_elements = preprocess_elements_csv('nounbanks/ELEMENTS.csv')
many_animals = preprocess_one_col_csv('nounbanks/ANIMALS.csv', 'many_animals')
many_greek_gods = preprocess_one_col_csv('nounbanks/GREEKGODS.csv', 'many_greek_gods')
many_roman_gods = preprocess_one_col_csv('nounbanks/ROMANGODS.csv', 'many_roman_gods')
many_cosmetic_items = preprocess_one_col_csv('nounbanks/COSMETICITEMS.csv', 'many_cosmetic_items')
many_office_supplies = preprocess_one_col_csv('nounbanks/OFFICESUPPLIES.csv', 'many_office_supplies')

# new additions (currently disabled; uncomment a line to load that noun bank)

# all_words = preprocess_one_col_csv('nounbanks/ALL_WORDS.csv', 'all_words')
# many_car_manufacturers = preprocess_one_col_csv('nounbanks/CAR_MANUFACTURERS.csv', 'many_car_manufacturers')
# many_celebrities = preprocess_one_col_csv('nounbanks/CELEBS.csv', 'many_celebrities')
# many_colors = preprocess_one_col_csv('nounbanks/COLORS.csv', 'many_colors')
# many_companies = preprocess_one_col_csv('nounbanks/COMPANIES.csv', 'many_companies')
# many_deserts = preprocess_one_col_csv('nounbanks/DESERTS.csv', 'many_deserts')
# many_dictators = preprocess_one_col_csv('nounbanks/DICTATORS.csv', 'many_dictators')
# many_drinks = preprocess_one_col_csv('nounbanks/DRINKS.csv', 'many_drinks')
# many_fast_food_chains = preprocess_one_col_csv('nounbanks/FAST_FOOD.csv', 'many_fast_food_chains')
# many_flowers = preprocess_one_col_csv('nounbanks/FLOWERS_PLANTS.csv', 'many_flowers')
# many_food_ingrediants = preprocess_one_col_csv('nounbanks/FOOD_INGREDIENTS.csv', 'many_food_ingrediants')
# many_fruits = preprocess_one_col_csv('nounbanks/FRUITS.csv', 'many_fruits')
# many_gems = preprocess_one_col_csv('nounbanks/GEMSTONES.csv', 'many_gems')
# many_holidays = preprocess_one_col_csv('nounbanks/HOLIDAYS.csv', 'many_holidays')
# many_kitchen_items = preprocess_one_col_csv('nounbanks/KITCHEN_OBJECTS.csv', 'many_kitchen_utensils')
# many_luxury_brands = preprocess_one_col_csv('nounbanks/LUXURY_BRANDS.csv', 'many_luxury_brands')
# many_movies = preprocess_one_col_csv('nounbanks/MOVIES.csv', 'many_movies')
# many_pets = preprocess_one_col_csv('nounbanks/PETS.csv', 'many_pets')
# all_usa_presidents = preprocess_one_col_csv('nounbanks/US_PRESIDENTS.csv', 'all_usa_presidents')

# use the '%store' magic to persist these variables in IPython's storage, so other notebooks/sessions can reload them with '%store -r'

%store all_usa_states
%store all_world_countries
%store all_elements
%store many_animals
%store many_greek_gods
%store many_roman_gods
%store many_cosmetic_items
%store many_office_supplies



Stored 'all_usa_states' (DataFrame)
Stored 'all_world_countries' (DataFrame)
Stored 'all_elements' (DataFrame)
Stored 'many_animals' (DataFrame)
Stored 'many_greek_gods' (DataFrame)
Stored 'many_roman_gods' (DataFrame)
Stored 'many_cosmetic_items' (DataFrame)
Stored 'many_office_supplies' (DataFrame)


### Display Processed Dataframes

In [39]:
from IPython.display import display
# # display noun banks

# Add a 'Starting Letter' column to all_usa_states holding the first character
# of each state name. The vectorized .str accessor yields NaN for a missing or
# empty name, where indexing x[0] inside apply() would raise.
all_usa_states['Starting Letter'] = all_usa_states['all_usa_states'].str[0]
display(all_usa_states)



Unnamed: 0,all_usa_states,area (sq. mi),pop_2014,Starting Letter
0,Alabama,52423,4849377,A
1,Alaska,656425,736732,A
2,Arizona,114006,6731484,A
3,Arkansas,53182,2966369,A
4,California,163707,38802500,C
5,Colorado,104100,5355866,C
6,Connecticut,5544,3596677,C
7,Delaware,1954,935614,D
8,District of Columbia,68,658893,D
9,Florida,65758,19893297,F
