# Pre-Process Data
## Unprocessed CSV files located in the './nounbanks' directory.

In [1]:
import csv
import pandas as pd

In [26]:
def preprocess_one_col_csv(filename, col_name):
    '''Read a newline delimited CSV with one column and return a pandas dataframe.'''
    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter='\n') #read animals.csv with '\n' as the delimiter
        noun_list = list(reader)
    noun_list = [x[0] for x in noun_list] # make a list of the animals
    lowercase_noun_list = [x.lower() for x in noun_list] # lowercase
    lowercase_noun_list = list(dict.fromkeys(lowercase_noun_list)) # remove duplicates
    lowercase_noun_list = [x for x in lowercase_noun_list if x != ''] # remove empty strings
    lowercase_noun_list.sort() # sort alphabetically
    #convert to pandas dataframe
    df = pd.DataFrame(lowercase_noun_list, columns=[col_name])
    return df

def preprocess_states_csv(filename):
    '''Format USA_State.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'state' and 'area (sq. mi)'
    df = pd.read_csv(filename, names=['all_usa_states', 'area (sq. mi)','pop_2014'])
    df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

def preprocess_countries_csv(filename):
    '''Format CPUNTRIES.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'country' and 'area (sq. km)'
    df = pd.read_csv(filename, names=['all_world_countries', 'area (sq. km)','pop_2021'])
    df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

def preprocess_elements_csv(filename):
    '''Format ELEMENTS.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'element' and 'atomic number'
    df = pd.read_csv(filename)
    # rename element column to 'all_elements'
    df = df.rename(columns={'Element': 'all_elements'})
    # df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

### Driver

In [27]:
# 'all_*' implies an exhaustive set, 'many_*' implies a long list list (growable)

all_usa_states = preprocess_states_csv('nounbanks/USA_STATES.csv') 
all_world_countries = preprocess_countries_csv('nounbanks/COUNTRIES.csv')
all_elements = preprocess_elements_csv('nounbanks/ELEMENTS.csv')
many_animals = preprocess_one_col_csv('nounbanks/ANIMALS.csv', 'many_animals')
many_greek_gods = preprocess_one_col_csv('nounbanks/GREEK_GODS.csv', 'many_greek_gods')
many_roman_gods = preprocess_one_col_csv('nounbanks/ROMAN_GODS.csv', 'many_roman_gods')
many_cosmetic_items = preprocess_one_col_csv('nounbanks/COSMETIC_ITEMS.csv', 'many_cosmetic_items')
many_office_supplies = preprocess_one_col_csv('nounbanks/OFFICE_SUPPLIES.csv', 'many_office_supplies')
many_words = preprocess_one_col_csv('nounbanks/ALL_WORDS.csv', 'many_words')
many_beauty_brands = preprocess_one_col_csv('nounbanks/BEAUTY_BRANDS.csv', 'many_beauty_brands')
many_body_parts = preprocess_one_col_csv('nounbanks/BODY_PARTS.csv', 'many_body_parts')
many_candy = preprocess_one_col_csv('nounbanks/CANDY.csv', 'many_candy')
many_car_manufacturers = preprocess_one_col_csv('nounbanks/CAR_MANUFACTURERS.csv', 'many_car_manufacturers')
many_celebs = preprocess_one_col_csv('nounbanks/CELEBS.csv', 'many_celebs')
many_clothes = preprocess_one_col_csv('nounbanks/CLOTHES.csv', 'many_clothes')
many_college_majors = preprocess_one_col_csv('nounbanks/COLLEGE_MAJORS.csv', 'many_college_majors')
# many_colors = preprocess_one_col_csv('nounbanks/COLORS.csv', 'many_colors')
many_companies = preprocess_one_col_csv('nounbanks/COMPANIES.csv', 'many_companies')
many_desserts = preprocess_one_col_csv('nounbanks/DESSERTS.csv', 'many_desserts')
many_dictators = preprocess_one_col_csv('nounbanks/DICTATORS.csv', 'many_dictators')
many_disney_movies = preprocess_one_col_csv('nounbanks/DISNEY_MOVIES.csv', 'many_disney_movies')
many_dogs = preprocess_one_col_csv('nounbanks/DOGS.csv', 'many_dogs')
many_drinks = preprocess_one_col_csv('nounbanks/DRINKS.csv', 'many_drinks')
many_empires = preprocess_one_col_csv('nounbanks/EMPIRES.csv', 'many_empires')
#many_fast_food = preprocess_one_col_csv('nounbanks/FAST_FOOD.csv', 'many_fast_food')
many_flowers_plants = preprocess_one_col_csv('nounbanks/FLOWERS_PLANTS.csv', 'many_flowers_plants')
# many_food_ingredients = preprocess_one_col_csv('nounbanks/FOOD_INGREDIENTS.csv', 'many_food_ingredients')
many_fruits = preprocess_one_col_csv('nounbanks/FRUITS.csv', 'many_fruits')
many_gemstones = preprocess_one_col_csv('nounbanks/GEMSTONES.csv', 'many_gemstones')
many_holidays = preprocess_one_col_csv('nounbanks/HOLIDAYS.csv', 'many_holidays')
# many_kitchen_objects = preprocess_one_col_csv('nounbanks/KITCHEN_OBJECTS.csv', 'many_kitchen_objects')
many_landmarks = preprocess_one_col_csv('nounbanks/LANDMARKS.csv', 'many_landmarks')
many_luxury_brands = preprocess_one_col_csv('nounbanks/LUXURY_BRANDS.csv', 'many_luxury_brands')
many_marvel = preprocess_one_col_csv('nounbanks/MARVEL.csv', 'many_marvel')
#many_movies = preprocess_one_col_csv('nounbanks/MOVIES.csv', 'many_movies')
many_mythical_creatures = preprocess_one_col_csv('nounbanks/MYTHICAL_CREATURES.csv', 'many_mythical_creatures')
many_nfl_teams = preprocess_one_col_csv('nounbanks/NFL_TEAMS.csv', 'many_nfl_teams')
many_office_supplies = preprocess_one_col_csv('nounbanks/OFFICE_SUPPLIES.csv', 'many_office_supplies')
many_olympic_sports = preprocess_one_col_csv('nounbanks/OLYMPIC_SPORTS.csv', 'many_olympic_sports')
many_pets = preprocess_one_col_csv('nounbanks/PETS.csv', 'many_pets')
#many_planets = preprocess_one_col_csv('nounbanks/PLANETS.csv', 'many_planets')
many_professions = preprocess_one_col_csv('nounbanks/PROFESSIONS.csv', 'many_professions')
#many_tv_shows = preprocess_one_col_csv('nounbanks/TV_SHOWS.csv', 'many_tv_shows')
#many_us_presidents = preprocess_one_col_csv('nounbanks/US_PRESIDENTS.csv', 'many_us_presidents')
many_vegetables = preprocess_one_col_csv('nounbanks/VEGETABLES.csv', 'many_vegetables')
#many_wars = preprocess_one_col_csv('nounbanks/WARS.csv', 'many_wars')
many_weather_conditions = preprocess_one_col_csv('nounbanks/WEATHER_CONDITIONS.csv', 'many_weather_conditions')
# many_youtubers = preprocess_one_col_csv('nounbanks/YOUTUBERS.csv', 'many_youtubers')




# use '%store' magic method to save variables to the notebook's namespace

%store all_usa_states
%store all_world_countries
%store all_elements
%store many_animals
%store many_greek_gods
%store many_roman_gods
%store many_cosmetic_items
%store many_office_supplies
%store many_olympic_sports
%store many_nfl_teams
%store many_marvel
%store many_mythical_creatures
%store many_weather_conditions
%store many_vegetables
%store many_professions
%store many_pets
%store many_landmarks
%store many_luxury_brands
%store many_holidays
%store many_gemstones
%store many_fruits
%store many_flowers_plants
%store many_empires




Stored 'all_usa_states' (DataFrame)
Stored 'all_world_countries' (DataFrame)
Stored 'all_elements' (DataFrame)
Stored 'many_animals' (DataFrame)
Stored 'many_greek_gods' (DataFrame)
Stored 'many_roman_gods' (DataFrame)
Stored 'many_cosmetic_items' (DataFrame)
Stored 'many_office_supplies' (DataFrame)
Stored 'many_olympic_sports' (DataFrame)
Stored 'many_nfl_teams' (DataFrame)
Stored 'many_marvel' (DataFrame)
Stored 'many_mythical_creatures' (DataFrame)
Stored 'many_weather_conditions' (DataFrame)
Stored 'many_vegetables' (DataFrame)
Stored 'many_professions' (DataFrame)
Stored 'many_pets' (DataFrame)
Stored 'many_landmarks' (DataFrame)
Stored 'many_luxury_brands' (DataFrame)
Stored 'many_holidays' (DataFrame)
Stored 'many_gemstones' (DataFrame)
Stored 'many_fruits' (DataFrame)
Stored 'many_flowers_plants' (DataFrame)
Stored 'many_empires' (DataFrame)


### Display Processed Dataframes

In [28]:
from IPython.display import display
# # display noun banks


for noun_bank in [all_usa_states, all_world_countries, all_elements, many_animals, many_greek_gods, many_roman_gods, many_cosmetic_items, many_office_supplies, many_olympic_sports, many_nfl_teams, many_marvel, many_mythical_creatures, many_weather_conditions, many_vegetables, many_professions, many_pets, many_landmarks, many_luxury_brands, many_holidays, many_gemstones, many_fruits, many_flowers_plants, many_empires, many_elements]:
    display(noun_bank)



Unnamed: 0,all_usa_states,area (sq. mi),pop_2014
0,Alabama,52423,4849377
1,Alaska,656425,736732
2,Arizona,114006,6731484
3,Arkansas,53182,2966369
4,California,163707,38802500
5,Colorado,104100,5355866
6,Connecticut,5544,3596677
7,Delaware,1954,935614
8,District of Columbia,68,658893
9,Florida,65758,19893297


Unnamed: 0,all_world_countries,area (sq. km),pop_2021
0,Aruba,180,106537
1,Africa Eastern and Southern,15162038.87,702976832
2,Afghanistan,652860,40099462
3,Africa Western and Central,9166260,478185907
4,Angola,1246700,34503774
...,...,...,...
258,Kosovo,10890,1786038
259,"Yemen, Rep.",527970,32981641
260,South Africa,1219090,59392255
261,Zambia,752610,19473125


Unnamed: 0,AtomicNumber,all_elements,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,...,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence
0,1,Hydrogen,H,1.007,0,1,1,1,1.0,gas,...,13.5984,0.000090,14.175,20.28,3.0,Cavendish,1766.0,14.304,1,1.0
1,2,Helium,He,4.002,2,2,2,1,18.0,gas,...,24.5874,0.000179,,4.22,5.0,Janssen,1868.0,5.193,1,
2,3,Lithium,Li,6.941,4,3,3,2,1.0,solid,...,5.3917,0.534000,453.850,1615.00,5.0,Arfvedson,1817.0,3.582,2,1.0
3,4,Beryllium,Be,9.012,5,4,4,2,2.0,solid,...,9.3227,1.850000,1560.150,2742.00,6.0,Vaulquelin,1798.0,1.825,2,2.0
4,5,Boron,B,10.811,6,5,5,2,13.0,solid,...,8.2980,2.340000,2573.150,4200.00,6.0,Gay-Lussac,1808.0,1.026,2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,114,Flerovium,Fl,289.000,175,114,114,7,14.0,artificial,...,,,,,,,1999.0,,7,4.0
114,115,Moscovium,Mc,288.000,173,115,115,7,15.0,artificial,...,,,,,,,2010.0,,7,5.0
115,116,Livermorium,Lv,292.000,176,116,116,7,16.0,artificial,...,,,,,,,2000.0,,7,6.0
116,117,Tennessine,Ts,295.000,178,117,117,7,17.0,artificial,...,,,,,,,2010.0,,7,7.0


Unnamed: 0,many_animals
0,aardvark
1,aardwolf
2,african buffalo
3,african elephant
4,african leopard
...,...
468,xerinae
469,yak
470,yellow perch
471,zebra


Unnamed: 0,many_greek_gods
0,achelous
1,aeolus
2,aether
3,alastor
4,apollo
...,...
56,typhon
57,uranus
58,zelus
59,zephyrus


Unnamed: 0,many_roman_gods
0,abundantia
1,aesculapius
2,apollo
3,aurora
4,bacchus
...,...
66,vesta
67,victoria
68,volturnus
69,voluptas


Unnamed: 0,many_cosmetic_items
0,blush
1,bronzer
2,cc cream
3,chapstick
4,color correctors
5,concealer
6,contour
7,eyebrow pencil
8,eyelashes
9,eyeliner


Unnamed: 0,many_office_supplies
0,ballpoint pens
1,binder clips
2,binder dividers
3,binders
4,bookends
...,...
62,surge protector
63,tape
64,tape dispenser
65,thumbtacks


Unnamed: 0,many_olympic_sports
0,3x3 basketball
1,alpine skiing
2,archery
3,artistic gymnastics
4,artistic swimming
...,...
60,triathlon
61,volleyball
62,water polo
63,weightlifting


Unnamed: 0,many_nfl
0,arizona cardinals
1,atlanta falcons
2,baltimore ravens
3,buffalo bills
4,carolina panthers
5,chicago bears
6,cincinnati bengals
7,cleveland browns
8,dallas cowboys
9,denver broncos


Unnamed: 0,many_marvel
0,3-d man
1,a.i.m.
2,aaron stack
3,abomination (emil blonsky)
4,abomination (ultimate)
...,...
1163,zarda
1164,zemo
1165,zombie (simon garth)
1166,zuras


Unnamed: 0,many_mythical_creatures
0,abominable snowman
1,aigamuxa
2,amarok
3,amphisbaena
4,antmen
5,bigfoot
6,bonnacon
7,bunyip
8,caladrius
9,camazotz


Unnamed: 0,many_weather_conditions
0,acid rain
1,avalanche
2,black ice
3,blizzard
4,blood rain
...,...
77,wildfire
78,wind gust
79,windstorm
80,windy


Unnamed: 0,many_vegetables
0,aburana
1,acorn squash/pepper squash
2,ahipa
3,alfalfa sprouts
4,amaranth
...,...
327,yau chow
328,yellow squash
329,yow choy
330,yu choy sum


Unnamed: 0,many_professions
0,accountant
1,actor
2,actuary
3,adhesive bonding machine operator
4,adjuster
...,...
861,woodworking machine setter
862,word processor
863,writer
864,yardmaster


Unnamed: 0,many_pets
0,african greys
1,agamids
2,amazons
3,aviary birds
4,birds
...,...
72,tegus
73,tortoises
74,toucans
75,turtles


Unnamed: 0,many_landmarks
0,143
1,acropolis
2,alcatraz
3,alhambra
4,amalienborg palace
...,...
145,white cliffs of dover
146,willis tower
147,windsor castle
148,winter palace


Unnamed: 0,many_luxury_brands
0,a.p.c.
1,alexander mcqueen
2,alexander wang
3,am�lie pichard
4,ba&sh
...,...
95,valentino s.p.a.
96,vera wang
97,vivienne westwood
98,yohji yamamoto


Unnamed: 0,many_holidays
0,"birthday of martin luther king, jr."
1,christmas day
2,columbus day
3,independence day
4,labor day
5,memorial day
6,new year's day
7,thanksgiving day
8,veterans day
9,washington's birthday


Unnamed: 0,many_gemstones
0,achroiteê(var.)
1,actinolite
2,adamite
3,adulariaê(var.)
4,aegirine
...,...
461,zinnwaldite
462,zircon
463,zoisite
464,zultanite


Unnamed: 0,many_fruits
0,apple
1,apricot
2,avocado
3,banana
4,bilberry
...,...
88,tamarind
89,tangerine
90,ugli fruit
91,watermelon


Unnamed: 0,many_flowers_plants
0,african rice
1,african violet
2,alder
3,algerian oak quercus
4,almond
...,...
655,yellow milkweed
656,yellow ox eye daisy
657,yellow rocket
658,yellow wood


Unnamed: 0,many_empires
0,abbasid caliphate
1,achaemenid empire
2,akkadian empire
3,almohad caliphate
4,almoravid dynasty
...,...
145,western xiongnu
146,xia dynasty
147,xiongnu empire
148,yuan dynasty


Unnamed: 0,many_elements
0,"1,hydrogen,h,1.007,0,1,1,1,1,gas,,yes,,yes,,no..."
1,"10,neon,ne,20.18,10,10,10,2,18,gas,,yes,,yes,,..."
2,"100,fermium,fm,257,157,100,100,7,,artificial,y..."
3,"101,mendelevium,md,258,157,101,101,7,,artifici..."
4,"102,nobelium,no,259,157,102,102,7,,artificial,..."
...,...
114,"96,curium,cm,247,151,96,96,7,,artificial,yes,,..."
115,"97,berkelium,bk,247,150,97,97,7,,artificial,ye..."
116,"98,californium,cf,251,153,98,98,7,,artificial,..."
117,"99,einsteinium,es,252,153,99,99,7,,artificial,..."
