# Pre-Process Data
## Unprocessed CSV files located in the './nounbanks' directory.

In [1]:
import csv
import pandas as pd

In [3]:
def preprocess_one_col_csv(filename, col_name):
    '''Read a one-column, newline-delimited CSV and return it as a pandas dataframe.

    Each non-blank line of the file is one value. Values are lowercased,
    de-duplicated and sorted alphabetically; blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    col_name : str
        Name of the single column in the returned dataframe.

    Returns
    -------
    pandas.DataFrame
        One column named ``col_name`` holding the cleaned values. The frame is
        empty (but still has that column) when the file has no non-blank lines.
    '''
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as f:
        # A newline cannot act as a field delimiter (the reader consumes it as
        # the record terminator), so parse with the default dialect and glue
        # any comma-split fields back together: an unquoted value containing a
        # comma stays whole, and a quoted value is still unwrapped.
        # Blank lines come back as empty rows; they join to '' and are dropped
        # below instead of raising IndexError on row[0].
        values = [','.join(row).lower() for row in csv.reader(f)]
    # A set removes duplicates; sorted() restores a deterministic alphabetical order.
    unique_values = sorted({value for value in values if value != ''})
    return pd.DataFrame(unique_values, columns=[col_name])

def preprocess_states_csv(filename):
    '''Load USA_State.csv into a pandas dataframe with fixed column names.'''
    # The file is read with these three names rather than its own header, so
    # the file's first row (expected to be its column names) arrives as data.
    state_columns = ['all_usa_states', 'area (sq. mi)', 'pop_2014']
    raw = pd.read_csv(filename, names=state_columns)
    # Discard that first row and renumber the remaining rows from 0.
    return raw.iloc[1:].reset_index(drop=True)

def preprocess_countries_csv(filename):
    '''Load COUNTRIES.csv into a pandas dataframe with fixed column names.'''
    # The file is read with these three names rather than its own header, so
    # the file's first row (expected to be its column names) arrives as data.
    country_columns = ['all_world_countries', 'area (sq. km)', 'pop_2021']
    raw = pd.read_csv(filename, names=country_columns)
    # Discard that first row and renumber the remaining rows from 0.
    return raw.iloc[1:].reset_index(drop=True)

def preprocess_elements_csv(filename):
    '''Load ELEMENTS.csv into a pandas dataframe, renaming 'Element' to 'all_elements'.'''
    # Unlike the states/countries loaders, the file's own header row is kept;
    # only the 'Element' column is renamed, every other column passes through.
    column_renames = {'Element': 'all_elements'}
    elements = pd.read_csv(filename)
    return elements.rename(columns=column_renames)

### Driver

In [6]:
# Build one single-column dataframe per noun bank. Each dataframe's column
# carries the same name as the variable it is bound to.

# Geography, politics and general vocabulary.
many_capitals = preprocess_one_col_csv(filename='nounbanks/CAPITALS.csv', col_name='many_capitals')
many_us_presidents = preprocess_one_col_csv(filename='nounbanks/US_PRESIDENTS_LASTNAME.csv', col_name='many_us_presidents')
many_english_words = preprocess_one_col_csv(filename='nounbanks/ALL_WORDS.csv', col_name='many_english_words')
all_usa_states = preprocess_one_col_csv(filename='nounbanks/US_STATES.csv', col_name='all_usa_states')
all_world_countries = preprocess_one_col_csv(filename='nounbanks/COUNTRIES.csv', col_name='all_world_countries')

# Science.
all_elements = preprocess_one_col_csv(filename='nounbanks/ELEMENTS.csv', col_name='all_elements')

# Brands, products, food, sports and mythology.
many_car_manufacturers = preprocess_one_col_csv(filename='nounbanks/CAR_MANUFACTURERS.csv', col_name='many_car_manufacturers')
many_cosmetic_items = preprocess_one_col_csv(filename='nounbanks/COSMETIC_ITEMS.csv', col_name='many_cosmetic_items')
many_fruits = preprocess_one_col_csv(filename='nounbanks/FRUITS.csv', col_name='many_fruits')
many_luxury_brands = preprocess_one_col_csv(filename='nounbanks/LUXURY_BRANDS.csv', col_name='many_luxury_brands')
many_olympic_sports = preprocess_one_col_csv(filename='nounbanks/OLYMPIC_SPORTS.csv', col_name='many_olympic_sports')
many_greek_gods = preprocess_one_col_csv(filename='nounbanks/GREEK_GODS.csv', col_name='many_greek_gods')



# Use the '%store' IPython magic to persist these variables so that other notebooks/sessions can restore them with '%store -r'.
%store many_capitals
%store many_us_presidents
%store many_english_words
%store all_usa_states
%store all_world_countries
%store all_elements
%store many_car_manufacturers
%store many_cosmetic_items
%store many_fruits
%store many_luxury_brands
%store many_olympic_sports
%store many_greek_gods


Stored 'many_capitals' (DataFrame)
Stored 'many_us_presidents' (DataFrame)
Stored 'many_english_words' (DataFrame)
Stored 'all_usa_states' (DataFrame)
Stored 'all_world_countries' (DataFrame)
Stored 'all_elements' (DataFrame)
Stored 'many_car_manufacturers' (DataFrame)
Stored 'many_cosmetic_items' (DataFrame)
Stored 'many_fruits' (DataFrame)
Stored 'many_luxury_brands' (DataFrame)
Stored 'many_olympic_sports' (DataFrame)
Stored 'many_greek_gods' (DataFrame)


### Display Processed Dataframes

In [8]:
from IPython.display import display
# Render every processed noun bank, in the order they were built above.
for noun_bank in (
    many_capitals,
    many_us_presidents,
    many_english_words,
    all_usa_states,
    all_world_countries,
    all_elements,
    many_car_manufacturers,
    many_cosmetic_items,
    many_fruits,
    many_luxury_brands,
    many_olympic_sports,
    many_greek_gods,
):
    display(noun_bank)
    


Unnamed: 0,many_capitals
0,abu dhabi
1,abuja
2,accra
3,addis ababa
4,algiers
...,...
196,windhoek
197,yamoussoukro
198,yaounde
199,yerevan


Unnamed: 0,many_us_presidents
0,adams
1,arthur
2,biden
3,buchanan
4,bush
5,carter
6,cleveland
7,clinton
8,coolidge
9,eisenhower


Unnamed: 0,many_english_words
0,aardvark
1,aardwolf
2,aaron
3,aback
4,abacus
...,...
58104,zooms
58105,zooplankton
58106,zoos
58107,zulu


Unnamed: 0,all_usa_states
0,alabama
1,alaska
2,arizona
3,arkansas
4,california
5,colorado
6,connecticut
7,delaware
8,florida
9,georgia


Unnamed: 0,all_world_countries
0,afghanistan
1,albania
2,algeria
3,andorra
4,angola
...,...
192,venezuela
193,vietnam
194,yemen
195,zambia


Unnamed: 0,all_elements
0,actinium
1,aluminum
2,americium
3,antimony
4,argon
...,...
113,xenon
114,ytterbium
115,yttrium
116,zinc


Unnamed: 0,many_car_manufacturers
0,abarth
1,acura
2,aixam
3,alfa romeo
4,alpine
...,...
132,volkswagen
133,volvo
134,w motors
135,wiesmann


Unnamed: 0,many_cosmetic_items
0,blush
1,bronzer
2,cc cream
3,chapstick
4,color corrector
5,concealer
6,contour
7,eyebrow pencil
8,eyelashes
9,eyeliner


Unnamed: 0,many_fruits
0,acai
1,ackee
2,apple
3,apricot
4,avocado
...,...
100,tangerine
101,tomato
102,ugli fruit
103,watermelon


Unnamed: 0,many_luxury_brands
0,alexander mcqueen
1,alexander wang
2,amelie pichard
3,apc
4,armani
...,...
94,versace
95,vivienne westwood
96,yohji yamamoto
97,yves saint laurent


Unnamed: 0,many_olympic_sports
0,alpine skiing
1,archery
2,artistic gymnastics
3,artistic swimming
4,athletics
...,...
59,triathlon
60,volleyball
61,water polo
62,weightlifting


Unnamed: 0,many_greek_gods
0,aceso
1,achelous
2,acheron
3,achlys
4,achos
...,...
439,uranus
440,zagreus
441,zelos
442,zephyrus
