# Pre-Process Data
### Unprocessed CSV files are located in './nounbanks' directory.

In [20]:
import csv
import pandas as pd

In [32]:
def preprocess_one_col_csv(filename, col_name):
    '''Read a newline delimited CSV with one column and return a pandas dataframe.'''
    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter='\n') #read animals.csv with '\n' as the delimiter
        noun_list = list(reader)
    noun_list = [x[0] for x in noun_list] # make a list of the animals
    lowercase_noun_list = [x.lower() for x in noun_list] # lowercase
    lowercase_noun_list = list(dict.fromkeys(lowercase_noun_list)) # remove duplicates
    lowercase_noun_list = [x for x in lowercase_noun_list if x != ''] # remove empty strings
    lowercase_noun_list.sort() # sort alphabetically
    #convert to pandas dataframe
    df = pd.DataFrame(lowercase_noun_list, columns=[col_name])
    return df

def preprocess_states_csv(filename):
    '''Format USA_STATES.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'state' and 'area (sq. mi)'
    df = pd.read_csv(filename, names=['state', 'area (sq. mi)','pop_2014'])
    df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

def preprocess_countries_csv(filename):
    '''Format CPUNTRIES.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'country' and 'area (sq. km)'
    df = pd.read_csv(filename, names=['country', 'area (sq. km)','pop_2021'])
    df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

def preprocess_elements_csv(filename):
    '''Format ELEMENTS.csv and return a pandas dataframe.'''
    #read the csv file with pandas, the column names are 'element' and 'atomic number'
    df = pd.read_csv(filename)
    # df = df.iloc[1:].reset_index(drop=True) # remove the first row, which is the column names and reset the index
    return df

### Driver Code

In [33]:
usa_states = preprocess_states_csv('nounbanks/USA_STATES.csv')
all_countries = preprocess_countries_csv('nounbanks/COUNTRIES.csv')
all_elements = preprocess_elements_csv('nounbanks/ELEMENTS.csv')

all_animals = preprocess_one_col_csv('nounbanks/ANIMALS.csv', 'animal')
greek_gods = preprocess_one_col_csv('nounbanks/GREEKGODS.csv', 'greek_god')
roman_gods = preprocess_one_col_csv('nounbanks/ROMAN_GODS.csv', 'roman_god')
cosmetic_items = preprocess_one_col_csv('nounbanks/COSMETIC_ITEMS.csv', 'cosmetic_item')
office_supplies = preprocess_one_col_csv('nounbanks/OFFICE_SUPPLIES.csv', 'office_supply')

### Display Processed Dataframes

In [34]:
from IPython.display import display
display(all_animals)
display(usa_states)
display(all_countries)
display(all_elements)
display(greek_gods)
display(roman_gods)
display(cosmetic_items)
display(office_supplies)


Unnamed: 0,animal
0,aardvark
1,aardwolf
2,african buffalo
3,african elephant
4,african leopard
...,...
469,xerinae
470,yak
471,yellow perch
472,zebra


Unnamed: 0,state,area (sq. mi),pop_2014
0,Alabama,52423,4849377
1,Alaska,656425,736732
2,Arizona,114006,6731484
3,Arkansas,53182,2966369
4,California,163707,38802500
5,Colorado,104100,5355866
6,Connecticut,5544,3596677
7,Delaware,1954,935614
8,District of Columbia,68,658893
9,Florida,65758,19893297


Unnamed: 0,country,area (sq. km),pop_2021
0,Aruba,180,106537
1,Africa Eastern and Southern,15162038.87,702976832
2,Afghanistan,652860,40099462
3,Africa Western and Central,9166260,478185907
4,Angola,1246700,34503774
...,...,...,...
258,Kosovo,10890,1786038
259,"Yemen, Rep.",527970,32981641
260,South Africa,1219090,59392255
261,Zambia,752610,19473125


Unnamed: 0,AtomicNumber,Element,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,...,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence
0,1,Hydrogen,H,1.007,0,1,1,1,1.0,gas,...,13.5984,0.000090,14.175,20.28,3.0,Cavendish,1766.0,14.304,1,1.0
1,2,Helium,He,4.002,2,2,2,1,18.0,gas,...,24.5874,0.000179,,4.22,5.0,Janssen,1868.0,5.193,1,
2,3,Lithium,Li,6.941,4,3,3,2,1.0,solid,...,5.3917,0.534000,453.850,1615.00,5.0,Arfvedson,1817.0,3.582,2,1.0
3,4,Beryllium,Be,9.012,5,4,4,2,2.0,solid,...,9.3227,1.850000,1560.150,2742.00,6.0,Vaulquelin,1798.0,1.825,2,2.0
4,5,Boron,B,10.811,6,5,5,2,13.0,solid,...,8.2980,2.340000,2573.150,4200.00,6.0,Gay-Lussac,1808.0,1.026,2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,114,Flerovium,Fl,289.000,175,114,114,7,14.0,artificial,...,,,,,,,1999.0,,7,4.0
114,115,Moscovium,Mc,288.000,173,115,115,7,15.0,artificial,...,,,,,,,2010.0,,7,5.0
115,116,Livermorium,Lv,292.000,176,116,116,7,16.0,artificial,...,,,,,,,2000.0,,7,6.0
116,117,Tennessine,Ts,295.000,178,117,117,7,17.0,artificial,...,,,,,,,2010.0,,7,7.0


Unnamed: 0,greek_god
0,achelous
1,aeolus
2,aether
3,alastor
4,apollo
...,...
56,typhon
57,uranus
58,zelus
59,zephyrus


Unnamed: 0,roman_god
0,abundantia
1,aesculapius
2,apollo
3,aurora
4,bacchus
...,...
66,vesta
67,victoria
68,volturnus
69,voluptas


Unnamed: 0,cosmetic_item
0,blush
1,bronzer
2,cc cream
3,chapstick
4,color correctors
5,concealer
6,contour
7,eyebrow pencil
8,eyelashes
9,eyeliner


Unnamed: 0,office_supply
0,ballpoint pens
1,binder clips
2,binder dividers
3,binders
4,bookends
...,...
62,surge protector
63,tape
64,tape dispenser
65,thumbtacks
