# Census Data Cleanup

In [1]:
# import dependencies
import pandas as pd

## Race by State (2013 - 2016)

In [2]:
# census years to load (two-digit suffixes used in the ACS file names)
years = ['13', '14', '15', '16']

# read one Pandas dataframe per year of census data, discarding the annotation
# and Puerto Rico rows (index labels 0 and 52) as each file is loaded
race_dfs = [
    pd.read_csv(f'data/ACS_{year}_1YR_B02001_with_ann.csv').drop([0, 52])
    for year in years
]

In [3]:
# columns that are not used in the cleaned output (e.g., margin of error)
unused_columns = ['GEO.id', 'GEO.id2', 'HD02_VD01', 'HD02_VD02', 'HD02_VD03', 'HD02_VD04',
                  'HD02_VD05', 'HD02_VD06', 'HD02_VD07', 'HD02_VD08', 'HD01_VD09', 'HD02_VD09',
                  'HD01_VD10', 'HD02_VD10']

# Build new dataframes rather than dropping in place. errors='ignore' skips labels
# that are no longer present, so re-running this cell is a no-op instead of a KeyError.
race_dfs = [df.drop(columns=unused_columns, errors='ignore') for df in race_dfs]

In [4]:
# readable labels, listed in the order of the columns that remain in each dataframe
column_names = ['State', 'Total', 'White', 'African American', 'Native American',
                'Asian', 'Pacific Islander', 'Other', 'Two or more races']

# apply the same labels to every year's dataframe
for frame in race_dfs:
    frame.columns = column_names

# confirm successful cleanup
race_dfs[0].head()

Unnamed: 0,State,Total,White,African American,Native American,Asian,Pacific Islander,Other,Two or more races
1,Alabama,4833722,3330478,1284102,22459,58624,1521,54784,81754
2,Alaska,735132,487762,25077,105310,41775,8794,9508,56906
3,Arizona,6626624,5233466,277973,288294,191718,12609,410756,211808
4,Arkansas,2959373,2305726,463928,17704,39210,628,66953,65224
5,California,38332521,23741019,2269021,278377,5210236,142782,4961376,1729710


In [5]:
# list of races in census data: every column except the state label and the total.
# Columns already prefixed with '% ' are skipped so that re-running this cell does not
# mistake previously computed percentage columns for races (which would add '% % ...').
races = [col for col in race_dfs[0].columns
         if col not in ('State', 'Total') and not col.startswith('% ')]

# for each dataframe
for df in race_dfs:

    # the total is the same denominator for every race, so convert it once per dataframe
    total = df['Total'].astype(int)

    # for each race
    for race in races:

        # share of the state's total, stored as a fraction (not multiplied by 100)
        df[f'% {race}'] = df[race].astype(int) / total

# confirm successful cleanup
race_dfs[0].head()

Unnamed: 0,State,Total,White,African American,Native American,Asian,Pacific Islander,Other,Two or more races,% White,% African American,% Native American,% Asian,% Pacific Islander,% Other,% Two or more races
1,Alabama,4833722,3330478,1284102,22459,58624,1521,54784,81754,0.689009,0.265655,0.004646,0.012128,0.000315,0.011334,0.016913
2,Alaska,735132,487762,25077,105310,41775,8794,9508,56906,0.663503,0.034112,0.143253,0.056827,0.011962,0.012934,0.077409
3,Arizona,6626624,5233466,277973,288294,191718,12609,410756,211808,0.789764,0.041948,0.043505,0.028931,0.001903,0.061986,0.031963
4,Arkansas,2959373,2305726,463928,17704,39210,628,66953,65224,0.779127,0.156766,0.005982,0.013249,0.000212,0.022624,0.02204
5,California,38332521,23741019,2269021,278377,5210236,142782,4961376,1729710,0.619344,0.059193,0.007262,0.135922,0.003725,0.12943,0.045124


In [6]:
# write each year's cleaned dataframe to its own CSV, pairing every
# dataframe with the two-digit year suffix it was loaded from
for year, frame in zip(years, race_dfs):
    frame.to_csv(f'data_cleaned/race_by_state_20{year}.csv')