# Clean & Process COVID-19 Ethnicity Files for Pie Chart Symbology Maps
Center for Human Dynamics in the Mobile Age (HDMA) at San Diego State University

Jessica Embury

### MODULES

In [1]:
import pandas as pd
import os

### USER SET VARIABLE FOR FILE DATE
Use the county's file-naming convention for the date, e.g. `28NOV2020`.

In [2]:
#date stamp exactly as it appears in the county's file and folder names (e.g. 28NOV2020)
date = "28NOV2020"


### CREATE LIST OBJECT WITH ALL FILE NAMES

In [3]:
#folder holding the county's ethnicity csv files for the chosen date
path = '../covid_data/ethnicity/{}/'.format(date)

#prefix shared by every county file name
base = 'COVID19_DailyTotalRate_'

#one file per ethnicity group, named <base><group>_<date>.csv
aian_file, api_file, black_file, hispanic_file, white_file, multi_file = [
    base + group + '_' + date + '.csv'
    for group in ('AIAN', 'API', 'Black', 'Hispanic', 'White', 'MultipleRace')
]

#list order sets the order in which ethnicity columns are later added to the sra df
files = [aian_file, api_file, black_file, hispanic_file, white_file, multi_file]

### DATA FRAME WITH SUPPLEMENTAL SRA DATA, COORDINATES

In [5]:
#location of the sra lookup table (sra number, name, latitude, longitude)
sra_in = './data/sra_info.csv'

#load the base sra table that the ethnicity columns get merged onto
sra = pd.read_csv(sra_in)

#row count as a quick sanity check, then preview the first rows
print(sra.shape[0])
sra.head()

41


Unnamed: 0,sra_num,sra_name,latitude,longitude
0,1,Central San Diego,32.722644,-117.141073
1,2,Peninsula,32.742274,-117.216035
2,3,Coronado,32.657317,-117.143062
3,4,National City,32.665847,-117.099737
4,5,Southeastern San Diego,32.685705,-117.038621


### ADD A COLUMN FOR EACH ETHNICITY TO THE SRA DATAFRAME

In [6]:
#add ethnicity data as columns to the sra df
#for each county file that exists: read it, normalize its case-count header to a
#plain ethnicity name, and left-merge that column onto sra by sra name
for file_name in files:

    #not every week has had a file for each ethnicity, so skip ethnicities without a file
    if not os.path.exists(path + file_name):
        continue

    #ethnicity label is the third underscore-separated token of the file name
    ethnicity = file_name.split('_')[2]
    if ethnicity == 'MultipleRace':
        #written with a line break so it can be substituted into the line-broken county headers below
        ethnicity = 'Multiple\nRace'

    #output column name: the label with any line break flattened to a space
    column = ethnicity.replace('\n', ' ')

    #temp df for specific ethnicity
    df = pd.read_csv(path + file_name)

    #add date to sra df from the first file that actually exists
    #(previously only the first file in the list was used, so a missing AIAN file
    # left sra without a Date column and broke the export step)
    if 'Date' not in sra.columns:
        sra['Date'] = df['Date'].iloc[0]

    #add ethnicity columns to sra df
    #multiple column name options to catch naming variations in files from the county
    header_variants = [
        'Allocated {}\nCase Count (Raw)'.format(ethnicity),
        'Allocated {} Case Count (Raw)'.format(column),
        'Allocated\n{} Case Count (Raw)'.format(ethnicity),
        'Allocated\n{} Case\nCount\n(Raw)'.format(ethnicity),
        'Allocated\n{}\nCase\nCount\n(Raw)'.format(ethnicity),
        'Allocated\n{}\nRace\nCase\nCount\n(Raw)'.format(ethnicity),
    ]
    rename_map = {header: column for header in header_variants}
    rename_map['Geography'] = 'sra_name'
    df = df.rename(columns=rename_map)

    #raises KeyError here if none of the header variants matched this file
    #missing values in the county file are treated as zero
    subset_df = df[['sra_name', column]].fillna(0)

    #left merge keeps every sra row; an sra absent from the county file gets NaN
    sra = sra.merge(subset_df, how='left', on='sra_name')

#presentation-ready column names for the exported table
sra = sra.rename(columns = {'sra_num':'SRA',
                            'sra_name':'Name',
                            'latitude':'Latitude',
                            'longitude':'Longitude',
                            'AIAN':'American Indian, AK Native',
                            'API':'Asian, Pacific Islander'})
print(len(sra))
sra.head()
    

41


Unnamed: 0,SRA,Name,Latitude,Longitude,Date,"American Indian, AK Native","Asian, Pacific Islander",Black,Hispanic,White,Multiple Race
0,1,Central San Diego,32.722644,-117.141073,11/28/2020,8.5,179.2,278.3,2682.0,1316.0,61.5
1,2,Peninsula,32.742274,-117.216035,11/28/2020,0.0,25.4,27.8,203.1,502.6,23.0
2,3,Coronado,32.657317,-117.143062,11/28/2020,0.0,0.0,24.0,51.0,152.0,0.0
3,4,National City,32.665847,-117.099737,11/28/2020,0.0,170.3,55.6,1749.5,125.6,10.4
4,5,Southeastern San Diego,32.685705,-117.038621,11/28/2020,12.7,486.4,345.5,3595.3,390.7,56.9


In [7]:
#alphabetize rows by sra name
#(ascending, not in place, quicksort, missing names last -- all pandas defaults)
sra = sra.sort_values(by='Name')
sra.head()

Unnamed: 0,SRA,Name,Latitude,Longitude,Date,"American Indian, AK Native","Asian, Pacific Islander",Black,Hispanic,White,Multiple Race
25,38,Alpine,32.837524,-116.758443,11/28/2020,0.0,5.5,0.0,55.5,147.7,0.0
40,63,Anza-Borrego Springs,33.114514,-116.289908,11/28/2020,0.0,0.0,0.0,9.3,9.9,0.0
28,41,Carlsbad,33.133083,-117.284496,11/28/2020,5.0,60.0,20.2,424.3,732.7,18.4
0,1,Central San Diego,32.722644,-117.141073,11/28/2020,8.5,179.2,278.3,2682.0,1316.0,61.5
15,21,Chula Vista,32.622386,-117.077827,11/28/2020,0.0,174.3,65.2,3992.2,307.2,37.7


### EXPORT CSV FILE

In [10]:
# save sra df as csv
#every row carries the same date, so read it by position rather than by row label
#(row labels are no longer in order once the rows have been sorted by name)
date2 = sra['Date'].iloc[0]

#file name uses the date with the slashes removed
out_name = 'covid_ethnicity_data_{}.csv'.format(date2.replace('/',''))

#copy kept with the project data
sra.to_csv('./data/' + out_name, index=False)

#second copy goes to a user-specific folder; skip it (with a notice) on machines
#where that folder does not exist instead of failing after the first export
extra_dir = 'C:/Users/jesse/Dropbox/Mapping-Vulearable-Pop-Tasks/Ethnicity-Maps/'
if os.path.isdir(extra_dir):
    sra.to_csv(extra_dir + out_name, index=False)
else:
    print('Skipped second export, folder not found: {}'.format(extra_dir))