# Format Ethnicity Data from [SANDAG Data Surfer](https://datasurfer.sandag.org/)
Center for Human Dynamics in the Mobile Age (HDMA) at San Diego State University

Jessica Embury

### MODULES

In [None]:
import pandas as pd

### FUNCTION TO CLEAN & EXPORT DATA SURFER FILES
Supports all demographic categories (Ethnicity, Housing, Income, etc.)

In [1]:
def fix_data_surfer_file (path_in, year, category, option, numbers_column, path_out):
    '''
    Format San Diego Data Surfer files for use in ArcGIS Pro, export formatted CSV file.
    '''
    
    #create dataframe from unformatted data
    raw_df = pd.read_csv(path_in)
    
    #subset data for desired year only
    year_df = raw_df.query('YEAR == {}'.format(year))
    print(year_df.head())
    
    #get data frame with list of all SRAs, eliminate duplicates
    sra_df = year_df[['SRA']]
    print(sra_df.head())
    
    #sra_df.drop_duplicates(keep = False) - didn't work in testing
    sra_df['duplicate'] = sra_df['SRA'].duplicated()
    
    df = sra_df.query('duplicate == False')
    del df['duplicate']
    df = df.reset_index(drop = True)
    print(df.head())
    
    #add each category option as a column in the dataframe
    for i in range(len(option)):
        
        temp_df = year_df.query('{} == "{}"'.format(category, option[i]))
        print(temp_df.head())
        temp_df = temp_df.rename(columns = ({numbers_column : option[i]}))
        print(temp_df.head())
        del temp_df['YEAR']
        print(temp_df.head())
        del temp_df[category]
        print(temp_df.head())
        
        if (i == 0):
            print(temp_df.head())
            df = pd.merge(df, temp_df, on = df.SRA, how = 'outer')
            print('i==0')
        else: 
            df = df.rename(columns = {'key_0':'SRA'})
            df = pd.merge(df, temp_df, on = df.SRA, how = 'inner')
        
            
        del df['SRA_x']
        del df['SRA_y']
        
        df[option[i]] = pd.to_numeric(df[option[i]])
        
    df = df.rename(columns = {'key_0':'SRA'}) 
        
    #get sum of all categories
    df['total'] = df.sum(axis=1)
        
    df.to_csv(path_out)            
        

### PARAMETERS

In [4]:
#Ethnicity data parameter
path_in = 'datasurfer_ethnicity_sra_2019.csv'
year = '2019'
category = 'ETHNICITY'
option = ['American Indian', 'Asian', 'Black', 'Hispanic', 'Other', 'Pacific Islander', 'Two or More', 'White']
numbers_column = 'POPULATION'
path_out = 'ethnicity_sra_2019.csv'

### FUNCTION CALL

In [5]:
#fix ethnicity data
fix_data_surfer_file (path_in, year, category, option, numbers_column, path_out)

       SRA  YEAR        ETHNICITY  POPULATION
72  Alpine  2019  American Indian         131
73  Alpine  2019            Asian         668
74  Alpine  2019            Black         238
75  Alpine  2019         Hispanic        2377
76  Alpine  2019            Other          39
       SRA
72  Alpine
73  Alpine
74  Alpine
75  Alpine
76  Alpine
                    SRA
0                Alpine
1  Anza-Borrego Springs
2              Carlsbad
3     Central San Diego
4           Chula Vista
                      SRA  YEAR        ETHNICITY  POPULATION
72                 Alpine  2019  American Indian         131
152  Anza-Borrego Springs  2019  American Indian          24
232              Carlsbad  2019  American Indian         672
312     Central San Diego  2019  American Indian        1272
392           Chula Vista  2019  American Indian         559
                      SRA  YEAR        ETHNICITY  American Indian
72                 Alpine  2019  American Indian              131
152  Anza-Borreg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
