In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def _commute_percent_plan(layout):
    """Return ``{percent_column_position: base_column_position}`` for a layout.

    Positions are 0-based offsets into the frame's columns once the county
    name has become the index.  Columns 1-4 hold the head-counts that the
    percentages refer to (commuters, drove alone, carpooled, public transit);
    percentage columns start at position 5.

    layout 0 (before 2018): positions 5-40 cycle total / alone / carpool / transit.
    layout 1 (after 2018):  four runs of nine columns, one run per mode, with
                            one untouched column after each run.
    layout 2 (2018):        nine "total" columns, one untouched column, then
                            positions 15-41 cycle alone / carpool / transit.
    """
    total, alone, carpool, transit = 1, 2, 3, 4

    if layout == 0:
        cycle = (total, alone, carpool, transit)
        return {pos: cycle[(pos - 5) % 4] for pos in range(5, 41)}

    # Both remaining layouts begin with the nine "total" columns at 5-13.
    plan = {pos: total for pos in range(5, 14)}

    if layout == 1:
        for base, start in ((alone, 15), (carpool, 25), (transit, 35)):
            plan.update({pos: base for pos in range(start, start + 9)})
    else:
        cycle = (alone, carpool, transit)
        plan.update({pos: cycle[(pos - 15) % 3] for pos in range(15, 42)})

    return plan


def _commute_percents_to_counts(data, plan):
    """Replace each percentage column in ``plan`` with a truncated head-count.

    For every ``percent position -> base position`` pair the new value is
    ``trunc(base * (percent / 100))``; a missing base or percentage yields NaN.
    Working on whole columns avoids label-based writes, so rows that happen to
    share an index label can no longer overwrite each other.
    """
    # With no rows there is nothing to convert (and nothing to validate).
    if len(data.index) == 0:
        return data

    for pct_pos, base_pos in plan.items():
        pct_col = data.iloc[:, pct_pos]
        base = data.iloc[:, base_pos].to_numpy(dtype=float)
        counts = np.trunc(base * (pct_col.to_numpy(dtype=float) / 100))

        # Keep an integer column integer when every result is a real number.
        if pd.api.types.is_integer_dtype(pct_col.dtype) and np.isfinite(counts).all():
            counts = counts.astype('int64')

        data[data.columns[pct_pos]] = counts

    return data


def clean_commute(path):
    """Clean one yearly census commute extract and save it per county.

    Parameters
    ----------
    path : str
        CSV to read.  The year is parsed from the text between ``'_data/'``
        and ``'_Data'`` in the path (e.g. ``'../data/census_data/2010_Data.csv'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Geographic Area Name'``: the four head-count columns plus
        every ``TRAVEL TIME`` column, with the percentage columns selected by
        the year's layout converted to truncated head-counts.

    Side effects
    ------------
    Writes the frame to ``../data/unfiltered_data/<year>_commute_county.csv``.
    """
    ### read in data
    data = pd.read_csv(path)

    ### create name of file for saving
    year = int(path.split('_data/')[1].split('_Data')[0])
    file = str(year) + '_commute'

    ### get rid of specific columns
    unwanted = re.compile(r'Margin of Error|PERCENT ALLOCATED|Unnamed:')
    data = data.drop(columns=[col for col in data.columns if unwanted.search(col)])

    ### clean up columns names
    data.columns = [
        col.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
        for col in data.columns
    ]

    ### the column layout (and one column name) depends on the year
    if year > 2018:
        layout = 1
    elif year == 2018:
        layout = 2
    else:
        layout = 0

    ### keep specific commute related columns
    not_home = 'at home' if layout % 2 == 0 else 'from home'
    keep = ['Geographic Area Name',
            'Workers 16 years and over',
            f'Workers 16 years and over who did not work {not_home}',
            'Car, truck, or van -- drove alone Workers 16 years and over',
            'Car, truck, or van -- carpooled Workers 16 years and over',
            'Public transportation (excluding taxicab) Workers 16 years and over']
    keep += [col for col in data.columns if re.search(r'TRAVEL TIME', col)]

    ### Set county as index
    data = data[keep].set_index('Geographic Area Name')

    ### replace N and - with NaN, then drop rows with fewer than 25 usable values
    data = data.replace(to_replace='N', value=np.nan).replace(to_replace='-', value=np.nan).replace('**', np.nan)
    data = data.dropna(thresh=25)

    ### make dtypes numeric
    data = data.apply(pd.to_numeric)

    ### get rid of percentages by translating them to integers BASED ON the layout
    data = _commute_percents_to_counts(data, _commute_percent_plan(layout))

    ### Clean up column names some more
    commuter_names = ('Workers 16 years and over who did not work at home',
                      'Workers 16 years and over who did not work from home')
    fragments = commuter_names + ('Estimate', '(excluding taxicab)')

    renamed = {}
    for col in data.columns:
        # The commuter head-count column itself must keep its full name.
        if col in commuter_names:
            continue
        new_col = col
        for fragment in fragments:
            new_col = new_col.replace(fragment, '')
        new_col = new_col.strip()
        if new_col != col:
            renamed[col] = new_col

    data = data.rename(columns=renamed)

    ### save county file
    data.to_csv(f'../data/unfiltered_data/{file}_county.csv')

    return data

In [3]:
# Run the cleaner on the 2010 extract; the returned frame is displayed as the cell output.
clean_commute('../data/census_data/2010_Data.csv')

Unnamed: 0_level_0,Workers 16 years and over,Workers 16 years and over who did not work at home,"Car, truck, or van -- drove alone Workers 16 years and over","Car, truck, or van -- carpooled Workers 16 years and over",Public transportation Workers 16 years and over,TRAVEL TIME TO WORK Less than 10 minutes,"Car, truck, or van -- drove alone TRAVEL TIME TO WORK Less than 10 minutes","Car, truck, or van -- carpooled TRAVEL TIME TO WORK Less than 10 minutes",Public transportation TRAVEL TIME TO WORK Less than 10 minutes,TRAVEL TIME TO WORK 10 to 14 minutes,...,"Car, truck, or van -- carpooled TRAVEL TIME TO WORK 45 to 59 minutes",Public transportation TRAVEL TIME TO WORK 45 to 59 minutes,TRAVEL TIME TO WORK 60 or more minutes,"Car, truck, or van -- drove alone TRAVEL TIME TO WORK 60 or more minutes","Car, truck, or van -- carpooled TRAVEL TIME TO WORK 60 or more minutes",Public transportation TRAVEL TIME TO WORK 60 or more minutes,TRAVEL TIME TO WORK Mean travel time to work (minutes),"Car, truck, or van -- drove alone TRAVEL TIME TO WORK Mean travel time to work (minutes)","Car, truck, or van -- carpooled TRAVEL TIME TO WORK Mean travel time to work (minutes)",Public transportation TRAVEL TIME TO WORK Mean travel time to work (minutes)
Geographic Area Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Calhoun County, Alabama",43874,,38242,4003,136,,5545.0,340.0,0.0,,...,196.0,0.0,,1453.0,384.0,0.0,21.4,21.3,22.7,
"Houston County, Alabama",50739,49736.0,45459,3365,113,7311.0,6818.0,299.0,0.0,10742.0,...,255.0,0.0,1293.0,1045.0,201.0,25.0,20.3,19.9,25.6,47.8
"Jefferson County, Alabama",350866,342821.0,298194,36354,2573,24683.0,19978.0,2835.0,0.0,39767.0,...,3526.0,421.0,25025.0,19382.0,4253.0,838.0,27.0,26.7,29.7,44.5
"Lee County, Alabama",53658,52066.0,40828,7187,478,10204.0,7430.0,1157.0,0.0,10152.0,...,675.0,0.0,2447.0,1469.0,884.0,0.0,20.9,20.0,29.9,15.5
"Madison County, Alabama",196561,190460.0,167379,17905,776,18093.0,16068.0,1593.0,45.0,22283.0,...,1683.0,138.0,9142.0,7197.0,1575.0,225.0,24.1,23.9,27.2,39.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Caguas Municipio, Puerto Rico",51886,51005.0,39557,6781,1032,3825.0,2096.0,501.0,0.0,3978.0,...,712.0,158.0,5967.0,4034.0,1512.0,246.0,28.7,28.6,34.2,36.7
"Carolina Municipio, Puerto Rico",61588,58942.0,48047,6498,2661,6306.0,4756.0,890.0,74.0,6837.0,...,500.0,114.0,8782.0,6630.0,896.0,1061.0,29.8,29.4,28.8,48.7
"Guaynabo Municipio, Puerto Rico",58744,57427.0,46345,6515,1587,3962.0,2548.0,0.0,0.0,4479.0,...,912.0,192.0,12404.0,9732.0,1583.0,863.0,35.4,35.4,37.8,64.6
"Ponce Municipio, Puerto Rico",60858,59582.0,50197,5790,661,6613.0,5371.0,613.0,0.0,7149.0,...,243.0,37.0,4587.0,3714.0,700.0,40.0,24.6,24.9,25.9,27.2


In [None]:
# Clean every yearly commute extract from 2010 through 2022, leaving out 2020.
# A failing year is reported together with its error instead of being hidden.
for i in range(2010, 2023):
    if i == 2020:
        continue

    print(i)
    path = f'../data/census_data/{i}_Data.csv'

    try:
        clean_commute(path)
    except Exception as exc:
        print('Skipped', f'({type(exc).__name__}: {exc})')
    else:
        print('Completed')
#end

In [None]:
def _age_percent_plan(layout):
    """Return ``{percent_column_position: base_column_position}`` for a layout.

    Positions are 0-based offsets into the frame's columns once the county
    name has become the index.  Columns 1-4 hold the head-counts that the
    percentages refer to (commuters, drove alone, carpooled, public transit);
    percentage columns start at position 5.

    layout 0 (before 2018): positions 5-28 cycle total / alone / carpool / transit.
    layout 1 (after 2018):  four runs of six columns, one run per mode, with
                            one untouched column after each run.
    layout 2 (2018):        six "total" columns, one untouched column, then
                            positions 12-29 cycle alone / carpool / transit.
    """
    total, alone, carpool, transit = 1, 2, 3, 4

    if layout == 0:
        cycle = (total, alone, carpool, transit)
        return {pos: cycle[(pos - 5) % 4] for pos in range(5, 29)}

    # Both remaining layouts begin with the six "total" columns at 5-10.
    plan = {pos: total for pos in range(5, 11)}

    if layout == 1:
        for base, start in ((alone, 12), (carpool, 19), (transit, 26)):
            plan.update({pos: base for pos in range(start, start + 6)})
    else:
        cycle = (alone, carpool, transit)
        plan.update({pos: cycle[(pos - 12) % 3] for pos in range(12, 30)})

    return plan


def _age_percents_to_counts(data, plan):
    """Replace each percentage column in ``plan`` with a truncated head-count.

    For every ``percent position -> base position`` pair the new value is
    ``trunc(base * (percent / 100))``; a missing base or percentage yields NaN.
    Working on whole columns avoids label-based writes, so rows that happen to
    share an index label can no longer overwrite each other.
    """
    # With no rows there is nothing to convert (and nothing to validate).
    if len(data.index) == 0:
        return data

    for pct_pos, base_pos in plan.items():
        pct_col = data.iloc[:, pct_pos]
        base = data.iloc[:, base_pos].to_numpy(dtype=float)
        counts = np.trunc(base * (pct_col.to_numpy(dtype=float) / 100))

        # Keep an integer column integer when every result is a real number.
        if pd.api.types.is_integer_dtype(pct_col.dtype) and np.isfinite(counts).all():
            counts = counts.astype('int64')

        data[data.columns[pct_pos]] = counts

    return data


def clean_age(path):
    """Clean one yearly census extract down to its age columns and save it per county.

    Parameters
    ----------
    path : str
        CSV to read.  The year is parsed from the text between ``'_data/'``
        and ``'_Data'`` in the path (e.g. ``'../data/census_data/2010_Data.csv'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Geographic Area Name'``: the four head-count columns plus
        the age columns matched for the year's layout, with the percentage
        columns converted to truncated head-counts.

    Side effects
    ------------
    Writes the frame to ``../data/unfiltered_data/<year>_age_county.csv``.
    """
    ### read in data
    data = pd.read_csv(path)

    ### create name of file for saving
    year = int(path.split('_data/')[1].split('_Data')[0])
    file = str(year) + '_age'

    ### get rid of specific columns
    unwanted = re.compile(r'Margin of Error|PERCENT ALLOCATED|Unnamed:')
    data = data.drop(columns=[col for col in data.columns if unwanted.search(col)])

    ### clean up columns names
    data.columns = [
        col.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
        for col in data.columns
    ]

    ### replace N and - with NaN
    data = data.replace(to_replace='N', value=np.nan).replace(to_replace='-', value=np.nan).replace('**', np.nan)

    ### the column layout (and the column names) depend on the year
    if year > 2018:
        layout = 1
    elif year == 2018:
        layout = 2
    else:
        layout = 0

    ### keep specific age related columns
    if layout % 2 == 0:
        age_patterns = (r'AGE.\d', r'Median age')
        not_home = 'at home'
    else:
        # Only one pattern is needed here; the second is a filler that keeps both branches alike.
        age_patterns = (r'over.AGE', r'PLACEHOLDER TEXT')
        not_home = 'from home'

    keep = ['Geographic Area Name',
            'Workers 16 years and over',
            f'Workers 16 years and over who did not work {not_home}',
            'Car, truck, or van -- drove alone Workers 16 years and over',
            'Car, truck, or van -- carpooled Workers 16 years and over',
            'Public transportation (excluding taxicab) Workers 16 years and over']
    keep += [col for col in data.columns
             if any(re.search(pattern, col) for pattern in age_patterns)]

    ### Set county as index
    data = data[keep].set_index('Geographic Area Name')

    ### make dtypes numeric
    data = data.apply(pd.to_numeric)

    ### get rid of percentages by translating them to integers BASED ON the layout
    data = _age_percents_to_counts(data, _age_percent_plan(layout))

    ### Clean up column names some more
    # Both spellings of the commuter column are protected and stripped, so
    # "at home" years are treated exactly like "from home" years.
    commuter_names = ('Workers 16 years and over who did not work at home',
                      'Workers 16 years and over who did not work from home')
    fragments = commuter_names + ('Estimate', '(excluding taxicab)')

    renamed = {}
    for col in data.columns:
        # The commuter head-count column itself must keep its full name.
        if col in commuter_names:
            continue
        new_col = col
        for fragment in fragments:
            new_col = new_col.replace(fragment, '')
        new_col = new_col.strip()
        if new_col != col:
            renamed[col] = new_col

    data = data.rename(columns=renamed)

    ### save county file
    data.to_csv(f'../data/unfiltered_data/{file}_county.csv')

    return data

In [None]:
# Clean the age columns of every yearly extract from 2010 through 2022, leaving out 2020.
# A failing year is reported together with its error instead of being hidden.
for i in range(2010, 2023):
    if i == 2020:
        continue

    print(i)
    path = f'../data/census_data/{i}_Data.csv'

    try:
        clean_age(path)
    except Exception as exc:
        print('Skipped', f'({type(exc).__name__}: {exc})')
    else:
        print('Completed')
#end

In [6]:
def configure(path):
    """Load one cleaned county CSV and normalise it for stacking across years.

    Parameters
    ----------
    path : str
        CSV written by the cleaning step.  The year is parsed from the text
        between ``'_data/'`` and the next ``'_'`` in the path
        (e.g. ``'../data/unfiltered_data/2010_commute_county.csv'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``county_name``, with short snake_case column names, the
        overall ``'Workers 16 years and over'`` column dropped, three share
        columns (``alone_pct``, ``carpool_pct``, ``transit_pct``) and
        ``state`` / ``county`` / ``year`` columns appended.
    """
    df = pd.read_csv(path)
    year = int(path.split('_data/')[1].split('_')[0])

    ### set county as index
    df = df.set_index('Geographic Area Name')

    ### drop total column since we're only interested in people who travel
    df = df.drop(columns = 'Workers 16 years and over')

    ### change column names
    commuter_names = ('Workers 16 years and over who did not work from home',
                      'Workers 16 years and over who did not work at home')
    new_cols = []

    for col in df.columns:
        if col in commuter_names:
            new_cols.append('workers')
            continue

        # The replacements below are order-sensitive; keep them in this sequence.
        new_col = str.lower(col)
        new_col = new_col.replace(' 16 years and over','').replace('car, truck, or van -- ','').strip()
        new_col = new_col.replace('travel time to work ','').replace(' to ','_').replace('less than ','').strip()
        new_col = new_col.replace('-work (minutes)','').replace(' or more','').strip()
        new_col = new_col.replace('public transportation','pub_transit').replace(' workers','').replace('minutes','min').strip()
        new_col = new_col.replace('(years)','').replace(' years','').replace('  ',' ').strip().replace(' ','_')
        new_col = new_col.replace('__','_').replace('_who_did_not_work_at_home','').replace('_who_did_not_work_from_home','')
        if year >= 2018:
            new_col = new_col.replace('age_median','median')
        new_col = new_col.replace('(min)','min')
        new_col = re.sub(r'workers_(\d)',r'\1',new_col)
        new_col = new_col.replace('workers_mean','mean').replace('workers_age','age').replace('workers_median','median')
        new_cols.append(new_col)

    df.columns = new_cols

    ### change index name
    df.index.name = 'county_name'

    ### add percentage columns (share of commuters, rounded to two decimals)
    df['alone_pct']   = ((df['drove_alone'] / df['workers']) * 100).round(2)
    df['carpool_pct'] = ((df['carpooled'] / df['workers']) * 100).round(2)
    df['transit_pct'] = ((df['pub_transit'] / df['workers']) * 100).round(2)

    ### add state and year columns
    # Index labels look like "<county>, <state>".
    name_parts = [county_name.split(', ') for county_name in df.index]
    df['state'] = [parts[1] for parts in name_parts]
    df['county'] = [parts[0] for parts in name_parts]
    df['year'] = year

    return df

In [7]:
# Run configure on the cleaned 2010 commute file; the returned frame is displayed as the cell output.
configure('../data/unfiltered_data/2010_commute_county.csv')

Unnamed: 0_level_0,workers,drove_alone,carpooled,pub_transit,10_min,drove_alone_10_min,carpooled_10_min,pub_transit_10_min,10_14_min,drove_alone_10_14_min,...,mean_min,drove_alone_mean_min,carpooled_mean_min,pub_transit_mean_min,alone_pct,carpool_pct,transit_pct,state,county,year
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Calhoun County, Alabama",,38242,4003,136,,5545.0,340.0,0.0,,7151.0,...,21.4,21.3,22.7,,,,,Alabama,Calhoun County,2010
"Houston County, Alabama",49736.0,45459,3365,113,7311.0,6818.0,299.0,0.0,10742.0,9773.0,...,20.3,19.9,25.6,47.8,91.40,6.77,0.23,Alabama,Houston County,2010
"Jefferson County, Alabama",342821.0,298194,36354,2573,24683.0,19978.0,2835.0,0.0,39767.0,34888.0,...,27.0,26.7,29.7,44.5,86.98,10.60,0.75,Alabama,Jefferson County,2010
"Lee County, Alabama",52066.0,40828,7187,478,10204.0,7430.0,1157.0,0.0,10152.0,8247.0,...,20.9,20.0,29.9,15.5,78.42,13.80,0.92,Alabama,Lee County,2010
"Madison County, Alabama",190460.0,167379,17905,776,18093.0,16068.0,1593.0,45.0,22283.0,19248.0,...,24.1,23.9,27.2,39.2,87.88,9.40,0.41,Alabama,Madison County,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Caguas Municipio, Puerto Rico",51005.0,39557,6781,1032,3825.0,2096.0,501.0,0.0,3978.0,2729.0,...,28.7,28.6,34.2,36.7,77.56,13.29,2.02,Puerto Rico,Caguas Municipio,2010
"Carolina Municipio, Puerto Rico",58942.0,48047,6498,2661,6306.0,4756.0,890.0,74.0,6837.0,5525.0,...,29.8,29.4,28.8,48.7,81.52,11.02,4.51,Puerto Rico,Carolina Municipio,2010
"Guaynabo Municipio, Puerto Rico",57427.0,46345,6515,1587,3962.0,2548.0,0.0,0.0,4479.0,3475.0,...,35.4,35.4,37.8,64.6,80.70,11.34,2.76,Puerto Rico,Guaynabo Municipio,2010
"Ponce Municipio, Puerto Rico",59582.0,50197,5790,661,6613.0,5371.0,613.0,0.0,7149.0,5270.0,...,24.6,24.9,25.9,27.2,84.25,9.72,1.11,Puerto Rico,Ponce Municipio,2010


In [8]:
# Stack the configured yearly files of each category into one frame per category.
categories = ['commute','age']
df_dict = {}

for cat in categories:
    yearly_frames = []
    row_total = 0  # running row count, printed before and after each year
    print('RESET DF')

    for i in range(2010,2023):
        if i == 2020:
            continue

        print(cat,i,row_total)
        path = f'../data/unfiltered_data/{i}_{cat}_county.csv'
        subset = configure(path)
        yearly_frames.append(subset)
        row_total += len(subset.index)
        print(row_total)
    #end

    # One concat per category instead of re-copying the growing frame every year.
    df = pd.concat(yearly_frames)
    df_dict[cat] = df
#end

RESET DF
commute 2010 0
535
commute 2011 535
1092
commute 2012 1092
1686
commute 2013 1686
2271
commute 2014 2271
2856
commute 2015 2856
3436
commute 2016 3436
4001
commute 2017 4001
4539
commute 2018 4539
5069
commute 2019 5069
5595
commute 2021 5595
6054
commute 2022 6054
6587
RESET DF
age 2010 0
818
age 2011 818
1640
age 2012 1640
2465
age 2013 2465
3293
age 2014 3293
4121
age 2015 4121
4951
age 2016 4951
5782
age 2017 5782
6619
age 2018 6619
7457
age 2019 7457
8297
age 2021 8297
9138
age 2022 9138
9986


In [9]:
# Pull the stacked commute frame out of df_dict and write it to CSV.
comm = df_dict['commute']
comm.to_csv('../data/unfiltered_data/comm.csv')

In [10]:
# Pull the stacked age frame out of df_dict and write it to CSV.
age = df_dict['age']
age.to_csv('../data/unfiltered_data/age.csv')

In [None]:
# Total of the 'workers' column over the 2022 rows (row mask and column picked in one .loc call).
age.loc[age.year == 2022, 'workers'].sum()

In [None]:
import seaborn as sns

In [None]:
# Sum the 'workers' column per year and keep the result as a one-column frame.
work_year = age.groupby('year')['workers'].sum().to_frame()
work_year

In [None]:
# Line plot of the yearly worker totals; the trailing semicolon hides the returned object's repr.
sns.lineplot(x = work_year.index, y = work_year.workers);