In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def clean_commute(path):
    """Clean one year of raw census commute data and save it as a county CSV.

    Parameters
    ----------
    path : str
        Location of the raw CSV. The year is parsed from the text between
        '_data/' and '_Data' in this string, so the path must follow the
        '<...>_data/<year>_Data...' layout.

    Returns
    -------
    pandas.DataFrame
        Numeric frame indexed by 'Geographic Area Name' holding the worker
        totals, the three commute-mode counts and every 'TRAVEL TIME' column.
        The same frame is written to '../data/pct_data/<year>_commute_county.csv'.

    Raises
    ------
    KeyError
        If any of the expected commute columns is missing from the file.
    """
    frame = pd.read_csv(path)

    # The year drives both the output file name and the column wording below.
    year = int(path.split('_data/')[1].split('_Data')[0])
    stem = f'{year}_commute'

    # Discard margin-of-error, allocation and stray unnamed columns.
    unwanted = (r'Margin of Error', r'PERCENT ALLOCATED', r'Unnamed:')
    frame = frame.drop(
        columns=[name for name in frame.columns
                 if any(re.search(pat, name) for pat in unwanted)]
    )

    # Strip the 'Estimate' / 'Total' / '!!' decoration from the headers.
    frame.columns = [
        name.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
        for name in frame.columns
    ]

    # Files up to and including 2018 say "at home"; later files say "from home".
    if year <= 2018:
        commuters = 'Workers 16 years and over who did not work at home'
    else:
        commuters = 'Workers 16 years and over who did not work from home'

    wanted = [
        'Geographic Area Name',
        'Workers 16 years and over',
        commuters,
        'Car, truck, or van -- drove alone Workers 16 years and over',
        'Car, truck, or van -- carpooled Workers 16 years and over',
        'Public transportation (excluding taxicab) Workers 16 years and over',
    ]
    wanted.extend(name for name in frame.columns if re.search(r'TRAVEL TIME', name))

    frame = frame[wanted].set_index('Geographic Area Name')

    # Placeholder cells become NaN one token at a time, in the original order.
    for token in ('N', '-', '**'):
        frame = frame.replace(to_replace=token, value=np.nan)

    # Keep only rows that have at least 25 non-missing values.
    frame = frame.dropna(thresh=25)

    frame = frame.apply(pd.to_numeric)

    # Shorten the remaining headers; the two bare "did not work ... home"
    # columns are left alone so they do not collapse to an empty name.
    from_home = 'Workers 16 years and over who did not work from home'
    at_home = 'Workers 16 years and over who did not work at home'
    removable = (at_home, from_home, r'Estimate', r'(excluding taxicab)')

    renames = {}
    for name in frame.columns:
        if name == from_home or name == at_home:
            continue
        if not any(re.search(pat, name) for pat in removable):
            continue
        shortened = name
        for piece in removable:
            shortened = shortened.replace(str(piece), '')
        renames[name] = shortened.strip()

    frame = frame.rename(columns=renames)

    frame.to_csv(f'../data/pct_data/{stem}_county.csv')

    return frame

In [None]:
# Clean every yearly commute file from 2010 through 2022, except 2020.
# A year that fails is reported and skipped so the remaining years still run.
for i in range(2010, 2023):
    if i == 2020:
        continue

    print(i)
    path = f'../data/census_data/{i}_Data.csv'

    try:
        clean_commute(path)
    except Exception as err:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the loop, and show why the year was skipped.
        print(f'Skipped ({type(err).__name__}: {err})')
    else:
        print('Completed')

In [None]:
def clean_age(path):
    """Clean one year of raw census age data and save it as a county CSV.

    Parameters
    ----------
    path : str
        Location of the raw CSV. The year is parsed from the text between
        '_data/' and '_Data' in this string, so the path must follow the
        '<...>_data/<year>_Data...' layout.

    Returns
    -------
    pandas.DataFrame
        Numeric frame indexed by 'Geographic Area Name' holding the worker
        totals, the three commute-mode counts and the age columns.
        The same frame is written to '../data/pct_data/<year>_age_county.csv'.

    Raises
    ------
    KeyError
        If any of the expected base columns is missing from the file.
    """
    at_home = 'Workers 16 years and over who did not work at home'
    from_home = 'Workers 16 years and over who did not work from home'

    data = pd.read_csv(path)

    # The year drives both the output file name and the column wording below.
    year = int(path.split('_data/')[1].split('_Data')[0])
    file = f'{year}_age'

    # Discard margin-of-error, allocation and stray unnamed columns.
    drop_patterns = (r'Margin of Error', r'PERCENT ALLOCATED', r'Unnamed:')
    drop = [col for col in data.columns
            if any(re.search(pattern, col) for pattern in drop_patterns)]
    data = data.drop(columns=drop)

    # Strip the 'Estimate' / 'Total' / '!!' decoration from the headers.
    data.columns = [
        col.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
        for col in data.columns
    ]

    # Placeholder cells become NaN.
    data = data.replace(to_replace='N', value=np.nan).replace(to_replace='-', value=np.nan).replace('**', np.nan)

    # Files up to and including 2018 say "at home" and label ages as 'AGE <n>'
    # plus a 'Median age' column; later files say "from home" and use 'over AGE'.
    if year <= 2018:
        commuters = at_home
        age_patterns = (r'AGE.\d', r'Median age')
    else:
        commuters = from_home
        age_patterns = (r'over.AGE',)

    keep = [
        'Geographic Area Name',
        'Workers 16 years and over',
        commuters,
        'Car, truck, or van -- drove alone Workers 16 years and over',
        'Car, truck, or van -- carpooled Workers 16 years and over',
        'Public transportation (excluding taxicab) Workers 16 years and over',
    ]
    keep.extend(col for col in data.columns
                if any(re.search(pattern, col) for pattern in age_patterns))

    data = data[keep].set_index('Geographic Area Name')

    data = data.apply(pd.to_numeric)

    # Shorten the remaining headers, handling both the "at home" and the
    # "from home" wording (same rule as clean_commute). The two bare
    # "did not work ... home" columns are skipped so they do not collapse to
    # an empty name. These are literal substrings, so test with `in`, not regex.
    removable = (at_home, from_home, 'Estimate', '(excluding taxicab)')

    new_cols = {}
    for col in data.columns:
        if col in (at_home, from_home):
            continue
        if not any(piece in col for piece in removable):
            continue
        new_col = col
        for piece in removable:
            new_col = new_col.replace(piece, '')
        new_cols[col] = new_col.strip()

    data = data.rename(columns=new_cols)

    data.to_csv(f'../data/pct_data/{file}_county.csv')

    return data

In [None]:
# Clean every yearly age file from 2010 through 2022, except 2020.
# A year that fails is reported and skipped so the remaining years still run.
for i in range(2010, 2023):
    if i == 2020:
        continue

    print(i)
    path = f'../data/census_data/{i}_Data.csv'

    try:
        clean_age(path)
    except Exception as err:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the loop, and show why the year was skipped.
        print(f'Skipped ({type(err).__name__}: {err})')
    else:
        print('Completed')

In [None]:
def _short_column_name(col, year):
    """Turn one cleaned census header into a short snake_case column name.

    The replacements run in a fixed order; later steps rely on the output of
    earlier ones, so the order must not be changed.
    """
    new_col = str.lower(col)
    new_col = new_col.replace(' 16 years and over','').replace('car, truck, or van -- ','').strip()
    new_col = new_col.replace('travel time to work ','').replace(' to ','_').replace('less than ','').strip()
    new_col = new_col.replace('-work (minutes)','').replace(' or more','').strip()
    new_col = new_col.replace('public transportation','pub_transit').replace(' workers','').replace('minutes','min').strip()
    new_col = new_col.replace('(years)','').replace(' years','').replace('  ',' ').strip().replace(' ','_')
    new_col = new_col.replace('__','_').replace('_who_did_not_work_at_home','').replace('_who_did_not_work_from_home','')
    if year >= 2018:
        new_col = new_col.replace('age_median','median')
    new_col = new_col.replace('(min)','min')
    new_col = re.sub(r'workers_(\d)',r'\1',new_col)
    new_col = new_col.replace('workers_mean','mean').replace('workers_age','age').replace('workers_median','median')
    return new_col


def configure(path):
    """Load one cleaned county CSV and reshape it for stacking across years.

    Parameters
    ----------
    path : str
        Location of a cleaned county CSV. The year is parsed from the text
        between '_data/' and the next '_' in this string.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'county_name' with short column names, the
        'alone_pct' / 'carpool_pct' / 'transit_pct' percentage columns, and
        'state', 'county' and 'year' columns.

    Raises
    ------
    KeyError
        If an expected column is missing from the file.
    IndexError
        If an index value does not contain ', ' (no state part to split off).
    """
    df = pd.read_csv(path)
    year = int(path.split('_data/')[1].split('_')[0])
    # The original also derived a `csv_type` here, but it indexed the year
    # string instead of the split result and the value was never used.

    df = df.set_index('Geographic Area Name')

    # Drop the all-workers total; percentages are taken over commuters only.
    df = df.drop(columns = 'Workers 16 years and over')

    # The commuter total is named 'workers'; everything else is shortened.
    commuter_totals = ('Workers 16 years and over who did not work from home',
                       'Workers 16 years and over who did not work at home')
    df.columns = [
        'workers' if col in commuter_totals else _short_column_name(col, year)
        for col in df.columns
    ]

    df.index.name = 'county_name'

    # Share of commuters using each mode, as a percentage rounded to 2 places.
    df['alone_pct']   = round( (df['drove_alone'] / df['workers']) * 100, 2)
    df['carpool_pct'] = round( (df['carpooled'] / df['workers']) * 100, 2)
    df['transit_pct'] = round( (df['pub_transit'] / df['workers']) * 100, 2)

    # Index values are split on ', ': first part is the county, second the state.
    parts = [county_name.split(', ') for county_name in df.index]
    df['state'] = [part[1] for part in parts]
    df['county'] = [part[0] for part in parts]
    df['year'] = year

    return df

In [None]:
# Stack the per-year county files of each category into one frame per category.
# NOTE(review): this reads from ../data/unfiltered_data/, while clean_commute and
# clean_age write their per-year files to ../data/pct_data/ — confirm the files
# are expected to be in unfiltered_data before running on a fresh checkout.
categories = ['commute','age']
df_dict = {}

for cat in categories:
    yearly_frames = []
    row_count = 0
    print('RESET DF')

    for i in range(2010,2023):
        if i == 2020:
            continue

        print(cat,i,row_count)
        path = f'../data/unfiltered_data/{i}_{cat}_county.csv'
        subset = configure(path)
        yearly_frames.append(subset)
        row_count += len(subset.index)
        print(row_count)

    # Concatenate once per category instead of growing a frame inside the loop
    # (which re-copies all earlier rows each time and starts from an empty frame).
    df = pd.concat(yearly_frames)
    df_dict[cat] = df

In [None]:
# Take the stacked commute table out of df_dict and write it to a CSV file.
comm = df_dict['commute']
comm.to_csv('../data/pct_data/comm.csv')

In [None]:
# Take the stacked age table out of df_dict and write it to a CSV file.
age = df_dict['age']
age.to_csv('../data/pct_data/age.csv')