In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the raw 2021 table from the census_data folder; the bare name on the
# last line makes the notebook display the frame.
commute = pd.read_csv('../data/census_data/2021_Data.csv')
commute

Let's get rid of those pesky Error and Percent Allocated columns!

In [None]:
# Gather, in column order, every header that mentions a margin of error or an
# allocation percentage; the list is consumed by the drop in the next cell.
pattern  = re.compile('Margin of Error')
pattern2 = re.compile('PERCENT ALLOCATED')

drop = [
    col
    for col in commute.columns
    if pattern.search(col) or pattern2.search(col)
]

In [None]:
commute = commute.drop(columns = drop)

And the random Unnamed column while we're at it.

In [None]:
# Drop every auto-generated 'Unnamed: N' column instead of hard-coding
# 'Unnamed: 778': the number depends on how wide the file is, and a fixed label
# raises KeyError when the cell is re-run or pointed at another year's table.
unnamed = [col for col in commute.columns if 'Unnamed:' in col]
commute = commute.drop(columns = unnamed)
#commute

Having the letter "N" doesn't help... let's replace it with NaN.

In [None]:
commute = commute.replace(to_replace='N',value=np.nan)
#commute

And let's clean up those column names! There's a lot going on with them.

In [None]:
# Remove the 'Estimate!!Total!!' prefix, turn the remaining '!!' separators
# into single spaces and trim whitespace at both ends.
new_cols = [
    col.replace('Estimate!!Total!!', '').replace('!!', ' ').strip()
    for col in commute.columns
]

In [None]:
new_cols

In [None]:
commute.columns = new_cols
#commute

Pull the columns strictly related to commute times.

In [None]:
pattern = re.compile('TRAVEL TIME')

# Columns without 'TRAVEL TIME' in the name that still have to stay
keep_cols = {
    'Geographic Area Name',
    'Workers 16 years and over',
    'Workers 16 years and over who did not work from home',
    'Estimate Car, truck, or van -- drove alone Workers 16 years and over',
    'Estimate Car, truck, or van -- carpooled Workers 16 years and over',
    'Estimate Public transportation (excluding taxicab) Workers 16 years and over',
}

# Everything that is neither a travel-time column nor explicitly kept goes
drop = [
    col
    for col in commute.columns
    if pattern.search(col) is None and col not in keep_cols
]

commutes = commute.drop(columns = drop)

In [None]:
commutes

In [None]:
commutes = commutes.set_index('Geographic Area Name')
#commutes

Checking my home county just to make sure it works as intended.

In [None]:
commutes.loc['Montgomery County, Maryland']

In [None]:
commutes.dtypes

## The data types are wrong. This will have to be something that will be cleaned up.

And we can't groupby anything because apparently there are '-'  floating around...

In [None]:
commutes = commutes.replace(to_replace = '-', value = np.nan)

In [None]:
commutes = commutes.apply(pd.to_numeric)
commutes.dtypes

In [None]:
commutes.head(12)

With that done, let's make a 'State' column (we'll group by it later).

In [None]:
# Each index label is split on ', ' and the second piece is taken as the state
# (e.g. 'Montgomery County, Maryland' -> 'Maryland').
states = [area.split(', ')[1] for area in commutes.index]

In [None]:
commutes['State'] = states
#commutes

In [None]:
commutes.groupby('State').mean()

In [None]:
#pd.set_option('display.max_columns', None)
#commutes

There are a lot of counties that have mostly null values... Let's drop those rows.

In [None]:
commutes_nan = commutes.dropna(thresh = 30)
commutes_nan

In [None]:
commutes_nan.groupby('State').mean()

I need to be careful with how I approach this data now. Taking averages of percents can be useful (I literally did a mathematical proof that shows that taking averages of percentages will still yield a row that sums to 100%), but if each sample size is different, percentages will become skewed and meaningless.

Averages are fine for the mean travel times, but other than that, they should be avoided. I would like to get rid of the percentages by multiplying the total number of people by the percentage, but this dataset doesn't actually give me a breakdown of how many people commute, much less which category they fall under. I can maybe try looking for this info, but I'm not sure I would find it. Perhaps I should start focusing somewhere else.

# OF COURSE THE DATA WAS ALREADY IN THE DATASET. I had simply dropped it earlier. *Now revised: See Cell 10*

Well, this should be fun then.

First, I need to find a way to get rid of the percentages and put in the actual number of people. This way, I can take averages without skewing anything.

In [None]:
# Convert every block of travel-time percentages into head-counts.
#
# Column positions (0-based, index not counted) used here:
#    1 -> total commuters,      followed by nine percentage columns ( 2-10)
#   12 -> drove-alone total,    followed by nine percentage columns (13-21)
#   23 -> carpool total,        followed by nine percentage columns (24-32)
#   34 -> public-transit total, followed by nine percentage columns (35-43)
#
# The arithmetic is the same as before, total * (pct / 100) truncated to a
# whole number, but done one column at a time.  A NaN total or percentage
# simply propagates to NaN, which is what the old bare `except:` produced,
# without also hiding unrelated errors.
commutes_nan = commutes_nan.copy()  # own the data before assigning into it

for total_pos in (1, 12, 23, 34):
    totals = commutes_nan.iloc[:, total_pos]

    for pct_pos in range(total_pos + 1, total_pos + 10):
        col = commutes_nan.columns[pct_pos]
        commutes_nan[col] = np.trunc(totals * (commutes_nan.iloc[:, pct_pos] / 100))

In [None]:
#commutes_nan

Since a lot of these rows are null, I'm gonna drop any that are mostly null.

In [None]:
# Filter the frame that already holds the converted head-counts.  Rebuilding it
# from `commutes` here would silently bring the raw percentages back and the
# state averages below would be averages of percentages again.
commutes_nan = commutes_nan.dropna(thresh = 30)
#commutes_nan

Now, let's group by State.

In [None]:
commutes_avg = commutes_nan.groupby('State').mean().astype(int)
commutes_avg

In [None]:
commutes_avg.sort_values('Workers 16 years and over who did not work from home TRAVEL TIME TO WORK Mean travel time to work (minutes)', ascending = False)

Cleaning up the column names... again.

In [None]:
pattern1 = r'Workers 16 years and over who did not work from home'
pattern2 = r'Estimate'
pattern3 = r'(excluding taxicab)'

# Map old header -> shortened header.  The column that is exactly the
# "did not work from home" total keeps its name; any other column matching one
# of the three patterns has those fragments removed and is trimmed.
new_cols = {}

for col in commutes_avg.columns:
    if col == 'Workers 16 years and over who did not work from home':
        continue
    if not any(re.search(fragment, col) for fragment in (pattern1, pattern2, pattern3)):
        continue

    shortened = col
    for fragment in (pattern1, pattern2, pattern3):
        shortened = shortened.replace(str(fragment), '')
    new_cols[col] = shortened.strip()

In [None]:
commutes_avg = commutes_avg.rename(columns = new_cols)
commutes_avg

My goal is to now do this process for every dataset I have access to... which would be easy with a function.

In [None]:
def _percent_to_count(data, total_pos, pct_positions):
    """Overwrite percentage columns of *data* with truncated head-counts.

    Every column whose position is in *pct_positions* is replaced by
    ``trunc(total * (pct / 100))`` where ``total`` is the column at
    *total_pos*.  Positions are 0-based and do not count the index.  A NaN in
    either the total or the percentage gives NaN.
    """
    totals = data.iloc[:, total_pos]
    for pos in pct_positions:
        data[data.columns[pos]] = np.trunc(totals * (data.iloc[:, pos] / 100))


def avg_commute(path):
    """Clean one commute table and aggregate it by state.

    Reads the CSV at *path*, removes margin-of-error / percent-allocated /
    unnamed columns, keeps the worker totals plus every ``TRAVEL TIME`` column,
    converts the travel-time percentages into head-counts and groups the
    result by state.

    Two files are written to ``../data/cleaned_data/`` (relative to the
    current working directory): ``<name>_Commute_County_Sum.csv`` with one row
    per county and ``<name>_Commute_State_Sum.csv`` with one row per state,
    where ``<name>`` is the part of *path* after ``'_data/'`` without ``.csv``.

    Two column layouts are handled, chosen by whether the column
    ``'Workers 16 years and over who did not work from home'`` exists:

    * present (``j == 1``): five totals, then for each of the four groups nine
      percentage columns followed by one mean-travel-time column;
    * absent (``j == 0``): four totals, then 36 percentage columns interleaved
      by group (total, drove alone, carpool, public transit), then the means.

    Parameters
    ----------
    path : str
        Path of the raw CSV; must contain ``'_data/'``.

    Returns
    -------
    pandas.DataFrame
        Integer table indexed by ``State``: counts are summed, mean travel
        times are averaged, missing values become 0.
    """
    not_from_home = 'Workers 16 years and over who did not work from home'

    ### read in data
    data = pd.read_csv(path)

    ### create name of file for saving
    file = path.split('_data/')[1].split('.csv')[0] + '_Commute'

    ### get rid of specific columns
    noise = re.compile(r'Margin of Error|PERCENT ALLOCATED|Unnamed:')
    data = data.drop(columns=[col for col in data.columns if noise.search(col)])

    ### clean up columns names
    data.columns = [
        col.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
        for col in data.columns
    ]

    ### remove rows that are mostly null, then decide which layout we have
    data = data.dropna(thresh=30)

    j = 1 if not_from_home in data.columns else 0  # 1: layout with the "not from home" total
    print('j is', j)

    # A row without the commuter total cannot be converted to counts
    total_col = not_from_home if j == 1 else 'Workers 16 years and over'
    data = data.dropna(subset=[total_col])

    ### keep specific commute related columns (totals first, then travel time)
    keep = ['Geographic Area Name', 'Workers 16 years and over']
    if j == 1:
        keep.append(not_from_home)
    keep += [
        'Car, truck, or van -- drove alone Workers 16 years and over',
        'Car, truck, or van -- carpooled Workers 16 years and over',
        'Public transportation (excluding taxicab) Workers 16 years and over',
    ]
    keep += [col for col in data.columns if 'TRAVEL TIME' in col]
    data = data[keep]

    ### Set county as index
    data = data.set_index('Geographic Area Name')

    ### replace N, - and ** with NaN
    data = data.replace(to_replace='N', value=np.nan).replace(to_replace='-', value=np.nan).replace('**', np.nan)
    data = data.dropna(thresh=25)

    ### make dtypes numeric
    data = data.apply(pd.to_numeric)

    ### get rid of percentages by translating them to counts BASED ON j
    if j == 0:
        # totals at 0-3; percentage columns 4-39 cycle through the four groups
        for group in range(4):
            _percent_to_count(data, group, range(4 + group, 40, 4))
        # everything after the 36 converted columns is a mean travel time
        mean_positions = set(range(40, data.shape[1]))
    else:
        # total position -> first of its nine percentage columns
        for total_pos, first_pct in ((1, 5), (2, 15), (3, 25), (4, 35)):
            _percent_to_count(data, total_pos, range(first_pct, first_pct + 9))
        mean_positions = {14, 24, 34, 44}

    ### Clean up column names some more
    fragments = (not_from_home, 'Estimate', '(excluding taxicab)')
    new_cols = {}
    for col in data.columns:
        if col == not_from_home:
            continue  # the total itself keeps its full name
        if not_from_home in col or 'Estimate' in col or 'excluding taxicab' in col:
            new_col = col
            for fragment in fragments:
                new_col = new_col.replace(fragment, '')
            new_cols[col] = new_col.strip()
    data = data.rename(columns=new_cols)

    ### save county file
    data.to_csv(f'../data/cleaned_data/{file}_County_Sum.csv')

    ### set up aggregations for group by: counts are summed, means are averaged
    aggs = {
        col: ('mean' if pos in mean_positions else 'sum')
        for pos, col in enumerate(data.columns)
    }

    ### add State column (so we can group by it)
    data['State'] = [area.split(', ')[1] for area in data.index]

    ### finally, group by state and return df
    data = data.groupby('State').agg(aggs).replace(np.nan, 0).astype(int)

    data.to_csv(f'../data/cleaned_data/{file}_State_Sum.csv')

    return data

In [None]:
df = avg_commute('../data/census_data/2010_Data.csv')
#df

In [None]:
df

The first few times I ran this, the number of people driving alone would be larger than the total number of people who didn't work from home... which doesn't make any sense. I had to investigate for a bit, but I found out that there were several rows that had NaN for total workers not working at home but some number for driving alone. I decided to remove these rows to make the data as uniform as possible and have as few errors as possible (even though some data is now unusable, I believe it's worth it).

I first checked to see if using a list instead of an index would work.

In [None]:
list(commutes_avg.columns[0:2])

But it did not.

I then checked to see where total workers (including at home workers) was null.

In [None]:
commutes.loc[pd.isna(commutes['Workers 16 years and over'])]

But silly me, that's not the column I was interested in nor the table I needed to check! I think this is why I was going in circles for a bit.

I decided to see if there was any row where this contradiction happened (more drivers than workers travelling).

In [None]:
commutes_avg.loc[commutes_avg['Car, truck, or van -- drove alone Workers 16 years and over']
            > commutes_avg['Workers 16 years and over who did not work from home']]

Nothing... that wasn't very helpful.

Since I had come up with the separate aggregations for grouping by within the function, I tried it out here to see if anything changed.

In [None]:
# Positions whose values are summed per state; every other column before
# position 45 is averaged, and anything from 45 on is left out of the mapping.
# (Positions are read off by enumeration, which matches `get_loc` as long as
# the column labels are unique.)
sum_positions = {0, 1, 12, 23, 34}

aggs = {}

for num, col in enumerate(commutes_nan.columns):
    if num in sum_positions:
        aggs[col] = np.sum
    elif num < 45:
        aggs[col] = np.mean

In [None]:
commutes_agg = commutes_nan.groupby('State').agg(aggs)
#commutes_agg

But nothing changed.

I then checked the df I brought in with the function to see if *that* did anything.

In [None]:
#df.loc[df['Estimate Car, truck, or van -- drove alone Workers 16 years and over']
#            > df['Workers 16 years and over who did not work from home']]

And of course it didn't.

Finally, I decided to replace all NaN's with 0's and then compare. Maybe this would get me somewhere...

In [None]:
commutes_zero = commutes_nan.replace(to_replace=np.nan, value = 0)
#commutes_zero

In [None]:
commutes_zero.loc[commutes_zero['Estimate Car, truck, or van -- drove alone Workers 16 years and over']
            > commutes_zero['Workers 16 years and over who did not work from home']]

#### *AHA!* There's the problem!

Many of these rows didn't have anything listed for workers who had to travel, but whenever I would try comparing them numerically, nothing showed up because they were NaN: 'Not a Number'

So with this in mind, I went to remove any row where this column was null. That was able to fix it :)

The last step for this notebook is to clean each table and save it so we can pull it into a fresh new notebook.

In [None]:
# Run the cleaning function on every yearly table from 2010 through 2022,
# skipping 2020.  A failing year is reported with its error so the cause is
# visible, and the loop moves on to the next year.  Catching `Exception`
# (rather than a bare `except:`) also lets a keyboard interrupt stop the run.
for year in range(2010, 2023):
    if year == 2020:
        continue

    print(year)
    path = f'../data/census_data/{year}_Data.csv'

    try:
        new_df = avg_commute(path)
    except Exception as err:
        print(f'Skipped ({type(err).__name__}: {err})')
    else:
        print('Completed')

Some things went wrong... It's time to take a closer look at each one.

In [None]:
df = avg_commute('../data/census_data/2010_Data.csv')

In [None]:
data = pd.read_csv('../data/census_data/2010_Data.csv')
#data

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
### get rid of specific columns
pattern1 = r'Margin of Error'
pattern2 = r'PERCENT ALLOCATED'
pattern3 = r'Unnamed:'

drop = [
    col
    for col in data.columns
    if any(re.search(unwanted, col) for unwanted in (pattern1, pattern2, pattern3))
]
data = data.drop(columns = drop)


### clean up columns names
new_cols = [
    col.replace('Estimate', '').replace('Total', '').replace('!!', ' ').replace('  ', ' ').strip()
    for col in data.columns
]
data.columns = new_cols


### keep specific commute related columns
pattern = r'TRAVEL TIME'

# columns that stay even though they are not travel-time columns
keep_cols = {
    'Geographic Area Name',
    'Workers 16 years and over',
    'Workers 16 years and over who did not work from home',
    'Car, truck, or van -- drove alone Workers 16 years and over',
    'Car, truck, or van -- carpooled Workers 16 years and over',
    'Public transportation (excluding taxicab) Workers 16 years and over',
}

drop = [
    col
    for col in data.columns
    if not re.search(pattern, col) and col not in keep_cols
]
data = data.drop(columns = drop)


### Set county as index
data = data.set_index('Geographic Area Name')


### replace N, - and ** with NaN
for marker in ('N', '-', '**'):
    data = data.replace(to_replace = marker, value = np.nan)


### remove rows that are mostly null or where the "did not work from/at home" total is null
data = data.dropna(thresh = 30)

from_home_col = 'Workers 16 years and over who did not work from home'
at_home_col = 'Workers 16 years and over who did not work at home'

if from_home_col in data.columns:
    j = 1 # this indicates that my original code will work with this data
    drop = list(data.index[data[from_home_col].isna()])
elif at_home_col in data.columns:
    j = 2 # this indicates that my original code will *almost* work with this data
    drop = list(data.index[data[at_home_col].isna()])
else:
    j = 0 # neither total exists, so no rows are removed
    drop = []

data = data.drop(index = drop)


### make dtypes numeric
data = data.apply(pd.to_numeric)

print('j is',j)

data

In [None]:
list(data.columns)

**Interesting Note:** this dataset does not have workers *who did not work from home* listed. I guess that makes sense because remote work was not nearly as common then as it is now (particularly because of the pandemic). Because of this difference, I'll have to adjust the function to account for this missing column.

In [None]:
data

In [None]:
# Convert the interleaved travel-time percentages into head-counts.
#
# Layout assumed here (0-based positions, index not counted): the four totals
# sit at 0-3 (all workers, drove alone, carpool, public transit) and the 36
# percentage columns at 4-39 repeat those four groups in the same order.
#
# Same arithmetic as the former row-by-row loop, total * (pct / 100) truncated
# to a whole number, but applied per column.  Missing totals or percentages
# stay NaN, so the bare `except:` that used to hide every error is not needed.
for group in range(4):
    totals = data.iloc[:, group]

    for pct_pos in range(4 + group, 40, 4):
        col = data.columns[pct_pos]
        data[col] = np.trunc(totals * (data.iloc[:, pct_pos] / 100))

This code helped for the most part, but 2015 and 2018 are being difficult.

2015's table name was incorrect! At least it was an easy fix.

2018 on the other hand... let's read in the file.

In [None]:
data

In [None]:
data = pd.read_csv('../data/census_data/2018_Data.csv', header = 0)

In [None]:
data_csv = avg_commute('../data/census_data/2018_Data.csv')
data_csv

In [None]:
data

For some reason, that fixed it. Not sure why, but it did.