In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read both cleaned tables and index each one by its 'county' column in a
# single step, so the cell can be re-run without a second set_index call.
age = pd.read_csv('../data/cleaned_data/age.csv').set_index('county')
comm = pd.read_csv('../data/cleaned_data/comm.csv').set_index('county')

In [None]:
age

In [None]:
comm

In [None]:
# Outer-join comm with age on the 'county' index level plus the columns both
# tables carry, keeping rows that appear in only one of the two tables.
merge_keys = ['county', 'workers', 'drove_alone', 'carpooled',
              'pub_transit', 'state', 'year']
data = comm.merge(age, how='outer', on=merge_keys)

In [None]:
data

In [None]:
# Select 'workers' plus every column whose name contains "mean".
pattern = r'mean'
keep = ['workers'] + [col for col in data.columns if re.search(pattern, col)]

In [None]:
# Average the selected columns across all rows that share an index label.
mean_cols = data[keep]
avg_comm = mean_cols.groupby(mean_cols.index).mean()
avg_comm

In [None]:
avg_comm.sort_values('mean_min', ascending = False)

In [None]:
# Rows where pub_transit_mean_min is below mean_min, smallest transit value first.
transit_below_overall = avg_comm['pub_transit_mean_min'] < avg_comm['mean_min']
avg_comm.loc[transit_below_overall].sort_values('pub_transit_mean_min')

In [None]:
comm.loc['Lapeer County, Michigan']

In [None]:
avg_comm.corr()

In [None]:
def heatmap(df, center = 0):
    """Draw the lower triangle of ``df``'s correlation matrix as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated pairwise
        (``df.corr(numeric_only=True)``).
    center : float, default 0
        Value at which the diverging colormap is centered.

    A new 6x6 inch figure is created on every call; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    corr = df.corr(numeric_only=True)

    # A correlation matrix is symmetric, so hide the diagonal and everything
    # above it; only the lower triangle is drawn.
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

    # Custom diverging colormap.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        corr,
        mask=upper_triangle,
        cmap=palette,
        center=center,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=ax,
    )

In [None]:
heatmap(avg_comm, center = 0.4)

### The mean travel time is heavily correlated with the mean travel time of those who drive alone! This may be because the vast majority of people who commute drive alone, meaning this group of people has the greatest impact on the overall average travel time.

In [None]:
# Select 'workers' plus every column whose name contains "median".
pattern2 = r'median'
keep = ['workers'] + [col for col in data.columns if re.search(pattern2, col)]

In [None]:
# Average the selected columns across all rows that share an index label.
median_cols = data[keep]
avg_age = median_cols.groupby(median_cols.index).mean()

In [None]:
avg_age.corr()

In [None]:
heatmap(avg_age, center = 0.1)

### Likewise, median age is heavily correlated with the median age of those who drive alone.

In [None]:
# Select 'workers' plus every column matching either `pattern` or `pattern2`.
keep = ['workers'] + [
    col for col in data.columns
    if re.search(pattern, col) or re.search(pattern2, col)
]

In [None]:
data_avg = data[keep]

In [None]:
data_avg

In [None]:
data_avg.corr()

In [None]:
heatmap(data_avg, center = 0.1)

### From this, it looks like there isn't a correlation between age and commute time.

Let's investigate that further.

In [None]:
age.describe().astype(int)

In [None]:
def heatmap_2(data, category, center = 0.1):
    """Heatmap of age-column vs. time-column correlations for one category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'workers' column and the ``<category>_...`` columns.
    category : str
        Column-name prefix; it is also used as the regular expression that
        selects which columns take part in the correlation.
    center : float, default 0.1
        Value at which the diverging colormap is centered.

    The rows shown run from ``<category>_age_16_19`` to
    ``<category>_age_60_and_over`` and the columns from ``<category>_10_min``
    to ``<category>_60_min`` (label slices, so both ends are included and the
    result depends on column order). Draws on the current axes; returns nothing.
    """
    matcher = re.compile(category)
    selected = ['workers'] + [name for name in data.columns if matcher.search(name)]

    full_corr = data[selected].corr()
    age_rows = slice(category + '_age_16_19', category + '_age_60_and_over')
    time_cols = slice(category + '_10_min', category + '_60_min')
    age_vs_time = full_corr.loc[age_rows, time_cols]

    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(age_vs_time, cmap=palette, center=center)

In [None]:
heatmap_2(data,'drove_alone', center = .9)

In [None]:
heatmap_2(data,'carpooled',center=0.9)

In [None]:
heatmap_2(data,'pub_transit',center=.9)

In [None]:
# Select 'workers' plus every column whose name contains "drove_alone".
drove = r'drove_alone'
keep = ['workers'] + [col for col in data.columns if re.search(drove, col)]

In [None]:
data[keep].describe().astype(int)

In [None]:
alone = data[keep]
alone

In [None]:
# Cut the age-rows x time-columns block out of the full correlation matrix
# (label slices: both end labels are included).
alone_age_rows = slice('drove_alone_age_16_19', 'drove_alone_age_60_and_over')
alone_time_cols = slice('drove_alone_10_min', 'drove_alone_60_min')
alone_corr = alone.corr().loc[alone_age_rows, alone_time_cols]
alone_corr

In [None]:
heatmap(alone, center = .5)

In [None]:
sns.heatmap(alone_corr, cmap = sns.diverging_palette(220, 10, as_cmap=True), center = .9);

In [None]:
# Select 'workers' plus every column whose name contains "carpool".
# NOTE: the name `drove` is reused here to hold the carpool pattern.
drove = r'carpool'
keep = ['workers'] + [col for col in data.columns if re.search(drove, col)]

carpool = data[keep]

In [None]:
carpool

In [None]:
carpool.corr()

In [None]:
# Cut the age-rows x time-columns block out of the full correlation matrix
# (label slices: both end labels are included).
carpool_age_rows = slice('carpooled_age_16_19', 'carpooled_age_60_and_over')
carpool_time_cols = slice('carpooled_10_min', 'carpooled_60_min')
carpool_corr = carpool.corr().loc[carpool_age_rows, carpool_time_cols]
carpool_corr

In [None]:
sns.heatmap(carpool_corr, cmap = sns.diverging_palette(220, 10, as_cmap=True), center = .885);

In [None]:
# Select 'workers' plus every column whose name contains "pub_".
pattern = r'pub_'
keep = ['workers'] + [col for col in data.columns if re.search(pattern, col)]

pub_transit = data[keep]

# Age-rows x time-columns block of the correlation matrix (inclusive label slices).
transit_age_rows = slice('pub_transit_age_16_19', 'pub_transit_age_60_and_over')
transit_time_cols = slice('pub_transit_10_min', 'pub_transit_60_min')
pub_transit_corr = pub_transit.corr().loc[transit_age_rows, transit_time_cols]

transit_palette = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(pub_transit_corr, cmap=transit_palette, center=.9);

In [None]:
pub_transit_corr

### In the end, it doesn't look like there's a clear correlation between age and commute time.

## Next, I want to see how these statistics changed over the years.

In [None]:
# Per-year mean of every numeric column, rounded to two decimals.
yearly_means = data.groupby('year').mean(numeric_only=True)
years_avg = yearly_means.round(2)
years_avg

In [None]:
# Select 'workers' plus every yearly-average column whose name contains "mean".
pattern = r'mean'
keep = ['workers'] + [col for col in years_avg.columns if re.search(pattern, col)]

In [None]:
year_times = years_avg[keep]
year_times

In [None]:
# Line chart of every column of `year_times` except 'workers', plotted against
# the frame's index, with a legend placed outside the axes on the right.
fig, ax = plt.subplots(1, figsize=(20, 12))

series_lines = []
for col in year_times.columns:
    if col == 'workers':
        continue  # 'workers' is not plotted
    # e.g. 'drove_alone_mean_min' -> 'Drove alone mean time'
    label = str.capitalize(col.replace('_', ' ').replace('min', 'time'))
    line, = ax.plot(year_times[col], marker='o', label=label, linewidth=3)
    series_lines.append(line)

# The legend should show plain lines without the 'o' marker, so build one empty
# proxy line per series in the matching colour and hand those to the legend.
# (The original passed an undefined `labels` and the string "-" as handles,
# which raised NameError before any legend was drawn.)
handles = [ax.plot([], [], ls="-", color=line.get_color())[0] for line in series_lines]
labels = [line.get_label() for line in series_lines]

ax.set_xlabel('Year', size=20)
ax.set_ylabel('Travel Time (min)', size=20)
ax.legend(handles=handles, labels=labels,
          fontsize=25, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='both', labelsize=20);
# help from: https://stackoverflow.com/questions/48391146/change-marker-in-the-legend-in-matplotlib

In [None]:
# Select 'workers' plus every yearly-average column whose name contains "median".
# The per-year table is `years_avg`; the original referenced an undefined
# name `years`, which raised NameError.
pattern2 = r'median'
keep = ['workers'] + [col for col in years_avg.columns if re.search(pattern2, col)]

year_ages = years_avg[keep]

In [None]:
# Line chart of every column of `year_ages` except 'workers', plotted against
# the frame's index, with a legend placed outside the axes on the right.
fig, ax = plt.subplots(1, figsize=(20, 12))

series_lines = []
for col in year_ages.columns:
    if col == 'workers':
        continue  # 'workers' is not plotted
    # The 'min' -> 'time' replacement only changes names that contain "min".
    label = str.capitalize(col.replace('_', ' ').replace('min', 'time'))
    line, = ax.plot(year_ages[col], marker='o', label=label, linewidth=3)
    series_lines.append(line)

# The legend should show plain lines without the 'o' marker, so build one empty
# proxy line per series in the matching colour and hand those to the legend.
# (The original passed an undefined `labels` and the string "-" as handles,
# which raised NameError before any legend was drawn.)
handles = [ax.plot([], [], ls="-", color=line.get_color())[0] for line in series_lines]
labels = [line.get_label() for line in series_lines]

ax.set_xlabel('Year', size=20)
ax.set_ylabel('Average Age (years)', size=20)
ax.legend(handles=handles, labels=labels,
          fontsize=25, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='both', labelsize=20);
# help from: https://stackoverflow.com/questions/48391146/change-marker-in-the-legend-in-matplotlib

In [None]:
# Each index label is split on ', '; the first piece is collected as the
# county name and the second as the state.
label_parts = [label.split(', ') for label in data.index]
counties = [parts[0] for parts in label_parts]
states = [parts[1] for parts in label_parts]

In [None]:
# Append 'county_name' and overwrite the existing 'state' column with the
# values parsed from the index labels.
data = data.assign(county_name=counties, state=states)

In [None]:
# 'state' is already a column of `data` (it is one of the merge keys and is
# assigned again from `states`), so `data.insert(..., 'state', ...)` raises
# "ValueError: cannot insert state, already exists". Move the existing column
# instead: pop it, then re-insert it just before the last column. Re-running
# this cell leaves the column order unchanged.
state_col = data.pop('state')
data.insert(max(len(data.columns) - 1, 0), 'state', state_col)

In [None]:
data

In [None]:
data.to_csv('../data/cleaned_data/merged.csv')