In [1]:
import pandas as pd
from uszipcode import SearchEngine

search = SearchEngine(SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)



In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

# Combine & Validify data

In [3]:
citystate = {
    'Chicago': 'IL',
    'Denver': 'CO',
    'Des Moines': 'IA',
}

In [6]:
categories = ['bike_rental', 'parking', 'public_transport', 'schools', 'shop']
dfs = dict()

for category in categories:
    dfs[category] = pd.read_csv(f'./data/raw/{category}.csv')

In [7]:
for cat in categories:
    if 'Unnamed: 0' in dfs[cat].columns:
        dfs[cat].drop(columns=['Unnamed: 0'], inplace=True)
    dfs[cat] = dfs[cat].dropna().astype({'Zipcode': int})

In [8]:
def is_correct_city(row):
    if row['City'] not in citystate.keys():
        return False
    return search.by_zipcode(row['Zipcode']).state == citystate[row['City']]

In [9]:
for cat in categories:
    print(f'{cat}')
    dfs[cat]['is_valid'] = dfs[cat].progress_apply(is_correct_city, axis=1)

bike_rental


  0%|          | 0/1752 [00:00<?, ?it/s]

parking


  0%|          | 0/312 [00:00<?, ?it/s]

public_transport


  0%|          | 0/36790 [00:00<?, ?it/s]

schools


  0%|          | 0/1094 [00:00<?, ?it/s]

shop


  0%|          | 0/9896 [00:00<?, ?it/s]

In [10]:
for cat in categories:
    print(cat)
    display(dfs[cat].groupby('is_valid').count())

bike_rental


Unnamed: 0_level_0,City,Lat,Lon,Zipcode
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
True,1752,1752,1752,1752


parking


Unnamed: 0_level_0,City,Lat,Lon,Zipcode
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,12,12,12,12
True,300,300,300,300


public_transport


Unnamed: 0_level_0,City,Lat,Lon,Zipcode
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,366,366,366,366
True,36424,36424,36424,36424


schools


Unnamed: 0_level_0,City,Lat,Lon,Zipcode
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,14,14,14,14
True,1080,1080,1080,1080


shop


Unnamed: 0_level_0,City,Lat,Lon,Zipcode
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,132,132,132,132
True,9764,9764,9764,9764


In [11]:
for cat in categories:
    print(cat)
    dfs[cat].drop(dfs[cat][dfs[cat]['is_valid'] == False].index, inplace=True)
    dfs[cat].drop(columns=['is_valid'], inplace=True)

bike_rental
parking
public_transport
schools
shop


In [25]:
main_df = pd.DataFrame()
for cat in sorted(categories, key=lambda x: len(dfs[x].groupby('Zipcode')), reverse=True):
    main_df[cat] = dfs[cat].groupby('Zipcode').size()

In [26]:
main_df.fillna(0, inplace=True)
main_df = main_df.astype(int)
main_df.head()

Unnamed: 0_level_0,public_transport,shop,schools,bike_rental,parking
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
50309,196,100,0,24,2
50310,6,86,0,0,0
50311,46,34,0,8,0
50312,52,44,0,2,2
50313,46,10,0,0,0


In [30]:
len(main_df), len(main_df[main_df['bike_rental'] > 0]), len(dfs['bike_rental'].groupby('Zipcode'))

(115, 72, 73)

In [28]:
main_df.describe()

Unnamed: 0,public_transport,shop,schools,bike_rental,parking
count,115.0,115.0,115.0,115.0,115.0
mean,316.730435,84.452174,9.391304,15.217391,2.591304
std,354.882272,102.048857,11.425346,18.595785,5.964656
min,2.0,0.0,0.0,0.0,0.0
25%,49.0,21.0,0.0,0.0,0.0
50%,194.0,58.0,4.0,8.0,0.0
75%,495.0,99.0,14.0,25.0,2.0
max,1650.0,594.0,46.0,68.0,44.0


# Gather population density

In [38]:
def get_pop(zipcode):
    return search.by_zipcode(zipcode).population_density

In [39]:
main_df['population_density'] = main_df.index.to_series().apply(get_pop)

In [40]:
main_df.head()

Unnamed: 0_level_0,public_transport,shop,schools,bike_rental,parking,population_density
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
50309,196,100,0,24,2,1914.0
50310,6,86,0,0,0,3752.0
50311,46,34,0,8,0,6011.0
50312,52,44,0,2,2,2701.0
50313,46,10,0,0,0,963.0


# Gather cities

In [45]:
citystate = {
    'Chicago': 'IL',
    'Denver': 'CO',
    'Des Moines': 'IA',
}
statecity = {v: k for k, v in citystate.items()}

In [46]:
def get_city(zipcode):
    return statecity[search.by_zipcode(zipcode).state_abbr]

In [49]:
main_df['city'] = main_df.index.to_series().apply(get_city)

# Export data

In [51]:
main_df.to_csv('./data/processed/main.csv')

In [42]:
for cat in categories:
    dfs[cat].to_csv(f'./data/processed/{cat}.csv', index=False)