Parse all the data

In [29]:
import os
import pandas as pd
from pprint import pprint
import numpy as np

In [30]:
# load in all the data in Datasets directory
folder_name = "Datasets"
csv_files = [f for f in os.listdir(folder_name) if f.endswith(".csv")]
dfs = {}
for file in csv_files:
    dfs[file.replace(".csv", "")] = pd.read_csv(os.path.join(folder_name, file))

In [31]:
# get the dataset by index
def get_i_dataset(i):
    return dfs[list(dfs.keys())[i]]


# print first 5 rows of the dataset
def print_dataset(i):
    print(list(dfs.keys())[i])
    print(dfs[list(dfs.keys())[i]].head())

'''
convert latitude and longitilde data to np array of location pairs  
'''
def convert_to_numpy(df):
    df[["Latitude", "Longitude"]] = df[["Latitude", "Longitude"]].apply(pd.to_numeric, errors='coerce')
    df = df.dropna()
    np_array = df[["Latitude", "Longitude"]].to_numpy()
    assert np_array.shape[1] == 2
    assert np_array.dtype == np.float64
    return np_array, df


print(dfs.keys())

dict_keys(['arrests', 'bike_stations', 'crash_data', 'firearm_seizure_data', 'housing', 'neighborhood_area', 'parking_meter_locations', 'park_and_ride_locations', 'PGHSNAP'])


In [32]:
dfs['neighborhood_area'].head()

Unnamed: 0,Latitude,Longitude,Neighborhood,acres,sqmiles
0,40.452387,-79.907319,Point Breeze North,193.229239,0.301921
1,40.44273,-79.943582,Squirrel Hill North,782.981547,1.223409
2,40.467242,-79.94337,Garfield,292.726125,0.457385
3,40.451237,-79.97458,Bedford Dwellings,112.431551,0.175674
4,40.410936,-79.993174,Knoxville,191.759777,0.299625


In [33]:
NUM_OF_NEIGHBORHOODS = 90


def drop_na_check(df):
    df = df.dropna()
    assert len(df) == NUM_OF_NEIGHBORHOODS
    return df


def print_na(df):
    # print row containing NaN
    print(df[df.isna().any(axis=1)])

In [34]:
# construct new dataset with all data
df_output = dfs["neighborhood_area"][["Neighborhood", "Latitude", "Longitude"]]
df_output = drop_na_check(df_output)
df_output.sort_values(by=["Neighborhood"], inplace=True)
df_output = df_output.reset_index(drop=True)
neighborhood_loc, _ = convert_to_numpy(df_output)

In [35]:
def nearst_neighborhood(node):
    dist_2 = np.sum((neighborhood_loc - node) ** 2, axis=1)
    return np.argmin(dist_2)


def append_to_df_output(df, column_name, new_column_name):
    location, df = convert_to_numpy(df)
    neighborhoods = np.zeros(NUM_OF_NEIGHBORHOODS)
    for i, loc in enumerate(location):
        if not column_name:
            neighborhoods[nearst_neighborhood(loc)] += 1
        else:
            neighborhoods[nearst_neighborhood(loc)] += df.at[i, column_name]

    df_output[new_column_name] = neighborhoods

In [36]:
df_output

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Allegheny Center,40.451577,-80.005340
1,Allegheny West,40.450784,-80.014403
2,Allentown,40.419192,-79.992731
3,Arlington,40.413833,-79.963270
4,Arlington Heights,40.416880,-79.961521
...,...,...,...
85,Upper Lawrenceville,40.481811,-79.947774
86,West End,40.440883,-80.035707
87,West Oakland,40.441244,-79.962611
88,Westwood,40.432010,-80.053037


In [37]:
# adding arrests data
df = dfs['arrests']['Neighborhood'].sort_values()
# squash all row with the same name and make a new column "count"
df = df.groupby(df).size().reset_index(name='arrests_count')
df_temp = pd.merge(df_output, df, on='Neighborhood', how='left')
mt_oliver_count = df[df['Neighborhood'] == 'Mt. Oliver Neighborhood']['arrests_count'].sum()
mt_oliver_count += df[df['Neighborhood'] == 'Mount Oliver']['arrests_count'].sum()
df_temp.at[df_temp.index[df_temp['Neighborhood'] == 'Mt. Oliver'][0], 'arrests_count'] = mt_oliver_count
df_output = drop_na_check(df_temp)

In [38]:
# bike_stations
df = dfs['bike_stations']

In [39]:
# bike_stations
append_to_df_output(df, 'Total Docks', 'bike_station_count')

In [40]:
df_output

Unnamed: 0,Neighborhood,Latitude,Longitude,arrests_count,bike_station_count
0,Allegheny Center,40.451577,-80.005340,1242.0,42.0
1,Allegheny West,40.450784,-80.014403,129.0,38.0
2,Allentown,40.419192,-79.992731,945.0,0.0
3,Arlington,40.413833,-79.963270,309.0,0.0
4,Arlington Heights,40.416880,-79.961521,154.0,23.0
...,...,...,...,...,...
85,Upper Lawrenceville,40.481811,-79.947774,189.0,19.0
86,West End,40.440883,-80.035707,289.0,0.0
87,West Oakland,40.441244,-79.962611,324.0,41.0
88,Westwood,40.432010,-80.053037,226.0,0.0


In [41]:
# crash_data
df = dfs['crash_data'].rename(columns={'DEC_LAT': 'Latitude', 'DEC_LONG': 'Longitude'})
append_to_df_output(df, '', 'crash_count')

In [42]:
df_output

Unnamed: 0,Neighborhood,Latitude,Longitude,arrests_count,bike_station_count,crash_count
0,Allegheny Center,40.451577,-80.005340,1242.0,42.0,0.0
1,Allegheny West,40.450784,-80.014403,129.0,38.0,0.0
2,Allentown,40.419192,-79.992731,945.0,0.0,0.0
3,Arlington,40.413833,-79.963270,309.0,0.0,0.0
4,Arlington Heights,40.416880,-79.961521,154.0,23.0,0.0
...,...,...,...,...,...,...
85,Upper Lawrenceville,40.481811,-79.947774,189.0,19.0,0.0
86,West End,40.440883,-80.035707,289.0,0.0,0.0
87,West Oakland,40.441244,-79.962611,324.0,41.0,0.0
88,Westwood,40.432010,-80.053037,226.0,0.0,0.0


In [43]:
print_dataset(3)

firearm_seizure_data
           Neighborhood   Latitude  Longitude
0  Mount Oliver Borough  40.417619 -79.986436
1              Sheraden  40.456904 -80.052328
2       Terrace Village  40.441185 -79.968527
3      Brighton Heights  40.479530 -80.033477
4                   NaN        NaN        NaN


In [44]:
# firearm seizure
df = dfs['firearm_seizure_data']['Neighborhood']
df = df.groupby(df).size().reset_index(name='firearm_seizure_count')

df_temp = pd.merge(df_output, df, on='Neighborhood', how='left')
df_temp.fillna(0, inplace=True)
df_output = drop_na_check(df_temp)

In [45]:
# print_dataset(4)
print(get_i_dataset(4).columns)

Index(['Neighborhood_2010_INTPTLAT10', 'Neighborhood_2010_INTPTLON10',
       'Neighborhood_2010_HOOD', 'Neighborhood_2010_SQMILES',
       'SNAP_All_csv_Median_Home_Value_', 'Med__Val____00_in__10_Dollars_',
       'SNAP_All_csv_Median_Home__Value', 'SNAP_All_csv_Median_Sale_Price_',
       'SNAP_All_csv_2009_Median_Income', 'F2009_Med__Income___13_Dollars_',
       'Est__Percent_Under_Poverty__201', 'SNAP_All_csv__Part_1__Major_Cri',
       'SNAP_All_csv_Part_1_Crime_per_1', 'SNAP_All_csv_Part_2_Crime_per_1',
       'SNAP_All_csv__Murder__2010_', 'SNAP_All_csv__Rape__2010_',
       'SNAP_All_csv__Robbery__2010_', 'F_Agr__Assault__2010_',
       'SNAP_All_csv__Burglary__2010_', 'SNAP_All_csv__Auto_Theft__2010_',
       'SNAP_All_csv__Drug_Violations__', 'SNAP_All_csv___Good___Excellent',
       'SNAP_All_csv___Average_Conditio', 'SNAP_All_csv___Poor___Derelict_',
       'SNAP_All_csv_Landslide_Prone___', 'SNAP_All_csv_Flood_Plain____lan',
       'SNAP_All_csv_Park_Space____of_l', 'SNA

In [46]:
# housing
df = dfs['housing'].rename(columns={'Neighborhood_2010_INTPTLAT10': 'Latitude', 'Neighborhood_2010_INTPTLON10': 'Longitude'})
df = df[['Neighborhood_2010_HOOD', 'SNAP_All_csv_Median_Sale_Price_', 'SNAP_All_csv_2009_Median_Income', 'Est__Percent_Under_Poverty__201', 'SNAP_All_csv__Part_1__Major_Cri', 'SNAP_All_csv_Park_Space____of_l']]
df.columns = ['Neighborhood', 'Median_Sale_Price', 'Median_Income', 'Under_Poverty %', 'Major Crime Count', 'Park Space %']

df_temp = pd.merge(df_output, df, on='Neighborhood', how='left')
df_temp.fillna(0, inplace=True)
df_output = drop_na_check(df_temp)

In [49]:
df_output
df_output.to_csv(folder_name+'/post_processed.csv', index=False)

In [None]:
print_dataset(5)

In [None]:
print_dataset(6)

In [None]:
# parking rate
# latitude, longitude, rate, max_hours (omit for now)
parking_loc, df = convert_to_numpy(get_i_dataset(6))
parking_rate = df["rate"].to_numpy()
assert len(parking_rate) == parking_loc.shape[0]

In [None]:
print_dataset(7)

In [None]:
# park and ride location
# latitude, longitude
park_and_ride_loc, _ = convert_to_numpy(get_i_dataset(7))

In [None]:
# print_dataset(8)
get_i_dataset(8).columns

In [None]:
# PGHSNAP
df = get_i_dataset(8).dropna()
PGHSNAP_neighborhood_name = df["Neighborhood"].to_numpy()
PGHSNAP_population = df["Population (2010)"].to_numpy()
assert PGHSNAP_population.dtype == np.int64
PGHSNAP_street_density = df["Street Density (st. mi/area sq. mi)"].to_numpy()
assert PGHSNAP_street_density.dtype == np.float64
PGHSNAP_working_population = df["Total Working Pop. (Age 16+) (2010)"].to_numpy()
assert PGHSNAP_working_population.dtype == np.int64
PGHSNAP_work_at_home_percentage = df["Work at Home (2010)"].to_numpy()
assert PGHSNAP_work_at_home_percentage.dtype == np.float64