<h1><center> Data Cleaning File  <br><br> 
<font color='grey'> Cleaning Emissions and Traffic Data <br><br>


Traffic Data

In [74]:
#setting library
import requests
import os
import json
import pandas as pd
import numpy as np

In [75]:
def traffic_query(coord_list):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    B_URL = "https://gis.mwcog.org/wa/rest/services/RTDC/Traffic_Counts_Annual/MapServer/0/query?" #base url
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "AADT2010,AADT2011,AADT2012,AADT2013,AADT2014,AADT2015,AADT2016", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [76]:
def clean_traffic(geo_json, item_len):
    """
    function to extract columns of interest from traffic jsons
    takes in the json created in traffic_query() as well as the length
    of response.json()["features"]
    """
    #storing
    temp = []
    
    #grabbing cols
    for item in range(item_len):
        obs = {
                "2010": geo_json["features"][item]["attributes"]["AADT2010"],
                "2011": geo_json["features"][item]["attributes"]["AADT2011"],
                "2012": geo_json["features"][item]["attributes"]["AADT2012"],
                "2013": geo_json["features"][item]["attributes"]["AADT2013"],
                "2014": geo_json["features"][item]["attributes"]["AADT2014"],
                "2015": geo_json["features"][item]["attributes"]["AADT2015"],
                "2016": geo_json["features"][item]["attributes"]["AADT2016"]
            }
        #appending to list
        temp.append(obs)    
        
    #return 
    return(temp)
    

In [77]:
#read open capital bikeshare file
coord_df = pd.read_csv("../data/raw_data/opened_capital_bikes.csv")

In [78]:
#filter for needed columns
coord_df = coord_df.filter(["STATION_ID", "LATITUDE", "LONGITUDE", "Opening Year", "Name"])

In [79]:
#rename columns
coord_df = coord_df.rename(columns = {"STATION_ID": "cb_station", "LATITUDE": "lat", "LONGITUDE": "long", "Opening Year": "open_year", "Name": "name"})

In [93]:
#create empty dataframe for storage
traffic_df = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015'])

In [91]:
test_df = pd.DataFrame(test_clean)
test_df =  (test_df.agg(["mean"]).
         reset_index(drop = True)
         )

In [94]:
#script to get average traffic volume for all opened capital bike shares
for xy in range(len(coord_df)):
    #runs query for coordinates
    test_json = traffic_query([coord_df["long"][xy], coord_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015'])
        append.loc[len(append)] = np.nan
        append['id'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        append['open_year'] = coord_df["open_year"][xy]
        append['name'] = coord_df["name"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)

    #gets mean traffic volume for all years
    test_df = test_df.agg(["mean"]).reset_index(drop = True)

    #adds station and search coordinates
    test_df['id'] = coord_df["cb_station"][xy]
    test_df['long'] = coord_df["long"][xy]
    test_df['lat'] = coord_df["lat"][xy]
    test_df['open_year'] = coord_df["open_year"][xy]
    test_df['name'] = coord_df["name"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015', '2016']]
    #add to storage dataframe
    traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
    

  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)


In [97]:
#saving traffic data for opened capital bikes
traffic_df.to_csv("opened_cb_traffic.csv", index = False)

In [108]:
#reading unopened capital bikeshare files (proposed bikeshares)
unopened_df = pd.read_excel("../data/raw_data/unopened_capital_bikes_proposed.xlsx")

In [109]:
#filter for needed columns
unopened_df = unopened_df.filter(["FID", "x", "y", "ClosestInt"])

In [111]:
#rename columns
unopened_df = unopened_df.rename(columns = {"FID": "cb_station", "y": "lat", "x": "long", "ClosestInt": "name"})

In [113]:
#create empty dataframe for storage
unopened_traffic_df = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015'])

In [114]:
#script to get average traffic volume for all unopened capital bike shares
for xy in range(len(unopened_df)):
    #runs query for coordinates
    test_json = traffic_query([unopened_df["long"][xy], unopened_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015'])
        append.loc[len(append)] = np.nan
        append['id'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        append['name'] = unopened_df["name"][xy]
        #append it to traffic_df
        unopened_traffic_df = pd.concat([unopened_traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)

    #gets mean traffic volume for all years
    test_df = test_df.agg(["mean"]).reset_index(drop = True)

    #adds station and search coordinates
    test_df['id'] = unopened_df["cb_station"][xy]
    test_df['long'] = unopened_df["long"][xy]
    test_df['lat'] = unopened_df["lat"][xy]
    test_df['open_year'] = np.nan
    test_df['name'] = unopened_df["name"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['id', 'long', 'lat', 'open_year', 'name','2010', '2011', '2012', '2013', '2014', '2015', '2016']]
    #add to storage dataframe
    unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)
    

  unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)


In [116]:
unopened_traffic_df.describe()

Unnamed: 0,long,lat,2010,2011,2012,2013,2014,2015,2016
count,184.0,184.0,160.0,160.0,160.0,160.0,164.0,165.0,170.0
mean,-77.018976,38.915297,20531.187011,20415.163689,20535.039831,20592.764541,20101.722714,19087.794349,19017.483782
std,0.043765,0.035673,22079.293989,22209.478156,21965.549927,24932.146479,24641.086544,23694.170277,24022.633048
min,-77.109057,38.819474,1500.0,1417.0,1404.0,1410.0,1823.0,1883.0,1933.0
25%,-77.051052,38.893453,9364.125,9236.9375,9132.375,9563.375,8899.875,8615.333333,8920.833333
50%,-77.022431,38.915921,15481.416667,15622.916667,16122.964286,15826.916667,14956.683333,14074.0,13765.0
75%,-76.987472,38.938926,21492.90625,22581.166667,23746.375,23023.25,21767.9375,20161.0,19732.975
max,-76.919072,38.991081,175418.0,177134.0,175205.0,226169.0,226108.0,234467.0,241377.0


In [None]:
#saving traffic data for unopened capital bikes
unopened_traffic_df.to_csv("unopened_cb_traffic.csv")

Crime Data

In [None]:
def crime_query(coord_list, year):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    and the year to indicate which API to utilize
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    if year == 2019:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/10/query?" #base url for 2019
    elif year == 2013:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/1/query?" #base url for 2013
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "CCN,OFFENSE,WARD,METHOD,SHIFT,LONGITUDE,LATITUDE", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [None]:
def clean_crime(geo_json, item_len, year):
    """
    function to extract columns of interest from crime jsons
    takes in the json created in crime_query() as well as the length
    of response.json()["features"]. also a int indicating year of crime api
    """
    #storing
    temp = []
    
    for item in range(item_len):
        obs = {
                "id": geo_json["features"][item]["attributes"]["CCN"],
                "x": geo_json["features"][item]["attributes"]["LONGITUDE"],
                "y": geo_json["features"][item]["attributes"]["LATITUDE"],
                "ward": geo_json["features"][item]["attributes"]["WARD"],
                "method": geo_json["features"][item]["attributes"]["METHOD"],
                "shift": geo_json["features"][item]["attributes"]["SHIFT"],
                "offense": geo_json["features"][item]["attributes"]["OFFENSE"],
                "year": year
            }
        temp.append(obs)    
        
    #return 
    return(temp)

In [None]:
#create empty dataframe for storage
crime_df = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])

In [None]:
#script to get average crime for all opened capital bike shares
for xy in range(len(coord_df)):
    #runs query for coordinates
    test13_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2013)
    test19_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2019)
    
    #adds if statement in case query fails or returns no coordinates
    if test13_json == "Query Failed" or test19_json == "Query Failed" or len(test13_json["features"]) == 0 or len(test19_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test13_clean = clean_crime(test13_json, len(test13_json["features"]), 2013)
    test19_clean = clean_crime(test19_json, len(test19_json["features"]), 2019)
    
    #converts to pandas
    test13_df = pd.DataFrame(test13_clean)
    test19_df = pd.DataFrame(test19_clean)
    
    #compute crime count
    test13_df = test13_df.assign(crime_sum = len(test13_df)).filter(['crime_sum']).drop_duplicates()
    test13_df['station'] = coord_df["cb_station"][xy]
    test19_df = test19_df.assign(crime_sum = len(test19_df)).filter(['crime_sum']).drop_duplicates()
    test19_df['station'] = coord_df["cb_station"][xy]

    #rename cols
    test13_df = test13_df.rename(columns={col: col + '_13' for col in test13_df.columns if col != 'station'})
    test19_df = test19_df.rename(columns={col: col + '_19' for col in test19_df.columns if col != 'station'})
    
    #merge df
    merged_df = pd.merge(test13_df, test19_df, on='station', how='outer')
    
    #filters for where there is data for both 2013 and 2019
    merged_df = merged_df[(~merged_df['crime_sum_13'].isna()) & (~merged_df['crime_sum_19'].isna())]
    
    #adds if-else statement 
    if len(merged_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
    elif len(merged_df) == 1: #if there is only one row and we can't compute SEM
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
        merged_df['change_CRIME_sem'] = np.nan
    else:
        #enough rows to compute both mean change in crime volume & standard error
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    merged_df['station'] = coord_df["cb_station"][xy]
    merged_df['long'] = coord_df["long"][xy]
    merged_df['lat'] = coord_df["lat"][xy]
    #reorder cols to match storage dataframe
    merged_df = merged_df[['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat']]
    #add to storage dataframe
    crime_df = pd.concat([crime_df, merged_df], ignore_index = True)
    

In [None]:
#saving crime data for opened capital bikes
#crime_df.to_csv("opened_cb_crime.csv")

In [None]:
#create empty dataframe for storage
unopened_crime_df = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])

In [None]:
#script to get average crime for all unopened capital bike shares
for xy in range(len(unopened_df)):
    #runs query for coordinates
    test13_json = crime_query([unopened_df["long"][xy], unopened_df["lat"][xy]], 2013)
    test19_json = crime_query([unopened_df["long"][xy], unopened_df["lat"][xy]], 2019)
    
    #adds if statement in case query fails or returns no coordinates
    if test13_json == "Query Failed" or test19_json == "Query Failed" or len(test13_json["features"]) == 0 or len(test19_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to crime_df
        unopened_crime_df = pd.concat([unopened_crime_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test13_clean = clean_crime(test13_json, len(test13_json["features"]), 2013)
    test19_clean = clean_crime(test19_json, len(test19_json["features"]), 2019)
    
    #converts to pandas
    test13_df = pd.DataFrame(test13_clean)
    test19_df = pd.DataFrame(test19_clean)
    
    #compute crime count
    test13_df = test13_df.assign(crime_sum = len(test13_df)).filter(['crime_sum']).drop_duplicates()
    test13_df['station'] = coord_df["cb_station"][xy]
    test19_df = test19_df.assign(crime_sum = len(test19_df)).filter(['crime_sum']).drop_duplicates()
    test19_df['station'] = coord_df["cb_station"][xy]

    #rename cols
    test13_df = test13_df.rename(columns={col: col + '_13' for col in test13_df.columns if col != 'station'})
    test19_df = test19_df.rename(columns={col: col + '_19' for col in test19_df.columns if col != 'station'})
    
    #merge df
    merged_df = pd.merge(test13_df, test19_df, on='station', how='outer')
    
    #filters for where there is data for both 2013 and 2019
    merged_df = merged_df[(~merged_df['crime_sum_13'].isna()) & (~merged_df['crime_sum_19'].isna())]
    
    #adds if-else statement 
    if len(merged_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to crime_df
        unopened_crime_df = pd.concat([unopened_crime_df, append], ignore_index = True)
        continue
    elif len(merged_df) == 1: #if there is only one row and we can't compute SEM
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
        merged_df['change_CRIME_sem'] = np.nan
    else:
        #enough rows to compute both mean change in crime volume & standard error
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    merged_df['station'] = unopened_df["cb_station"][xy]
    merged_df['long'] = unopened_df["long"][xy]
    merged_df['lat'] = unopened_df["lat"][xy]
    #reorder cols to match storage dataframe
    merged_df = merged_df[['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat']]
    #add to storage dataframe
    unopened_crime_df = pd.concat([unopened_crime_df, merged_df], ignore_index = True)
    

In [None]:
#saving crime data for unopened capital bikes
#unopened_crime_df.to_csv("unopened_cb_crime.csv")