<h1><center> Data Cleaning File  <br><br> 
<font color='grey'> Cleaning Emissions and Traffic Data <br><br>


Traffic Data

In [34]:
#setting library
import requests
import os
import json
import pandas as pd
import numpy as np

In [41]:
def traffic_query(coord_list):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    B_URL = "https://gis.mwcog.org/wa/rest/services/RTDC/Traffic_Counts_Annual/MapServer/0/query?" #base url
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "STATION,COUNTY,AADT2013,AADT2016, AADT2019,XCOORD,YCOORD", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [42]:
def clean_traffic(geo_json, item_len):
    """
    function to extract columns of interest from traffic jsons
    takes in the json created in traffic_query() as well as the length
    of response.json()["features"]
    """
    #storing
    temp = []
    
    for item in range(item_len):
        obs = {
                "id": geo_json["features"][item]["attributes"]["STATION"],
                "x": geo_json["features"][item]["attributes"]["XCOORD"],
                "y": geo_json["features"][item]["attributes"]["YCOORD"],
                "county": geo_json["features"][item]["attributes"]["COUNTY"],
                "2013": geo_json["features"][item]["attributes"]["AADT2013"],
                "2016": geo_json["features"][item]["attributes"]["AADT2016"],
                "2019": geo_json["features"][item]["attributes"]["AADT2019"]
            }
        temp.append(obs)    
        
    #return 
    return(temp)
    

In [43]:
#read open capital bikeshare file
coord_df = pd.read_csv("../data/raw_data/opened_capital_bikes.csv")

In [44]:
#filter for needed columns
coord_df = coord_df.filter(["STATION_ID", "LATITUDE", "LONGITUDE"])

In [45]:
#rename columns
coord_df = coord_df.rename(columns = {"STATION_ID": "cb_station", "LATITUDE": "lat", "LONGITUDE": "long"})

In [46]:
#create empty dataframe for storage
traffic_df = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])

In [6]:
#script to get average traffic volume for all opened capital bike shares
for xy in range(len(coord_df)):
    #runs query for coordinates
    test_json = traffic_query([coord_df["long"][xy], coord_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)
    test_df = test_df[(~test_df['2013'].isna()) & (~test_df['2019'].isna())]
    
    #adds if-else statement 
    if len(test_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
    elif len(test_df) == 1: #if there is only one row and we can't compute SEM
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
        test_df['change_AADT_sem'] = np.nan
    else:
        #enough rows to compute both mean change in traffic volume & standard error
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    test_df['station'] = coord_df["cb_station"][xy]
    test_df['long'] = coord_df["long"][xy]
    test_df['lat'] = coord_df["lat"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat']]
    #add to storage dataframe
    traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
    

  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)


In [16]:
#saving traffic data for opened capital bikes
#traffic_df.to_csv("opened_cb_traffic.csv")

In [35]:
#reading unopened capital bikeshare files (proposed bikeshares)
unopened_df = pd.read_excel("../data/raw_data/unopened_capital_bikes_proposed.xlsx")

In [36]:
#filter for needed columns
unopened_df = unopened_df.filter(["FID", "x", "y"])

In [37]:
#rename columns
unopened_df = unopened_df.rename(columns = {"FID": "cb_station", "y": "lat", "x": "long"})

In [47]:
#create empty dataframe for storage
unopened_traffic_df = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])

In [48]:
#script to get average traffic volume for all unopened capital bike shares
for xy in range(len(unopened_df)):
    #runs query for coordinates
    test_json = traffic_query([unopened_df["long"][xy], unopened_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to traffic_df
        unopened_traffic_df = pd.concat([unopened_traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)
    test_df = test_df[(~test_df['2013'].isna()) & (~test_df['2019'].isna())]
    
    #adds if-else statement 
    if len(test_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([unopened_traffic_df, append], ignore_index = True)
        continue
    elif len(test_df) == 1: #if there is only one row and we can't compute SEM
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
        test_df['change_AADT_sem'] = np.nan
    else:
        #enough rows to compute both mean change in traffic volume & standard error
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    test_df['station'] = unopened_df["cb_station"][xy]
    test_df['long'] = unopened_df["long"][xy]
    test_df['lat'] = unopened_df["lat"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat']]
    #add to storage dataframe
    unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)
    

  unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)
  unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)


In [50]:
#saving traffic data for unopened capital bikes
#unopened_traffic_df.to_csv("unopened_cb_traffic.csv")

Crime Data

In [30]:
def crime_query(coord_list, year):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    and the year to indicate which API to utilize
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    if year == 2019:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/10/query?" #base url for 2019
    elif year == 2013:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/1/query?" #base url for 2013
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "CCN,OFFENSE,WARD,METHOD,SHIFT,LONGITUDE,LATITUDE", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [31]:
def clean_crime(geo_json, item_len, year):
    """
    function to extract columns of interest from crime jsons
    takes in the json created in crime_query() as well as the length
    of response.json()["features"]. also a int indicating year of crime api
    """
    #storing
    temp = []
    
    for item in range(item_len):
        obs = {
                "id": geo_json["features"][item]["attributes"]["CCN"],
                "x": geo_json["features"][item]["attributes"]["LONGITUDE"],
                "y": geo_json["features"][item]["attributes"]["LATITUDE"],
                "ward": geo_json["features"][item]["attributes"]["WARD"],
                "method": geo_json["features"][item]["attributes"]["METHOD"],
                "shift": geo_json["features"][item]["attributes"]["SHIFT"],
                "offense": geo_json["features"][item]["attributes"]["OFFENSE"],
                "year": year
            }
        temp.append(obs)    
        
    #return 
    return(temp)

In [32]:
#create empty dataframe for storage
crime_df = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])

In [33]:
#script to get average crime for all opened capital bike shares
for xy in range(len(coord_df)):
    #runs query for coordinates
    test13_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2013)
    test19_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2019)
    
    #adds if statement in case query fails or returns no coordinates
    if test13_json == "Query Failed" or test19_json == "Query Failed" or len(test13_json["features"]) == 0 or len(test19_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test13_clean = clean_crime(test13_json, len(test13_json["features"]), 2013)
    test19_clean = clean_crime(test19_json, len(test19_json["features"]), 2019)
    
    #converts to pandas
    test13_df = pd.DataFrame(test13_clean)
    test19_df = pd.DataFrame(test19_clean)
    
    #compute crime count
    test13_df = test13_df.assign(crime_sum = len(test13_df)).filter(['crime_sum']).drop_duplicates()
    test13_df['station'] = coord_df["cb_station"][xy]
    test19_df = test19_df.assign(crime_sum = len(test19_df)).filter(['crime_sum']).drop_duplicates()
    test19_df['station'] = coord_df["cb_station"][xy]

    #rename cols
    test13_df = test13_df.rename(columns={col: col + '_13' for col in test13_df.columns if col != 'station'})
    test19_df = test19_df.rename(columns={col: col + '_19' for col in test19_df.columns if col != 'station'})
    
    #merge df
    merged_df = pd.merge(test13_df, test19_df, on='station', how='outer')
    
    #filters for where there is data for both 2013 and 2019
    merged_df = merged_df[(~merged_df['crime_sum_13'].isna()) & (~merged_df['crime_sum_19'].isna())]
    
    #adds if-else statement 
    if len(merged_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
    elif len(merged_df) == 1: #if there is only one row and we can't compute SEM
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
        merged_df['change_CRIME_sem'] = np.nan
    else:
        #enough rows to compute both mean change in crime volume & standard error
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    merged_df['station'] = coord_df["cb_station"][xy]
    merged_df['long'] = coord_df["long"][xy]
    merged_df['lat'] = coord_df["lat"][xy]
    #reorder cols to match storage dataframe
    merged_df = merged_df[['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat']]
    #add to storage dataframe
    crime_df = pd.concat([crime_df, merged_df], ignore_index = True)
    

  crime_df = pd.concat([crime_df, append], ignore_index = True)
  crime_df = pd.concat([crime_df, merged_df], ignore_index = True)


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [15]:
#saving crime data for opened capital bikes
#crime_df.to_csv("opened_cb_crime.csv")

Unnamed: 0,station,change_CRIME_mean,change_CRIME_sem,long,lat
0,83fa4699-b2ee-4f14-af3b-167f34a62fb4,,,-77.221807,38.904415
1,e07eed7c-2d2a-4f13-aff7-2be6500958f7,4.0,,-77.095513,38.916561
2,08249ef2-1f3f-11e7-bf6b-3863bb334450,205.0,,-77.041779,38.905067
3,082544b7-1f3f-11e7-bf6b-3863bb334450,-2.0,,-77.077078,38.943837
4,082524a2-1f3f-11e7-bf6b-3863bb334450,,,-77.086063,38.893237
...,...,...,...,...,...
388,c778c501-ba47-479b-b652-831432ffd74f,-75.0,,-77.005497,38.824481
389,08249cd3-1f3f-11e7-bf6b-3863bb334450,-7.0,,-77.046615,38.896114
390,0825b983-1f3f-11e7-bf6b-3863bb334450,-11.0,,-77.015360,38.907333
391,08251bba-1f3f-11e7-bf6b-3863bb334450,,,-77.107735,38.876393


In [38]:
#create empty dataframe for storage
unopened_crime_df = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])

In [39]:
#script to get average crime for all unopened capital bike shares
for xy in range(len(unopened_df)):
    #runs query for coordinates
    test13_json = crime_query([unopened_df["long"][xy], unopened_df["lat"][xy]], 2013)
    test19_json = crime_query([unopened_df["long"][xy], unopened_df["lat"][xy]], 2019)
    
    #adds if statement in case query fails or returns no coordinates
    if test13_json == "Query Failed" or test19_json == "Query Failed" or len(test13_json["features"]) == 0 or len(test19_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to crime_df
        unopened_crime_df = pd.concat([unopened_crime_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test13_clean = clean_crime(test13_json, len(test13_json["features"]), 2013)
    test19_clean = clean_crime(test19_json, len(test19_json["features"]), 2019)
    
    #converts to pandas
    test13_df = pd.DataFrame(test13_clean)
    test19_df = pd.DataFrame(test19_clean)
    
    #compute crime count
    test13_df = test13_df.assign(crime_sum = len(test13_df)).filter(['crime_sum']).drop_duplicates()
    test13_df['station'] = coord_df["cb_station"][xy]
    test19_df = test19_df.assign(crime_sum = len(test19_df)).filter(['crime_sum']).drop_duplicates()
    test19_df['station'] = coord_df["cb_station"][xy]

    #rename cols
    test13_df = test13_df.rename(columns={col: col + '_13' for col in test13_df.columns if col != 'station'})
    test19_df = test19_df.rename(columns={col: col + '_19' for col in test19_df.columns if col != 'station'})
    
    #merge df
    merged_df = pd.merge(test13_df, test19_df, on='station', how='outer')
    
    #filters for where there is data for both 2013 and 2019
    merged_df = merged_df[(~merged_df['crime_sum_13'].isna()) & (~merged_df['crime_sum_19'].isna())]
    
    #adds if-else statement 
    if len(merged_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        #append it to crime_df
        unopened_crime_df = pd.concat([unopened_crime_df, append], ignore_index = True)
        continue
    elif len(merged_df) == 1: #if there is only one row and we can't compute SEM
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
        merged_df['change_CRIME_sem'] = np.nan
    else:
        #enough rows to compute both mean change in crime volume & standard error
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    merged_df['station'] = unopened_df["cb_station"][xy]
    merged_df['long'] = unopened_df["long"][xy]
    merged_df['lat'] = unopened_df["lat"][xy]
    #reorder cols to match storage dataframe
    merged_df = merged_df[['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat']]
    #add to storage dataframe
    unopened_crime_df = pd.concat([unopened_crime_df, merged_df], ignore_index = True)
    

  unopened_crime_df = pd.concat([unopened_crime_df, merged_df], ignore_index = True)


In [None]:
#saving crime data for unopened capital bikes
#unopened_crime_df.to_csv("unopened_cb_crime.csv")