<h1><center> Data Cleaning File  <br><br> 
<font color='grey'> Cleaning Emissions and Traffic Data <br><br>


Traffic Data

In [2]:
#setting library
import requests
import os
import json
import pandas as pd
import numpy as np

In [13]:
def traffic_query(coord_list):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    B_URL = "https://gis.mwcog.org/wa/rest/services/RTDC/Traffic_Counts_Annual/MapServer/0/query?" #base url
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "STATION,COUNTY,AADT2013,AADT2016, AADT2019,XCOORD,YCOORD", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [14]:
def clean_traffic(geo_json, item_len):
    """
    function to extract columns of interest from traffic jsons
    takes in the json created in traffic_query() as well as the length
    of response.json()["features"]
    """
    #storing
    temp = []
    
    for item in range(item_len):
        obs = {
                "id": geo_json["features"][item]["attributes"]["STATION"],
                "x": geo_json["features"][item]["attributes"]["XCOORD"],
                "y": geo_json["features"][item]["attributes"]["YCOORD"],
                "county": geo_json["features"][item]["attributes"]["COUNTY"],
                "2013": geo_json["features"][item]["attributes"]["AADT2013"],
                "2016": geo_json["features"][item]["attributes"]["AADT2016"],
                "2019": geo_json["features"][item]["attributes"]["AADT2019"]
            }
        temp.append(obs)    
        
    #return 
    return(temp)
    

#test coordinates  #DELETE AFTER GETTING IBADAT'S/UPDATE
coord_df = pd.DataFrame(columns=['cb_station', 'long', 'lat'])

coordinates = pd.DataFrame([
    {'cb_station': "a", 'long': -77.0334, 'lat': 38.89223},
    {'cb_station': "b", 'long': -77.0634, 'lat': 38.87223}
])

coord_df = pd.concat([coord_df, coordinates], ignore_index = True)


In [3]:
#read files
coord_df = pd.read_csv("progress_1.csv")
#filter for needed columns
coord_df = coord_df.filter(["STATION_ID", "LATITUDE", "LONGITUDE"])
#rename columns
coord_df = coord_df.rename(columns = {"STATION_ID": "cb_station", "LATITUDE": "lat", "LONGITUDE": "long"})

In [4]:
coord_df

Unnamed: 0,cb_station,lat,long
0,83fa4699-b2ee-4f14-af3b-167f34a62fb4,38.904415,-77.221807
1,e07eed7c-2d2a-4f13-aff7-2be6500958f7,38.916561,-77.095513
2,08249ef2-1f3f-11e7-bf6b-3863bb334450,38.905067,-77.041779
3,082544b7-1f3f-11e7-bf6b-3863bb334450,38.943837,-77.077078
4,082524a2-1f3f-11e7-bf6b-3863bb334450,38.893237,-77.086063
...,...,...,...
775,0825b42a-1f3f-11e7-bf6b-3863bb334450,39.076331,-77.141378
776,47ea64ba-00cd-4762-a90c-240244d1e4c8,38.896604,-77.039778
777,082498ac-1f3f-11e7-bf6b-3863bb334450,38.873057,-76.971015
778,08264250-1f3f-11e7-bf6b-3863bb334450,38.919019,-77.034449


In [16]:
#create empty dataframe for storage
traffic_df = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])

In [17]:
for xy in range(len(coord_df)):
    #runs query for coordinates
    test_json = traffic_query([coord_df["long"][xy], coord_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)
    test_df = test_df[(~test_df['2013'].isna()) & (~test_df['2019'].isna())]
    
    #adds if-else statement 
    if len(test_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
    elif len(test_df) == 1: #if there is only one row and we can't compute SEM
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
        test_df['change_AADT_sem'] = np.nan
    else:
        #enough rows to compute both mean change in traffic volume & standard error
        test_df = (test_df.assign(change_AADT = test_df["2013"] - test_df["2019"]).
         filter(["change_AADT"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_AADT').
         rename(columns = {"mean" : "change_AADT_mean",
                           "sem" : "change_AADT_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    test_df['station'] = coord_df["cb_station"][xy]
    test_df['long'] = coord_df["long"][xy]
    test_df['lat'] = coord_df["lat"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['station', 'change_AADT_mean', 'change_AADT_sem', 'long', 'lat']]
    #add to storage dataframe
    traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
    

  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)


In [18]:
traffic_df

Unnamed: 0,station,change_AADT_mean,change_AADT_sem,long,lat
0,83fa4699-b2ee-4f14-af3b-167f34a62fb4,2000.000000,,-77.221807,38.904415
1,e07eed7c-2d2a-4f13-aff7-2be6500958f7,808.500000,1217.500000,-77.095513,38.916561
2,08249ef2-1f3f-11e7-bf6b-3863bb334450,-1164.181818,917.056872,-77.041779,38.905067
3,082544b7-1f3f-11e7-bf6b-3863bb334450,-868.333333,652.764719,-77.077078,38.943837
4,082524a2-1f3f-11e7-bf6b-3863bb334450,2266.666667,866.666667,-77.086063,38.893237
...,...,...,...,...,...
775,0825b42a-1f3f-11e7-bf6b-3863bb334450,-750.000000,690.000000,-77.141378,39.076331
776,47ea64ba-00cd-4762-a90c-240244d1e4c8,-1532.500000,737.630434,-77.039778,38.896604
777,082498ac-1f3f-11e7-bf6b-3863bb334450,-808.500000,576.002676,-76.971015,38.873057
778,08264250-1f3f-11e7-bf6b-3863bb334450,-2295.222222,1158.950828,-77.034449,38.919019


Crime Data

In [19]:
def crime_query(coord_list, year):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    and the year to indicate which API to utilize
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    if year == 2019:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/10/query?" #base url for 2019
    elif year == 2013:
        B_URL = "https://maps2.dcgis.dc.gov/dcgis/rest/services/FEEDS/MPD/MapServer/1/query?" #base url for 2013
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "CCN,OFFENSE,WARD,METHOD,SHIFT,LONGITUDE,LATITUDE", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [20]:
def clean_crime(geo_json, item_len, year):
    """
    function to extract columns of interest from crime jsons
    takes in the json created in crime_query() as well as the length
    of response.json()["features"]. also a int indicating year of crime api
    """
    #storing
    temp = []
    
    for item in range(item_len):
        obs = {
                "id": geo_json["features"][item]["attributes"]["CCN"],
                "x": geo_json["features"][item]["attributes"]["LONGITUDE"],
                "y": geo_json["features"][item]["attributes"]["LATITUDE"],
                "ward": geo_json["features"][item]["attributes"]["WARD"],
                "method": geo_json["features"][item]["attributes"]["METHOD"],
                "shift": geo_json["features"][item]["attributes"]["SHIFT"],
                "offense": geo_json["features"][item]["attributes"]["OFFENSE"],
                "year": year
            }
        temp.append(obs)    
        
    #return 
    return(temp)

In [21]:
#create empty dataframe for storage
crime_df = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])

In [22]:
for xy in range(len(coord_df)):
    #runs query for coordinates
    test13_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2013)
    test19_json = crime_query([coord_df["long"][xy], coord_df["lat"][xy]], 2019)
    
    #adds if statement in case query fails or returns no coordinates
    if test13_json == "Query Failed" or test19_json == "Query Failed" or len(test13_json["features"]) == 0 or len(test19_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test13_clean = clean_crime(test13_json, len(test13_json["features"]), 2013)
    test19_clean = clean_crime(test19_json, len(test19_json["features"]), 2019)
    
    #converts to pandas
    test13_df = pd.DataFrame(test13_clean)
    test19_df = pd.DataFrame(test19_clean)
    
    #compute crime count
    test13_df = test13_df.assign(crime_sum = len(test13_df)).filter(['crime_sum']).drop_duplicates()
    test13_df['station'] = coord_df["cb_station"][xy]
    test19_df = test19_df.assign(crime_sum = len(test19_df)).filter(['crime_sum']).drop_duplicates()
    test19_df['station'] = coord_df["cb_station"][xy]

    #rename cols
    test13_df = test13_df.rename(columns={col: col + '_13' for col in test13_df.columns if col != 'station'})
    test19_df = test19_df.rename(columns={col: col + '_19' for col in test19_df.columns if col != 'station'})
    
    #merge df
    merged_df = pd.merge(test13_df, test19_df, on='station', how='outer')
    
    #filters for where there is data for both 2013 and 2019
    merged_df = merged_df[(~merged_df['crime_sum_13'].isna()) & (~merged_df['crime_sum_19'].isna())]
    
    #adds if-else statement 
    if len(merged_df) == 0: #in case there is no row with data for both
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns=['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat'])
        append.loc[len(append)] = np.nan
        append['station'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        #append it to crime_df
        crime_df = pd.concat([crime_df, append], ignore_index = True)
        continue
    elif len(merged_df) == 1: #if there is only one row and we can't compute SEM
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
        merged_df['change_CRIME_sem'] = np.nan
    else:
        #enough rows to compute both mean change in crime volume & standard error
        merged_df = (merged_df.assign(change_crime = merged_df["crime_sum_13"] - merged_df["crime_sum_19"]).
         filter(["change_crime"]).
         agg(["mean", "sem"]).
         reset_index().
         pivot_table(
                index = None,
                columns = 'index',
                values = 'change_crime').
         rename(columns = {"mean" : "change_CRIME_mean",
                           "sem" : "change_CRIME_sem"}).
         reset_index(drop=True)
         )
    #adds station and search coordinates
    merged_df['station'] = coord_df["cb_station"][xy]
    merged_df['long'] = coord_df["long"][xy]
    merged_df['lat'] = coord_df["lat"][xy]
    #reorder cols to match storage dataframe
    merged_df = merged_df[['station', 'change_CRIME_mean', 'change_CRIME_sem', 'long', 'lat']]
    #add to storage dataframe
    crime_df = pd.concat([crime_df, merged_df], ignore_index = True)
    

  crime_df = pd.concat([crime_df, append], ignore_index = True)
  crime_df = pd.concat([crime_df, merged_df], ignore_index = True)


In [23]:
crime_df

Unnamed: 0,station,change_CRIME_mean,change_CRIME_sem,long,lat
0,83fa4699-b2ee-4f14-af3b-167f34a62fb4,,,-77.221807,38.904415
1,e07eed7c-2d2a-4f13-aff7-2be6500958f7,4.0,,-77.095513,38.916561
2,08249ef2-1f3f-11e7-bf6b-3863bb334450,205.0,,-77.041779,38.905067
3,082544b7-1f3f-11e7-bf6b-3863bb334450,-2.0,,-77.077078,38.943837
4,082524a2-1f3f-11e7-bf6b-3863bb334450,,,-77.086063,38.893237
...,...,...,...,...,...
775,0825b42a-1f3f-11e7-bf6b-3863bb334450,,,-77.141378,39.076331
776,47ea64ba-00cd-4762-a90c-240244d1e4c8,-5.0,,-77.039778,38.896604
777,082498ac-1f3f-11e7-bf6b-3863bb334450,-86.0,,-76.971015,38.873057
778,08264250-1f3f-11e7-bf6b-3863bb334450,-18.0,,-77.034449,38.919019


School District Poverty

In [24]:
B_URL = "http://api.census.gov/data/timeseries/poverty/saipe/schdist" #base url
    

In [None]:
response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "STATION,COUNTY,AADT2013,AADT2016, AADT2019,XCOORD,YCOORD", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )

In [None]:
def traffic_query(coord_list):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    B_URL = "https://gis.mwcog.org/wa/rest/services/RTDC/Traffic_Counts_Annual/MapServer/0/query?" #base url
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "STATION,COUNTY,AADT2013,AADT2016, AADT2019,XCOORD,YCOORD", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"