<h1><center> Data Cleaning File  <br><br> 
<font color='grey'> Cleaning Emissions and Traffic Data <br><br>


Traffic Data

In [2]:
#setting library
import requests
import os
import json
import pandas as pd
import numpy as np

In [1]:
def traffic_query(coord_list):
    """
    takes in a list of captial bikeshare coordinates
    to tell API what data to retrieve
    """
    
    #extract coordinates
    long = coord_list[0] #UPDATE LATER AFTER SEEING HOW IBADAT SET IT UP
    lat = coord_list[1]
    
    #run query
    B_URL = "https://gis.mwcog.org/wa/rest/services/RTDC/Traffic_Counts_Annual/MapServer/0/query?" #base url
    
    response = requests.get(
        B_URL,
        params = {
            "where": "1=1", #no filters
            "outFields": "AADT2007,AADT2008,AADT2009,AADT2010,AADT2011,AADT2012,AADT2013,AADT2014,AADT2015,AADT2016,AADT2017,AADT2018,AADT2019", #indicates which cols to return
            "geometry": f"{long},{lat}", #input coordinates
            "geometryType": "esriGeometryPoint", #indicates we're giving it points
            "distance": 500, #how far away from point
            "units": "esriSRUnit_Meter", #units in meters
            "inSR": "4326", #coordiante system
            "f": "json" #type of file to return
        }
    )
    
    #check if successful
    if response.status_code == 200:
        return response.json()
    else:
        return "Query Failed"

In [2]:
def clean_traffic(geo_json, item_len):
    """
    function to extract columns of interest from traffic jsons
    takes in the json created in traffic_query() as well as the length
    of response.json()["features"]
    """
    #storing
    temp = []
    
    #grabbing cols
    for item in range(item_len):
        obs = {
                "2007": geo_json["features"][item]["attributes"]["AADT2007"],
                "2008": geo_json["features"][item]["attributes"]["AADT2008"],
                "2009": geo_json["features"][item]["attributes"]["AADT2009"],
                "2010": geo_json["features"][item]["attributes"]["AADT2010"],
                "2011": geo_json["features"][item]["attributes"]["AADT2011"],
                "2012": geo_json["features"][item]["attributes"]["AADT2012"],
                "2013": geo_json["features"][item]["attributes"]["AADT2013"],
                "2014": geo_json["features"][item]["attributes"]["AADT2014"],
                "2015": geo_json["features"][item]["attributes"]["AADT2015"],
                "2016": geo_json["features"][item]["attributes"]["AADT2016"],
                "2017": geo_json["features"][item]["attributes"]["AADT2017"],
                "2018": geo_json["features"][item]["attributes"]["AADT2018"],
                "2019": geo_json["features"][item]["attributes"]["AADT2019"]
            }
        #appending to list
        temp.append(obs)    
        
    #return 
    return(temp)
    

In [5]:
#read open capital bikeshare file
coord_df = pd.read_csv("../data/raw_data/opened_capital_bikes.csv")

In [6]:
#filter for needed columns
coord_df = coord_df.filter(["STATION_ID", "LATITUDE", "LONGITUDE", "Opening Year", "Name"])

In [7]:
#rename columns
coord_df = coord_df.rename(columns = {"STATION_ID": "cb_station", "LATITUDE": "lat", "LONGITUDE": "long", "Opening Year": "open_year", "Name": "name"})

In [9]:
#create empty dataframe for storage
traffic_df = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name',
                                     '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'])

In [11]:
#script to get average traffic volume for all opened capital bike shares
for xy in range(len(coord_df)):
    #runs query for coordinates
    test_json = traffic_query([coord_df["long"][xy], coord_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name',
                                     '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'])
        append.loc[len(append)] = np.nan
        append['id'] = coord_df["cb_station"][xy]
        append['long'] = coord_df["long"][xy]
        append['lat'] = coord_df["lat"][xy]
        append['open_year'] = coord_df["open_year"][xy]
        append['name'] = coord_df["name"][xy]
        #append it to traffic_df
        traffic_df = pd.concat([traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)

    #gets mean traffic volume for all years
    test_df = test_df.agg(["mean"]).reset_index(drop = True)

    #adds station and search coordinates
    test_df['id'] = coord_df["cb_station"][xy]
    test_df['long'] = coord_df["long"][xy]
    test_df['lat'] = coord_df["lat"][xy]
    test_df['open_year'] = coord_df["open_year"][xy]
    test_df['name'] = coord_df["name"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['id', 'long', 'lat', 'open_year', 'name','2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]
    #add to storage dataframe
    traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
    

  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)
  traffic_df = pd.concat([traffic_df, test_df], ignore_index = True)


In [13]:
#saving traffic data for opened capital bikes
traffic_df.to_csv("../data/cleaned_data/opened_cb_traffic.csv", index = False)

In [3]:
#reading unopened capital bikeshare files (proposed bikeshares)
unopened_df = pd.read_excel("../data/raw_data/unopened_capital_bikes_proposed.xlsx")

In [15]:
#filter for needed columns
unopened_df = unopened_df.filter(["FID", "x", "y", "ClosestInt"])

In [16]:
#rename columns
unopened_df = unopened_df.rename(columns = {"FID": "cb_station", "y": "lat", "x": "long", "ClosestInt": "name"})

In [17]:
#create empty dataframe for storage
unopened_traffic_df = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name',
                                     '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'])

In [18]:
#script to get average traffic volume for all unopened capital bike shares
for xy in range(len(unopened_df)):
    #runs query for coordinates
    test_json = traffic_query([unopened_df["long"][xy], unopened_df["lat"][xy]])
    
    #adds if statement in case query fails or returns no coordinates
    if test_json == "Query Failed" or len(test_json["features"]) == 0:
        #creates dataframe with NaN values for AADT
        append = pd.DataFrame(columns = ['id', 'long', 'lat', 'open_year', 'name',
                                     '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'])
        append.loc[len(append)] = np.nan
        append['id'] = unopened_df["cb_station"][xy]
        append['long'] = unopened_df["long"][xy]
        append['lat'] = unopened_df["lat"][xy]
        append['name'] = unopened_df["name"][xy]
        #append it to traffic_df
        unopened_traffic_df = pd.concat([unopened_traffic_df, append], ignore_index = True)
        continue
        
    #cleans up resulting json
    test_clean = clean_traffic(test_json, len(test_json["features"]))
    #converts to pandas and filters for where there is data for both 2013 and 2019
    test_df = pd.DataFrame(test_clean)

    #gets mean traffic volume for all years
    test_df = test_df.agg(["mean"]).reset_index(drop = True)

    #adds station and search coordinates
    test_df['id'] = unopened_df["cb_station"][xy]
    test_df['long'] = unopened_df["long"][xy]
    test_df['lat'] = unopened_df["lat"][xy]
    test_df['open_year'] = np.nan
    test_df['name'] = unopened_df["name"][xy]
    #reorder cols to match storage dataframe
    test_df = test_df[['id', 'long', 'lat', 'open_year', 'name','2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]
    #add to storage dataframe
    unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)
    

  unopened_traffic_df = pd.concat([unopened_traffic_df, test_df], ignore_index = True)


In [20]:
#saving traffic data for unopened capital bikes
unopened_traffic_df.to_csv("../data/cleaned_data/unopened_cb_traffic.csv", index = False)