# Skyss API Data Download
This notebook contains the code to download data from the Skyss API.

## Install packages
Use `pip install` or `poetry install` to install the packages.
To use Poetry, run the following commands from a terminal opened in the root directory of this repository.
```
poetry config virtualenvs.in-project true
poetry install
```
These commands create a virtual environment and install the packages specified in the `pyproject.toml` file. Use that virtual environment to run this notebook.

## Import Libraries

In [2]:
# Import the required packages
import requests
import json
from datetime import datetime, timedelta
import pandas as pd
import os
import urllib3
urllib3.disable_warnings()
import time
from requests import Request, Session
from concurrent.futures import ThreadPoolExecutor, as_completed

## API Config/Setup
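Set the environment variables `SKYSS_API_USERNAME`, `SKYSS_API_PASSWORD` and `SKYSS_API_CLIENT_SECRET` when running locally. Any variable that is not set falls back to the placeholder value `__`.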

In [None]:
# Skyss API credentials, read from the environment.
# Any variable that is not set falls back to the placeholder '__'.
skyss_api_username = os.environ.get('SKYSS_API_USERNAME', '__')
skyss_api_password = os.environ.get('SKYSS_API_PASSWORD', '__')
skyss_api_client_secret = os.environ.get('SKYSS_API_CLIENT_SECRET', '__')

##  Various functions for getting/downloading data
The functions below download the data from the Skyss API.

In [None]:
# Define the function for token
def get_bearer_token():
    '''Request an access token and return it as an Authorization header value.

    Sends a password-grant POST to the Skyss identity server. The user is
    identified by the module-level ``skyss_api_username`` /
    ``skyss_api_password`` and the client by ``skyss_api_client_secret``
    (passed as HTTP basic auth together with the client id).

    Returns:
        str: ``'Bearer <access_token>'``.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200 or the
            response body has no ``access_token`` field.
        requests.RequestException: On connection problems or timeout.
    '''
    auth_url = 'https://skysspublicapi-identityserver-test.azurewebsites.net/connect/token'
    client_id = 'user-client'
    form = {
        'grant_type': 'password',
        'username': skyss_api_username,
        'password': skyss_api_password,
    }
    # TLS certificate verification is left enabled (the default): this request
    # carries the password and client secret, and the other API calls in this
    # notebook verify certificates as well.
    response = requests.post(
        auth_url,
        data=form,
        allow_redirects=False,
        auth=(client_id, skyss_api_client_secret),
        timeout=60,
    )
    # Fail loudly here instead of with a KeyError/JSON error further down.
    # The body is deliberately not echoed; it may contain sensitive details.
    if response.status_code != 200:
        raise RuntimeError(
            f'Token request failed with HTTP status {response.status_code}'
        )
    tokens = response.json()
    if 'access_token' not in tokens:
        raise RuntimeError('Token response did not contain an access_token')
    return 'Bearer ' + tokens['access_token']

In [None]:
# Define the function for lines data
def getLines(bearer_token):
    '''Fetch the Lines dataset from the Skyss API.

    Args:
        bearer_token (str): Value for the ``Authorization`` header, e.g. the
            string returned by ``get_bearer_token()``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for data.
        requests.RequestException: On connection problems or timeout.
    '''
    url = 'https://skyss-public-api-test.azurewebsites.net/api/Lines'
    headers = {'Authorization': bearer_token}
    response = requests.get(url, headers=headers, timeout=120)
    response.raise_for_status()
    return response.json()

In [None]:
# Define the function for StopPoints data
def getStopPoints(bearer_token):
    '''Fetch the StopPoints dataset from the Skyss API.

    Args:
        bearer_token (str): Value for the ``Authorization`` header, e.g. the
            string returned by ``get_bearer_token()``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for data.
        requests.RequestException: On connection problems or timeout.
    '''
    url = 'https://skyss-public-api-test.azurewebsites.net/api/StopPoints'
    headers = {'Authorization': bearer_token}
    response = requests.get(url, headers=headers, timeout=120)
    response.raise_for_status()
    return response.json()

In [None]:
# Define the function for the Stops data length
def getStopsDataLength(bearer_token, from_date, to_date, max_retries=5):
    '''Ask the API how many stop-point rows exist between two operating dates.

    The request is retried a bounded number of times with a growing pause.
    A 401 answer triggers a fresh token via ``get_bearer_token()`` before the
    next attempt.

    Args:
        bearer_token (str): Value for the ``Authorization`` header.
        from_date (str): Sent as the ``FromOperatingDate`` query parameter.
        to_date (str): Sent as the ``ToOperatingDate`` query parameter.
        max_retries (int): Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body of the length endpoint (presumably an integer
        row count - confirm against the API).

    Raises:
        RuntimeError: If no attempt produced an HTTP 200 response.
    '''
    length_url = "https://skyss-public-api-test.azurewebsites.net/api/stopPoints/Data/Length"
    payload_length = {'FromOperatingDate': from_date,
                      'ToOperatingDate': to_date}
    headers = {'Authorization': bearer_token}
    attempts = max(1, int(max_retries))
    last_problem = None
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(length_url, headers=headers,
                                    params=payload_length, timeout=120)
        except requests.RequestException as exc:
            # Network-level failure: remember it and retry.
            last_problem = repr(exc)
        else:
            if response.status_code == 200:
                data_length = response.json()
                print(f'Data Length from {from_date} to {to_date} is : {data_length}')
                return data_length
            last_problem = f'HTTP {response.status_code}'
            if response.status_code == 401:
                # An expired/invalid token shows up as a status code, not as
                # an exception, so the refresh has to happen here.
                headers = {'Authorization': get_bearer_token()}
        if attempt < attempts:
            print('Request failed for length, Trying again')
            time.sleep(attempt)  # simple linear backoff
    raise RuntimeError(
        f'Could not get data length from {from_date} to {to_date} '
        f'after {attempts} attempts (last problem: {last_problem})'
    )

In [None]:
# Date processing function
def date_to_iso_str(date_dt):
    '''Format a datetime as ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    The microsecond field is cut down to milliseconds (truncated, not
    rounded) and a literal ``Z`` is appended.
    '''
    with_microseconds = datetime.strftime(date_dt, "%Y-%m-%dT%H:%M:%S.%f")
    # Drop the last three of the six microsecond digits.
    with_milliseconds = with_microseconds[:-3]
    return f"{with_milliseconds}Z"

In [None]:
# Column name -> pandas dtype mapping applied to the downloaded trips data.
trips_df_dtypes = dict(
    # Identifiers and status
    id='object',
    globalTripId='object',
    tripKey='object',
    tripStatus='object',
    operatingDate='object',
    timeKey='int64',
    lineKey='int64',
    lineNameShort='object',
    routeFromToKey='object',
    stopKey='int64',
    nsrQuay='object',
    sequenceInJourney='int64',
    # Passenger counts
    isValidAPC='int64',
    onboard='float64',
    enteredIn='float64',
    enteredOut='float64',
    directionCode='int64',
    # Timing
    plannedDepartureTime='object',
    plannedTimeSinceLastStop='float64',
    actualDepartureTime='object',
    actualTimeSinceLastStop='float64',
    stopDurationSeconds='float64',
    actualRideTime='float64',
    delayOnDepartureSeconds='float64',
    # Distances and route position
    measuredDistanceToPrevPointInJourney='float64',
    totalTripLength='float64',
    isRouteOrigin='int64',
    isRouteDestination='int64',
    # Capacity
    seatedCapacity='int64',
    standingCapacity='int64',
    allowedOnboard='int64',
    realisticCapacity='int64',
    # Calendar flags and remaining timestamps
    isSchoolVacation='int64',
    isPublicHoliday='int64',
    changedAfterDate='object',
    actualArrivalTime='object',
    plannedArrivalTime='object',
)

In [None]:
# Helper function for Sending the Prepared Request & download the data
def download_req(req, s, max_retries=5, backoff_seconds=1.0):
    '''Send a prepared request through a session and return its JSON body.

    The request is retried a bounded number of times. A 401 answer replaces
    the request's ``Authorization`` header with a fresh token from
    ``get_bearer_token()`` before the next attempt; other failures are
    retried after a pause that grows with each attempt.

    Args:
        req: Prepared request to send; its ``headers`` mapping is updated in
            place when the token is refreshed.
        s: Session-like object exposing ``send(req)``.
        max_retries (int): Maximum number of attempts before giving up.
        backoff_seconds (float): Base pause; attempt ``k`` waits
            ``k * backoff_seconds`` before the next attempt.

    Returns:
        The decoded JSON body of the first HTTP 200 response.

    Raises:
        RuntimeError: If no attempt produced an HTTP 200 response.
    '''
    attempts = max(1, int(max_retries))
    last_problem = None
    for attempt in range(1, attempts + 1):
        try:
            resp = s.send(req)
        except requests.RequestException as exc:
            # Network-level failure: remember it and retry.
            last_problem = repr(exc)
        else:
            if resp.status_code == 200:
                return resp.json()
            last_problem = f'HTTP {resp.status_code}'
            if resp.status_code == 401:
                # An expired/invalid token shows up as a status code, not as
                # an exception, so the refresh has to happen here.
                req.headers['Authorization'] = get_bearer_token()
        if attempt < attempts:
            time.sleep(backoff_seconds * attempt)
    raise RuntimeError(
        f'Download failed after {attempts} attempts (last problem: {last_problem})'
    )

def _normalize_trip_status(value):
    '''Return a tripStatus value as text; nulls are passed through unchanged.

    Bytes are decoded as UTF-8, anything else is converted with ``str``, and
    the text ``'0'`` is replaced by ``'Zero'`` (kept from the original
    clean-up step; the reason for that substitution is not visible here).
    '''
    if pd.isna(value):
        return value
    text = value.decode('utf8') if isinstance(value, bytes) else str(value)
    return 'Zero' if text == '0' else text


# Define the function for Stops data
def getStopsData(bearer_token,from_date,to_date,data_folder,page_size=2000,max_workers=50):
    '''Download the stop-point rows between two dates and save them as Parquet.

    The row count is fetched first, then the rows are requested in pages of
    ``page_size`` using a thread pool. The combined rows are cast to
    ``trips_df_dtypes`` and written to
    ``<data_folder>/<date part of from_date>.parquet``.

    Args:
        bearer_token (str): Value for the ``Authorization`` header.
        from_date (str): Sent as ``FromOperatingDate``; the text before ``'T'``
            becomes the output file name.
        to_date (str): Sent as ``ToOperatingDate``.
        data_folder (str): Output directory; created if it does not exist.
        page_size (int): Rows requested per call (``count`` query parameter).
        max_workers (int): Number of download threads.

    Returns:
        None. When the API reports or returns no rows, nothing is written.
    '''
    total_rows = int(getStopsDataLength(bearer_token, from_date, to_date))
    data_url = "https://skyss-public-api-test.azurewebsites.net/api/stopPoints/Data"
    headers = { 'Authorization':bearer_token }
    # One prepared request per page of rows.
    req_list = [
        Request("GET", data_url, headers=headers,
                params={'FromOperatingDate': from_date,
                        'ToOperatingDate': to_date,
                        'start': start_row,
                        'count': page_size}).prepare()
        for start_row in range(0, total_rows, page_size)
    ]
    data_list = []
    # The executor is exited first (waiting for all downloads), then the
    # session is closed so its connections are released.
    with Session() as s, ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in request order, so the rows end up in
        # a deterministic order regardless of which page finishes first.
        for page in executor.map(lambda prepared: download_req(prepared, s), req_list):
            data_list.extend(page)
    if not data_list:
        # An empty frame has none of the expected columns, so the dtype cast
        # below would fail; skip the file instead of aborting the whole run.
        print(f'No data from {from_date} to {to_date}, nothing saved')
        return
    df = pd.DataFrame(data_list).astype(trips_df_dtypes)
    # Clean tripStatus value by value. Series.str.decode is not used because
    # it turns values that are already str into NaN, which wiped the column.
    df['tripStatus'] = df['tripStatus'].map(_normalize_trip_status)
    # Filename is the date part of from_date + .parquet
    from_date_str = from_date.split('T')[0]
    file_name = from_date_str+'.parquet'
    # save the file in data_folder
    os.makedirs(data_folder, exist_ok=True)
    df.to_parquet(os.path.join(data_folder, file_name),index=False,engine='fastparquet',object_encoding='utf8')

In [None]:
# Function to get the Trips Data
def getTripsData(bearer_token, from_operating_date, to_operating_date,data_folder):
    '''Download the stops data one day at a time for the given date range.

    Each step calls ``getStopsData`` for a window of at most one day, which
    writes one file per window into ``data_folder``.

    Args:
        bearer_token (str): Value for the ``Authorization`` header.
        from_operating_date (str): Start of the range, parseable by
            ``pd.to_datetime`` (e.g. ``'2019-10-01T05:00:00.000Z'``).
        to_operating_date (str): End of the range (exclusive), same format.
        data_folder (str): Output directory passed on to ``getStopsData``.

    Returns:
        None. Nothing is downloaded when the end is not after the start.
    '''
    end_dt = pd.to_datetime(to_operating_date)
    current_date = from_operating_date
    current_dt = pd.to_datetime(current_date)
    # Compare parsed datetimes rather than strings: a string inequality test
    # never ends when the range is not a whole number of days, the end date
    # is formatted differently, or the end lies before the start.
    while current_dt < end_dt:
        # Clamp the last window so it does not run past the requested end.
        next_dt = min(current_dt + timedelta(days=1), end_dt)
        next_date = date_to_iso_str(next_dt)
        getStopsData(bearer_token, current_date, next_date,data_folder)
        current_date, current_dt = next_date, next_dt

In [None]:
# Function to download the data
def downloadData(data_name, data_params=None):
    '''Download one of the Skyss datasets.

    Args:
        data_name (str): ``'lines'`` (writes ``Lines.csv``), ``'stops'``
            (writes ``StopPoints.csv``) or ``'trips'`` (writes daily files via
            ``getTripsData``).
        data_params (dict, optional): Required for ``'trips'``; must contain
            ``'from_operating_date'``, ``'to_operating_date'`` and
            ``'data_folder'``.

    Raises:
        ValueError: If ``data_name`` is not one of the supported names, or if
            ``'trips'`` is requested without all required parameters.
    '''
    supported = ('lines', 'stops', 'trips')
    # Validate before requesting a token so a typo fails fast and offline
    # instead of silently doing nothing.
    if data_name not in supported:
        raise ValueError(
            f'Unknown data_name {data_name!r}; expected one of {supported}'
        )
    if data_name == 'trips':
        required = ('from_operating_date', 'to_operating_date', 'data_folder')
        given = data_params or {}
        missing = [key for key in required if key not in given]
        if missing:
            raise ValueError(f"data_params for 'trips' is missing: {missing}")

    # Get the required token
    bearer_token = get_bearer_token()
    if data_name == 'lines':
        # Lines data in dataframe format, written as csv
        lines_df = pd.DataFrame(getLines(bearer_token))
        lines_df.to_csv('Lines.csv', index=False)
    elif data_name == 'stops':
        # StopPoints data in dataframe format, written as csv
        StopPoints_df = pd.DataFrame(getStopPoints(bearer_token))
        StopPoints_df.to_csv('StopPoints.csv', index=False)
    else:
        # Download the daily data for a range of dates
        getTripsData(
            bearer_token,
            data_params['from_operating_date'],
            data_params['to_operating_date'],
            data_params['data_folder'],
        )

## Download the different datasets

In [None]:
# Example: download the lines data (written to Lines.csv in the working directory)
downloadData('lines')

In [None]:
# Example: download the stops data (written to StopPoints.csv in the working directory)
downloadData('stops')

In [None]:
# Example: time a download of the trips data (set the data_params accordingly)
data_params = dict(
    from_operating_date='2019-10-01T05:00:00.000Z',
    to_operating_date='2019-12-01T05:00:00.000Z',
    data_folder='drive/MyDrive/Trips_pq',  # Don't forget to mount the drive
)

start = time.time()
print("Start Time", time.ctime(), sep="\n")
# The download call is left disabled; uncomment it to run the download.
#downloadData('trips', data_params)
end = time.time()
print("End Time", time.ctime(), sep="\n")
elapsed_minutes = int((end - start)/60)
print("Time Taken (in Minutes)", elapsed_minutes, sep="\n")