In [1]:
import pandas as pd
import time
import requests
from datetime import date

In [2]:
# The historical trip data has no station IDs, so fetch them from the live station-information endpoint.
def get_station_ids(url='https://gbfs.velobixi.com/gbfs/en/station_information.json', timeout=10):
    """Fetch the current station list and return its id/name pairs.

    Parameters
    ----------
    url : str, optional
        Endpoint returning a JSON document whose station records live under
        ``data -> stations``. Defaults to the velobixi.com
        ``station_information.json`` feed.
    timeout : float, optional
        Seconds to wait for the server, so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``station_id`` (converted with ``pd.to_numeric``) and
        ``name``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of with an opaque KeyError or
    # JSON decoding error further down.
    response.raise_for_status()
    stations = pd.DataFrame(response.json()['data']['stations'])
    # .assign() builds a new frame, so nothing is written into a slice of
    # `stations` (which could emit SettingWithCopyWarning).
    return stations[['station_id', 'name']].assign(
        station_id=lambda frame: pd.to_numeric(frame['station_id'])
    )


In [3]:
def add_id_to_stations(df, station_ids=None):
    """Attach start and end station IDs to a frame of station-name pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``STARTSTATIONNAME`` and ``ENDSTATIONNAME`` columns.
        It is not modified.
    station_ids : pandas.DataFrame, optional
        Lookup table with ``station_id`` and ``name`` columns. When omitted
        it is fetched with ``get_station_ids()``, which performs a network
        request; pass it explicitly to work offline or to reuse one fetch.

    Returns
    -------
    pandas.DataFrame
        An ``index`` column (the row position after merging), followed by
        the columns of ``df``, then ``STARTSTATIONID`` and ``ENDSTATIONID``.
        Both merges are inner joins, so rows whose start or end name has no
        match in the lookup table are dropped.
    """
    if station_ids is None:
        station_ids = get_station_ids()
    lookup = station_ids[['station_id', 'name']]

    # Rename the lookup columns up front so each merge joins on an identically
    # named key and adds exactly one new column; no suffixed leftovers to drop.
    start_lookup = lookup.rename(
        columns={'station_id': 'STARTSTATIONID', 'name': 'STARTSTATIONNAME'}
    )
    end_lookup = lookup.rename(
        columns={'station_id': 'ENDSTATIONID', 'name': 'ENDSTATIONNAME'}
    )

    with_ids = (
        df.merge(start_lookup, on='STARTSTATIONNAME')
        .merge(end_lookup, on='ENDSTATIONNAME')
    )
    # reset_index() without drop=True keeps the leading `index` column that
    # existing consumers of this function's output already receive.
    return with_ids.reset_index()

In [4]:
# TODO: add date, month, year, hour, minute and second columns

In [5]:
# def find_delta_time_range(timestart=None, timeend=None):
#     #calculate the time each trip took
#     start_utc_time = datetime.strptime(timestart, "%Y-%m-%dT%H:%M:%S.%fZ")
#     end_utc_time = datetime.strptime(timeend, "%Y-%m-%dT%H:%M:%S.%fZ")
#     #epoch_time = (utc_time - datetime(1970, 1, 1)).total_seconds()
#     raw_data = pd.read_csv(filename, sep=',')
#     df = pd.DataFrame(raw_data)

#     #if year is not mentioned, calculate the average across all years


#     #if month not mentioned 
#     df_rank["DELTATIME"] = df_rank["ENDTIMEMS"]-df_rank["STARTTIMEMS"] 
#     df_rank['DELTATIME'] = pd.to_timedelta(df_rank['DELTATIME'], unit='ms')
#     df_rank.sort_values('count',ascending=False, inplace=True)
#     mean_df= df_rank.groupby(['STARTSTATIONNAME','ENDSTATIONNAME','count'],as_index=False)['DELTATIME'].mean()
#     mean_df.rename(columns={'count':'NUMBEROFTRIPS'}, inplace=True)
#     mean_df.sort_values('DELTATIME',ascending=False, inplace=True)

In [6]:
def find_mean_time(filename):
    """Compute the trip count and mean trip duration per station pair.

    Parameters
    ----------
    filename : str or path-like
        Comma-separated file containing at least the columns
        ``STARTSTATIONNAME``, ``ENDSTATIONNAME``, ``STARTTIMEMS`` and
        ``ENDTIMEMS`` (the two time columns are treated as milliseconds).

    Returns
    -------
    pandas.DataFrame
        One row per (start, end) station pair with the columns
        ``STARTSTATIONNAME``, ``ENDSTATIONNAME``, ``NUMBEROFTRIPS`` and
        ``DELTATIME`` (mean duration as a timedelta), sorted by ``DELTATIME``
        in descending order.
    """
    # Rows with a missing value in ANY column of the file are discarded before
    # the four relevant columns are selected.
    trips = pd.read_csv(filename, sep=',').dropna()
    trips = trips[['STARTSTATIONNAME', 'ENDSTATIONNAME', 'STARTTIMEMS', 'ENDTIMEMS']]

    # Duration of every individual trip.
    durations = trips[['STARTSTATIONNAME', 'ENDSTATIONNAME']].assign(
        DELTATIME=pd.to_timedelta(trips['ENDTIMEMS'] - trips['STARTTIMEMS'], unit='ms')
    )

    # A single groupby yields both the number of trips and the mean duration.
    # This avoids depending on the name value_counts() gives its result column
    # (which differs between pandas versions) and avoids merging the counts
    # back onto every trip row.
    mean_df = (
        durations.groupby(['STARTSTATIONNAME', 'ENDSTATIONNAME'])['DELTATIME']
        .agg(NUMBEROFTRIPS='size', DELTATIME='mean')
        .reset_index()
    )
    return mean_df.sort_values('DELTATIME', ascending=False)

In [7]:
# Aggregate the 2023 trip file into one row per (start, end) station pair.
df= find_mean_time('DonneesOuvertes2023.csv')
# Bare last expression so the notebook renders the resulting frame.
df

Unnamed: 0,STARTSTATIONNAME,ENDSTATIONNAME,NUMBEROFTRIPS,DELTATIME
344743,Ste-Émilie / Sir-Georges-Etienne-Cartier,10e avenue / Masson,1,166 days 05:31:53.517000
168747,Mairie d'arrondissement St-Laurent (Beaudet / ...,de la Peltrie / de la Côte-des-Neiges,1,91 days 22:28:00.311000
80582,Crescent / de Maisonneuve N/E,Métro Verdun (Willibrord / de Verdun),2,59 days 17:25:01.657000
289476,Richmond / des Bassins,Gare Montpellier (Muir / de la Côte-Vertu),1,58 days 22:53:08.858000
428065,du Souvenir / Chomedey,Notre-Dame-de-Grâce / Westmount,1,44 days 19:19:59.031000
...,...,...,...,...
334504,St-Viateur / Casgrain,Henri-Julien / du Carmel,3,0 days 00:01:16.196333333
334635,St-Viateur / Casgrain,St-Dominique / St-Viateur,2,0 days 00:01:14.273500
343762,Ste-Cathrine / Drummond,Ste-Cathrine / Drummond,11,0 days 00:00:50.049818181
387406,de Maisonneuve / Mackay,Mackay / Ste-Catherine,1,0 days 00:00:47.029000


In [8]:
# Attach start/end station IDs; by default this fetches the station list over HTTP via get_station_ids().
df2= add_id_to_stations(df)


In [9]:
# Persist the ID-enriched averages; index=False leaves the DataFrame's row index out of the file.
df2.to_csv('average_time_between_stations.csv',index=False)

In [10]:
def find_station_most_popular_destination(name, trips=None):
    """Return the busiest outgoing station pair for a start station name.

    Parameters
    ----------
    name : str
        Value to match exactly against ``STARTSTATIONNAME``.
    trips : pandas.DataFrame, optional
        Frame with ``STARTSTATIONNAME`` and ``NUMBEROFTRIPS`` columns.
        Defaults to the notebook-level ``df2`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        At most one row: the matching row with the highest ``NUMBEROFTRIPS``.
        Empty when no row starts at ``name``.
    """
    if trips is None:
        # Fall back to the notebook global for backward compatibility.
        trips = df2
    from_station = trips[trips['STARTSTATIONNAME'] == name]
    return from_station.sort_values(by="NUMBEROFTRIPS", ascending=False).head(n=1)

In [11]:
# Example lookup by station name; the result is rendered as the cell output.
find_station_most_popular_destination("Peel / Notre-Dame")

Unnamed: 0,index,STARTSTATIONNAME,ENDSTATIONNAME,NUMBEROFTRIPS,DELTATIME,STARTSTATIONID,ENDSTATIONID
19945,19945,Peel / Notre-Dame,Peel / Notre-Dame,1400,0 days 00:13:13.577770,77,77


In [None]:
def find_station_most_popular_destination_id(id, trips=None):
    """Return the busiest outgoing station pair for a start station ID.

    Parameters
    ----------
    id : int
        Value to match exactly against ``STARTSTATIONID``. (The name shadows
        the ``id`` builtin; it is kept so keyword callers do not break.)
    trips : pandas.DataFrame, optional
        Frame with ``STARTSTATIONID`` and ``NUMBEROFTRIPS`` columns.
        Defaults to the notebook-level ``df2`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        At most one row: the matching row with the highest ``NUMBEROFTRIPS``.
        Empty when no row starts at station ``id``.
    """
    if trips is None:
        # Fall back to the notebook global for backward compatibility.
        trips = df2
    from_station = trips[trips['STARTSTATIONID'] == id]
    return from_station.sort_values(by="NUMBEROFTRIPS", ascending=False).head(n=1)

In [None]:
# Example lookup by numeric station ID; the result is rendered as the cell output.
find_station_most_popular_destination_id(421)

In [None]:
def lambda_handler(event, context):
    raw_data = pd.read_csv('DonneesOuvertes2023.csv', sep=',')
    