In [38]:
"""
This file can be used to generate inflow and outflow at BOTH cluster-level (H=60, d=0.5) and station-level (H=10, d=0.01)

"""

'\nThis file can be used to generate inflow and outflow at BOTH cluster-level (H=60, d=0.5) and station-level (H=10, d=0.01)\n\n'

In [39]:
import datetime
import cvxpy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from haversine import haversine_vector, Unit
import os

In [40]:
def temporal_clustering(H,demand):
    """Add integer time-of-day slot columns to ``demand`` (in place).

    Each ``'%H:%M:%S'`` string in ``start_time`` / ``end_time`` is converted to
    minutes since midnight (seconds are ignored) and floor-divided by ``H``.

    Parameters
    ----------
    H : int or float
        Timeslot duration in minutes; must be positive.
    demand : pandas.DataFrame
        Trip table with ``start_time`` and ``end_time`` string columns.
        ``start_timeslot`` is inserted at column position 3 and
        ``end_timeslot`` at position 6. If a slot column already exists (for
        example when the cell is re-run) its values are overwritten instead.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If ``H`` is not positive, or a time string does not match
        ``'%H:%M:%S'``.
    """
    if H <= 0:
        raise ValueError(f"H must be a positive number of minutes, got {H!r}")

    def to_timeslot(time_str):
        # Parse once per value; only the hour and minute fields are used.
        parsed = datetime.datetime.strptime(time_str, '%H:%M:%S')
        return int(np.floor((60 * parsed.hour + parsed.minute) / H))

    # (column position, source column, derived slot column)
    slot_specs = (
        (3, 'start_time', 'start_timeslot'),
        (6, 'end_time', 'end_timeslot'),
    )
    for position, source_col, slot_col in slot_specs:
        slots = demand[source_col].apply(to_timeslot)
        if slot_col in demand.columns:
            # DataFrame.insert refuses duplicate names, so overwrite on re-runs.
            demand[slot_col] = slots
        else:
            demand.insert(position, slot_col, slots)

In [41]:
def spatial_clustering(d,demand,stations_path='stations.csv'):
    """Group stations into spatial clusters and tag each trip with its clusters.

    Cluster centres are picked by a set-cover integer program: the smallest
    set of stations such that every station has at least one chosen centre
    strictly closer than ``d`` km. Every station is then assigned to its
    nearest chosen centre.

    Parameters
    ----------
    d : float
        Coverage radius in kilometres.
    demand : pandas.DataFrame
        Trip table with ``start_station`` and ``end_station`` columns whose
        values appear in the index of the stations table. Modified in place:
        ``start_cluster`` and ``end_cluster`` columns are added.
    stations_path : str, optional
        CSV file read with its first column as the index; it must provide
        ``x`` and ``y`` columns. Defaults to ``'stations.csv'``.

    Returns
    -------
    station_to_cluster_map : dict
        Maps each stations-table index value to a cluster id, where the id is
        the position of the cluster centre among the selected centres.
    stations : pandas.DataFrame
        The stations table with two extra columns: ``cc`` (1.0 for a cluster
        centre, else 0.0) and ``cc_id`` (assigned cluster id).

    Raises
    ------
    RuntimeError
        If the solver returns no solution.
    KeyError
        If ``demand`` references a station missing from the stations table.
    """
    stations = pd.read_csv(stations_path,index_col=0)

    # haversine takes (latitude, longitude) pairs.
    # NOTE(review): assumes y holds latitude and x longitude - confirm against
    # stations.csv.
    stn_coords = list(zip(stations.y, stations.x))

    pairwise_km = haversine_vector(stn_coords, stn_coords, Unit.KILOMETERS, comb=True)
    within_radius = 1*(pairwise_km < d)  # 1 where two stations are closer than d km

    # ILP to select cluster centres: minimise the number of chosen stations
    # subject to every station being covered by at least one chosen station.
    x = cp.Variable((len(stations),),boolean=True)
    prob = cp.Problem(cp.Minimize(cp.sum(x)), [x.T@within_radius>=1])
    prob.solve()
    if x.value is None:
        raise RuntimeError(
            f"cluster-centre selection failed (solver status: {prob.status})"
        )

    # Solvers may return values such as 0.9999999; round before comparing to 1.
    stations['cc'] = np.round(x.value)
    cluster_centers = stations.loc[stations.cc==1]

    # Same (y, x) order as stn_coords. The original built these as (x, y), so
    # distances to the centres were computed with swapped coordinates.
    cc_coords = list(zip(cluster_centers.y, cluster_centers.x))

    # One row per station, one column per centre, hence argmin over axis=1.
    dist_to_centers = haversine_vector(cc_coords, stn_coords, Unit.KILOMETERS, comb=True)
    stations['cc_id'] = np.argmin(dist_to_centers, axis=1)

    station_to_cluster_map = dict(zip(stations.index, stations.cc_id))

    # Tag every trip with the cluster of its origin and destination station.
    demand['start_cluster'] = demand['start_station'].apply(lambda stn: station_to_cluster_map[stn])
    demand['end_cluster'] = demand['end_station'].apply(lambda stn: station_to_cluster_map[stn])

    return station_to_cluster_map, stations

In [42]:
""" windows 
cd C:\Work\WORK_PACKAGE\Demand_forecasting\github\blue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation
"""

' windows \ncd C:\\Work\\WORK_PACKAGE\\Demand_forecasting\\github\x08lue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation\n'

In [43]:
# Display the kernel's current working directory; relative paths such as
# 'stations.csv' and 'transaction_logs.csv' are resolved against it.
os.getcwd()

'/home/optimusprime/Desktop/peeterson/github/DeepAR_demand_prediction/2_freq_nbinom_LSTM/1_cluster_demand_prediction/data/demand_data/standalone'

In [44]:
if __name__ == "__main__":
    # Directory holding transaction_logs.csv. Defaults to the original author's
    # checkout; set the DEMAND_DATA_DIR environment variable to run elsewhere
    # without editing this cell.
    data_dir = os.environ.get(
        'DEMAND_DATA_DIR',
        '/home/optimusprime/Desktop/peeterson/github/DeepAR_demand_prediction/2_freq_nbinom_LSTM/1_cluster_demand_prediction/data/demand_data/standalone',
    )
    os.chdir(data_dir)
    demand = pd.read_csv('transaction_logs.csv',index_col=0)

    ## start
    # Station-level outflow (T) and inflow (IN) tables. The pivots below use
    # the station columns; spatial clustering is currently disabled.

    H,d = 60,0.5 # H:Timeslot duration in minutes, d = radius for clustering in km
    # temporal clustering (adds start_timeslot / end_timeslot to demand in place)
    temporal_clustering(H,demand)

    # #spatial clustering
    # station_to_cluster_map, stations = spatial_clustering(d,demand)

    # print(station_to_cluster_map)
    # #print(stations['region'])
    # stations.to_csv('station_cc_id_new.csv', index=False)

    # Aggregated demand tables: number of distinct cars per (date, timeslot)
    # row and station column. T counts departures, IN counts arrivals.
    T = pd.pivot_table(demand, values = 'car', index=["start_date","start_timeslot"], columns=["start_station"], aggfunc=lambda x: len(x.unique()))
    IN = pd.pivot_table(demand, values = 'car', index=["end_date","end_timeslot"], columns=["end_station"], aggfunc=lambda x: len(x.unique()))
    IN = IN.fillna(0)
    T = T.fillna(0)

    # missing_clusters = [x for x in range(T.columns[-1]) if x not in T.columns]
    # missing_clusters.sort()

    # for ms in missing_clusters:
    #     T.insert(ms, ms, 0)

    # Ensure that the indices of T and IN have the same length and are aligned.
    # Trimming both ends alone is not enough: a (date, timeslot) that has
    # departures but no arrivals (or the reverse) exists in only one table, and
    # because the CSVs are written without their index the rows would silently
    # stop lining up. Reindex both tables on the union of their indices, with
    # zero demand for the rows that were missing, then trim to the common span.
    # NOTE(review): timeslots with neither departures nor arrivals are still
    # absent from both tables.
    start_idx = max(IN.index[0],T.index[0])
    end_idx = min(IN.index[-1],T.index[-1])
    outflow_names = list(T.index.names)
    inflow_names = list(IN.index.names)
    shared_index = T.index.union(IN.index)

    T = T.reindex(shared_index, fill_value=0).sort_index().loc[start_idx:end_idx]
    T.index = T.index.set_names(outflow_names)
    IN = IN.reindex(shared_index, fill_value=0).sort_index().loc[start_idx:end_idx]
    IN.index = IN.index.set_names(inflow_names)

    print(T)
    # index=False drops the (date, timeslot) index from the file; rows are
    # identified by position only.
    T.to_csv('outflow_station_dem.csv', index=False)

    # missing_clusters = [x for x in range(IN.columns[-1]) if x not in IN.columns]
    # missing_clusters.sort()

    # for ms in missing_clusters:
    #     IN.insert(ms, ms, 0)

    print(IN)
    IN.to_csv('inflow_station_dem.csv', index=False)

    ## stop

    #raw_data = T.to_numpy()



start_station              4    5    6    7    8    9    10   11   12   13   \
start_date start_timeslot                                                     
2021-09-24 0               1.0  1.0  1.0  0.0  0.0  0.0  0.0  2.0  0.0  0.0   
           1               0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0   
           2               0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0   
           3               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0   
           4               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                        ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2021-12-23 15              1.0  3.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0   
           16              0.0  1.0  0.0  0.0  2.0  0.0  1.0  0.0  0.0  1.0   
           17              0.0  1.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0   
           18              0.0  2.0  1.0  2.0  1.0  2.0  0.0  1.0  1.0  0.0   
           19              0.0  0.0  0.0  0.0  0.0  