In [15]:
"""
This file can be used to generate inflow and outflow at BOTH cluster-level (H=60 ,d = 0.5) and station-level (H=10 ,d = 0.01)

cluster parameters:
columns=["start_cluster"] #for cluster demand
H,d = 60,0.5 

station parameters:
columns=["start_station"] #for station demand
H,d = 10,0.01 

"""

'\nThis file can be used to generate inflfow and outflow at BOTH cluster-level (H=60 ,d = 0.5) and station-level (H=10 ,d = 0.01)\n\ncluster parameters:\ncolumns=["start_cluster"] #for station demand\nH,d = 60,0.5 \n\nstation parameters:\ncolumns=["start_station"] #for station demand\nH,d = 10,0.01 \n\n'

In [16]:
import datetime
import cvxpy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from haversine import haversine_vector, Unit
import os

In [17]:
def temporal_clustering(H,demand):
    """Add integer time-slot columns to ``demand`` in place.

    The day is divided into slots of ``H`` minutes. Each trip's
    ``start_time`` / ``end_time`` (strings formatted ``'%H:%M:%S'``) is mapped
    to ``floor((60*hour + minute) / H)``; seconds are ignored.

    Parameters
    ----------
    H : int or float
        Slot duration in minutes.
    demand : pandas.DataFrame
        Must contain ``'start_time'`` and ``'end_time'`` columns. It is
        modified in place: ``'start_timeslot'`` is inserted at column
        position 3 and ``'end_timeslot'`` at column position 6. If a slot
        column already exists (e.g. the cell is re-run with another ``H``),
        it is overwritten where it is instead of raising ``ValueError``.

    Returns
    -------
    None
    """

    def to_timeslot(time_str):
        # Parse the string once and reuse the result for hour and minute.
        t = datetime.datetime.strptime(time_str, '%H:%M:%S').time()
        return int(np.floor((60*t.hour + t.minute)/H))

    # The start column is handled first, so position 6 for the end column is
    # counted in a frame that already contains 'start_timeslot'.
    for position, time_col, slot_col in (
        (3, 'start_time', 'start_timeslot'),
        (6, 'end_time', 'end_timeslot'),
    ):
        slots = demand[time_col].apply(to_timeslot)
        if slot_col in demand.columns:
            # Re-run on an already processed frame: refresh the values.
            demand[slot_col] = slots
        else:
            demand.insert(position, slot_col, slots)

In [18]:
def spatial_clustering(d,demand):
    """Group stations into clusters and tag every trip with its clusters.

    Stations are read from ``stations.csv`` in the current working directory.
    A minimum set of cluster centres is chosen so that every station has at
    least one centre closer than ``d`` km (haversine distance); each station
    is then assigned to its nearest centre.

    Parameters
    ----------
    d : float
        Clustering radius in kilometres.
    demand : pandas.DataFrame
        Must contain ``'start_station'`` and ``'end_station'`` columns whose
        values appear in the index of ``stations.csv``. Modified in place:
        ``'start_cluster'`` and ``'end_cluster'`` columns are added.

    Returns
    -------
    station_to_cluster_map : dict
        Station index value -> cluster id (position of the nearest centre
        among the selected centres).
    stations : pandas.DataFrame
        The stations table with two added columns: ``'cc'`` (1.0 if the
        station is a cluster centre, else 0.0) and ``'cc_id'``.

    Raises
    ------
    RuntimeError
        If the solver returns no solution.
    KeyError
        If a trip references a station that is not in ``stations.csv``.
    """
    stations = pd.read_csv('stations.csv',index_col=0)

    # haversine expects (latitude, longitude) pairs, so the y column is
    # presumably latitude and x longitude -- TODO confirm against stations.csv.
    stn_coords = list(zip(stations.y, stations.x))

    D = haversine_vector(stn_coords, stn_coords, Unit.KILOMETERS, comb=True)
    R = 1*(D < d) # R[i, j] == 1 when stations i and j are less than d km apart

    # ilp to select cluster centers: pick the fewest stations such that every
    # station is within reach of at least one selected station
    x = cp.Variable((len(stations),),boolean=True)
    prob = cp.Problem(cp.Minimize(cp.sum(x)), [x.T@R>=1])
    prob.solve()
    if x.value is None:
        raise RuntimeError(
            'cluster-centre selection failed (solver status: %s)' % prob.status
        )

    # map each station to a cluster
    # Solvers return floats that may be only approximately 0 or 1, so
    # threshold instead of testing for exact equality with 1.
    stations['cc'] = (np.asarray(x.value) > 0.5).astype(float)
    cluster_centers = stations.loc[stations.cc==1]
    cc_coords = list(zip(cluster_centers.y, cluster_centers.x))
    # With comb=True the result has one row per station and one column per
    # centre, so argmin over axis 1 yields the nearest centre's position.
    stations['cc_id'] = np.argmin(haversine_vector(cc_coords, stn_coords, Unit.KILOMETERS, comb=True), axis=1)

    station_to_cluster_map = dict(zip(stations.index, stations.cc_id))

    # spatial clustering
    demand['start_cluster'] = demand['start_station'].apply(lambda x: station_to_cluster_map[x])
    demand['end_cluster'] = demand['end_station'].apply(lambda x: station_to_cluster_map[x])

    return station_to_cluster_map, stations

In [19]:
r""" windows 
cd C:\Work\WORK_PACKAGE\Demand_forecasting\github\blue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation
"""

' windows \ncd C:\\Work\\WORK_PACKAGE\\Demand_forecasting\\github\x08lue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation\n'

In [20]:
# Show the kernel's current working directory; the relative file names used in
# this notebook ('stations.csv', 'transaction_logs.csv') are resolved against it.
os.getcwd()

'/home/optimusprime/Desktop/peeterson/github/DeepAR_demand_prediction/2_freq_nbinom_LSTM/1_cluster_demand_prediction/data/demand_data/standalone'

In [21]:
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path -- adjust before running
    # elsewhere. All relative file names below resolve against this directory.
    os.chdir('/home/optimusprime/Desktop/peeterson/github/DeepAR_demand_prediction/2_freq_nbinom_LSTM/1_cluster_demand_prediction/data/demand_data/standalone')
    demand = pd.read_csv('transaction_logs.csv',index_col=0)

    ## start
    # DATAFRAME T cluster level demand

    H,d = 60,0.5 # H:Timeslot duration in minutes, d = radius for clustering in km
    #temporal clustering (adds start_timeslot / end_timeslot to demand in place)
    temporal_clustering(H,demand)

    #spatial clustering (adds start_cluster / end_cluster to demand in place)
    station_to_cluster_map, stations = spatial_clustering(d,demand)
    print(station_to_cluster_map)
    #print(stations['region'])
    stations.to_csv('station_cc_id_'+str(H)+'_'+str(d)+'.csv', index=False)

    #aggregated demand table
    # T  (outflow): distinct cars leaving each cluster per (date, timeslot)
    # IN (inflow):  distinct cars arriving at each cluster per (date, timeslot)
    T = pd.pivot_table(demand, values = 'car', index=["start_date","start_timeslot"], columns=["start_cluster"], aggfunc=lambda x: len(x.unique()))
    IN = pd.pivot_table(demand, values = 'car', index=["end_date","end_timeslot"], columns=["end_cluster"], aggfunc=lambda x: len(x.unique()))
    IN.fillna(0, inplace=True)
    T.fillna(0, inplace=True)

    # NOTE: clusters without any trip do not get a column in the pivot tables.
    # Inserting zero-filled columns for them was tried and is disabled:
    # missing_clusters = [x for x in range(T.columns[-1]) if x not in T.columns]
    # missing_clusters.sort()
    # for ms in missing_clusters:
    #     T.insert(ms, ms, 0)

    # Ensure that the index of T and IN are of the same length and aligned.
    # Trimming both tables to a common first/last label is not sufficient: a
    # (date, timeslot) that has departures but no arrivals (or vice versa)
    # exists in only one table. Both tables are therefore first put on the
    # union of their row labels, with zero demand for the added rows, and only
    # then trimmed. Slots absent from BOTH tables are still not created.
    start_idx = max(IN.index[0],T.index[0])
    end_idx = min(IN.index[-1],T.index[-1])
    t_index_names = list(T.index.names)
    in_index_names = list(IN.index.names)
    common_idx = T.index.union(IN.index)
    T = T.reindex(common_idx, fill_value=0).sort_index().loc[start_idx:end_idx]
    IN = IN.reindex(common_idx, fill_value=0).sort_index().loc[start_idx:end_idx]
    # The union drops the differing level names; restore each table's own.
    T = T.rename_axis(index=t_index_names)
    IN = IN.rename_axis(index=in_index_names)

    print(T)
    # index=False: the (date, timeslot) labels are not written to the file
    T.to_csv('outflow_cluster_dem.csv', index=False)

    # missing_clusters = [x for x in range(IN.columns[-1]) if x not in IN.columns]
    # missing_clusters.sort()
    # for ms in missing_clusters:
    #     IN.insert(ms, ms, 0)

    print(IN)
    IN.to_csv('inflow_cluster_dem.csv', index=False)

    ## stop

    #raw_data = T.to_numpy()



{34: 2, 20: 0, 298: 1, 140: 1, 329: 1, 285: 2, 272: 2, 286: 3, 203: 4, 294: 5, 328: 8, 195: 5, 46: 6, 119: 7, 355: 8, 77: 8, 93: 10, 192: 9, 35: 10, 348: 9, 163: 10, 310: 11, 202: 11, 318: 12, 245: 12, 206: 13, 352: 14, 190: 11, 178: 15, 129: 16, 166: 16, 345: 16, 370: 17, 29: 171, 18: 18, 331: 19, 218: 19, 251: 19, 56: 20, 249: 21, 149: 22, 154: 23, 326: 23, 210: 24, 191: 23, 32: 25, 239: 24, 388: 27, 278: 25, 306: 25, 186: 26, 170: 27, 94: 30, 139: 28, 141: 31, 377: 29, 116: 30, 209: 34, 376: 45, 151: 30, 385: 33, 132: 32, 78: 31, 366: 32, 150: 31, 320: 33, 9: 33, 155: 34, 389: 35, 288: 34, 58: 34, 323: 36, 162: 39, 43: 36, 330: 36, 325: 37, 118: 38, 110: 39, 15: 39, 341: 37, 307: 40, 84: 40, 304: 38, 138: 41, 296: 42, 147: 43, 126: 44, 228: 48, 263: 54, 113: 45, 243: 46, 81: 48, 74: 47, 257: 48, 368: 54, 82: 49, 5: 50, 98: 51, 47: 51, 17: 52, 37: 55, 168: 53, 41: 45, 40: 54, 16: 105, 28: 55, 26: 108, 30: 54, 344: 56, 73: 57, 64: 58, 181: 58, 128: 59, 378: 60, 281: 60, 271: 62, 275: 