In [20]:
import datetime

import cvxpy as cp

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt

from haversine import haversine_vector, Unit

In [21]:
def temporal_clustering(H,demand):
    """Add ``start_timeslot`` / ``end_timeslot`` columns to ``demand`` in place.

    Each ``'%H:%M:%S'`` string in ``demand['start_time']`` and
    ``demand['end_time']`` is parsed, and its hour is floored to a multiple
    of ``H``; the result is stored as a ``datetime.time`` (minutes and
    seconds are zero).

    Parameters
    ----------
    H : number
        Timeslot width in hours.
    demand : pandas.DataFrame
        Must contain ``start_time`` and ``end_time`` string columns. It is
        modified in place: ``start_timeslot`` is inserted at column position
        3 and ``end_timeslot`` at column position 6.

    Returns
    -------
    None

    Notes
    -----
    Calling the function again on the same frame recomputes the two columns
    instead of raising ``ValueError`` ("already exists") from
    ``DataFrame.insert``, so the notebook cell can be re-run safely.
    """

    def str_to_dt(dt_str):
        # Parse a wall-clock string into a datetime.time.
        return datetime.datetime.strptime(dt_str, '%H:%M:%S').time()

    def timeslot(t,H):
        # Floor the hour to the start of its H-hour slot.
        return datetime.time(hour=int(H*np.floor(t.hour/H)))

    # Remove columns left over from a previous run so the inserts below land
    # at the same positions as on a first run.
    stale = [col for col in ('start_timeslot', 'end_timeslot') if col in demand.columns]
    if stale:
        demand.drop(columns=stale, inplace=True)

    # temporal clustering
    demand.insert(3,'start_timeslot', demand['start_time'].apply(lambda s: timeslot(str_to_dt(s),H)))
    demand.insert(6,'end_timeslot', demand['end_time'].apply(lambda s: timeslot(str_to_dt(s),H)))

In [22]:
def spatial_clustering(d,demand):
    """Group stations into clusters of radius ``d`` and tag ``demand`` with them.

    Stations are read from ``stations.csv`` (first column used as the index).
    A minimum set of cluster centres is chosen with an integer program so
    that every station has at least one centre strictly closer than ``d``
    kilometres; each station is then assigned to its nearest centre.

    Parameters
    ----------
    d : float
        Clustering radius in kilometres.
    demand : pandas.DataFrame
        Must contain ``start_station`` and ``end_station`` columns whose
        values appear in the index of ``stations.csv``. Modified in place:
        ``start_cluster`` and ``end_cluster`` columns are added.

    Returns
    -------
    (dict, pandas.DataFrame)
        Mapping from station index to cluster id, and the stations frame
        with added ``cc`` (1 for a cluster centre, else 0) and ``cc_id``
        (assigned cluster id) columns.

    Raises
    ------
    RuntimeError
        If the solver returns no solution.
    KeyError
        If a station in ``demand`` is not present in ``stations.csv``.
    """
    stations = pd.read_csv('stations.csv',index_col=0)

    # NOTE(review): haversine expects (latitude, longitude) pairs; this
    # presumes stations.x is latitude and stations.y is longitude - confirm.
    stn_coords = list(zip(stations.x, stations.y))

    # Pairwise station distances and the "within d km" coverage matrix.
    D = haversine_vector(stn_coords, stn_coords, Unit.KILOMETERS, comb=True)
    R = 1*(D < d)

    # ilp to select cluster centers: minimise the number of centres subject
    # to every station being covered by at least one selected centre.
    is_center = cp.Variable((len(stations),),boolean=True)
    prob = cp.Problem(cp.Minimize(cp.sum(is_center)), [is_center.T@R>=1])
    prob.solve()
    if is_center.value is None:
        raise RuntimeError(
            'cluster-centre selection failed (solver status: %s)' % prob.status
        )

    # map each station to a cluster
    # Solvers may return boolean variables as near-integers (e.g. 0.9999999),
    # so round before the equality test below.
    stations['cc'] = np.rint(is_center.value)
    cluster_centers = stations.loc[stations.cc==1]
    cc_coords = list(zip(cluster_centers.x, cluster_centers.y))
    # With comb=True the result has one row per station and one column per
    # centre, so argmin over axis 1 gives each station's nearest centre.
    stations['cc_id'] = np.argmin(haversine_vector(cc_coords, stn_coords, Unit.KILOMETERS, comb=True), axis=1)

    station_to_cluster_map = dict(zip(stations.index, stations.cc_id))

    # spatial clustering
    demand['start_cluster'] = demand['start_station'].apply(lambda stn: station_to_cluster_map[stn])
    demand['end_cluster'] = demand['end_station'].apply(lambda stn: station_to_cluster_map[stn])

    return station_to_cluster_map, stations

In [23]:
# Change the kernel's working directory (presumably to the folder holding the input CSVs).
# NOTE(review): hard-coded absolute Windows path mixing '\' and '/'. The recorded
# output reports WinError 3 (path not found) and shows the kernel staying in its
# existing directory - confirm the path or replace it with a relative one.
cd C:\Work\WORK_PACKAGE\Demand_forecasting\github\blue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation

[WinError 3] The system cannot find the path specified: 'C:\\Work\\WORK_PACKAGE\\Demand_forecasting\\github\\blue-sg/Demand_forecasting/BLUESG_Demand_data/Data-preprocessing_data_generation'
c:\Work\WORK_PACKAGE\Demand_forecasting\BLUESG_Demand_data\Data-preprocessing_data_generation


In [24]:
# List the contents of the current working directory.
ls

 Volume in drive C is Windows 
 Volume Serial Number is A2F9-40D5

 Directory of c:\Work\WORK_PACKAGE\Demand_forecasting\BLUESG_Demand_data\Data-preprocessing_data_generation

30/01/2023  06:23 pm    <DIR>          .
30/01/2023  06:23 pm    <DIR>          ..
29/06/2022  02:46 pm            16,540 Clstr_inflow_outflow_data_generator_new.ipynb
08/04/2022  02:31 pm        51,077,895 demand.csv
29/06/2022  02:37 pm         1,781,831 inflow_clstr_dem.csv
29/06/2022  02:50 pm           892,411 outflow_clstr_dem.csv
17/04/2022  08:54 pm           120,634 stations.csv
               5 File(s)     53,889,311 bytes
               2 Dir(s)  124,453,871,616 bytes free


In [25]:
def cluster_flow_table(demand, date_col, slot_col, cluster_col, n_clusters):
    """Count distinct cars per (date, timeslot) row and cluster column.

    Parameters
    ----------
    demand : pandas.DataFrame
        Must contain ``car`` plus the three columns named below.
    date_col, slot_col : str
        Columns forming the row index of the result.
    cluster_col : str
        Column whose integer cluster ids become the result's columns.
    n_clusters : int
        Total number of clusters; the result gets a column for every id in
        ``range(n_clusters)``, zero-filled where no trip was observed.

    Returns
    -------
    pandas.DataFrame
        One row per observed (date, timeslot) pair, columns in ascending
        cluster-id order, missing combinations filled with 0.
    """
    table = pd.pivot_table(
        demand,
        values='car',
        index=[date_col, slot_col],
        columns=[cluster_col],
        aggfunc=lambda cars: len(cars.unique()),
    ).fillna(0)

    # Pivot columns are sorted, and ids are visited in ascending order, so
    # when an id is missing exactly `cluster_id` columns precede it and
    # inserting at that position keeps the columns sorted.
    for cluster_id in range(n_clusters):
        if cluster_id not in table.columns:
            table.insert(cluster_id, cluster_id, 0)
    return table


if __name__ == "__main__":
    # NOTE(review): the directory listing recorded in this notebook shows
    # demand.csv but no transaction_logs.csv - confirm the input file name.
    demand = pd.read_csv('transaction_logs.csv',index_col=0)

    ## start
    # DATAFRAME T cluster level demand

    H,d = 1,0.25 # H: timeslot duration in hours, d: radius for clustering in km
    #temporal clustering
    temporal_clustering(H,demand)

    #spatial clustering
    station_to_cluster_map, stations = spatial_clustering(d,demand)

    print(station_to_cluster_map)
    # NOTE(review): index=False omits the stations index (the station key
    # used in station_to_cluster_map) from the file - confirm this is intended.
    stations.to_csv('station_cc_id_new.csv', index=False)

    # Pad both tables to the full cluster count. Padding only up to the
    # largest cluster id observed in each table could leave outflow and
    # inflow with different widths and drop trailing clusters with no trips.
    n_clusters = int(stations['cc_id'].max()) + 1

    #aggregated demand table
    T = cluster_flow_table(demand, "start_date", "start_timeslot", "start_cluster", n_clusters)   # outflow
    IN = cluster_flow_table(demand, "end_date", "end_timeslot", "end_cluster", n_clusters)        # inflow

    print(T)
    # NOTE(review): index=False drops the (date, timeslot) row labels, and
    # slots with no trips at all have no row - confirm downstream users
    # do not need them.
    T.to_csv('outflow_clstr_dem_new.csv', index=False)

    print(IN)
    IN.to_csv('inflow_clstr_dem_new.csv', index=False)

    ## stop



{34: 1, 20: 0, 298: 0, 140: 0, 329: 0, 285: 1, 272: 2, 286: 2, 203: 3, 294: 3, 328: 7, 195: 4, 46: 5, 119: 6, 355: 7, 77: 7, 93: 8, 192: 9, 35: 10, 348: 9, 163: 11, 310: 12, 202: 12, 318: 13, 245: 13, 206: 14, 352: 9, 190: 12, 178: 10, 129: 15, 166: 16, 345: 16, 370: 17, 29: 18, 18: 19, 331: 20, 218: 18, 251: 20, 56: 21, 249: 22, 149: 22, 154: 23, 326: 23, 210: 24, 191: 24, 32: 26, 239: 24, 388: 25, 278: 26, 306: 26, 186: 23, 170: 25, 94: 27, 139: 33, 141: 28, 377: 29, 116: 30, 209: 32, 376: 31, 151: 30, 385: 28, 132: 32, 78: 28, 366: 32, 150: 27, 320: 33, 9: 33, 155: 35, 389: 34, 288: 35, 58: 35, 323: 36, 162: 37, 43: 38, 330: 38, 325: 40, 118: 39, 110: 37, 15: 40, 341: 38, 307: 36, 84: 38, 304: 37, 138: 41, 296: 42, 147: 43, 126: 42, 228: 44, 263: 45, 113: 46, 243: 49, 81: 47, 74: 48, 257: 52, 368: 49, 82: 44, 5: 50, 98: 51, 47: 51, 17: 47, 37: 52, 168: 53, 41: 52, 40: 49, 16: 45, 28: 54, 26: 55, 30: 49, 344: 56, 73: 57, 64: 58, 181: 59, 128: 60, 378: 61, 281: 59, 271: 63, 275: 62, 2