# Youbike 2.0 Data Preprocessing 


Fields in the YouBike 2.0 open dataset: sno(站點代號)、sna(中文場站名稱)、tot(場站總停車格)、sbi(可借車位數)、sarea(中文場站區域)、mday(資料更新時間)、lat(緯度)、lng(經度)、ar(中文地址)、sareaen(英文場站區域)、snaen(英文場站名稱)、aren(英文地址)、bemp(可還空位數)、act(場站是否暫停營運)
地區/
Data source: https://data.gov.tw/dataset/137993


In [205]:
import pandas as pd
# Show up to 50 rows when a frame is echoed, and keep 50 rows even when
# the output is truncated.
for _option in ("display.max_rows", "display.min_rows"):
    pd.set_option(_option, 50)

### Cleaning column names, dropping 

In [206]:
# NOTE(review): `distpath` is only assigned in a later cell (In [313]);
# that cell has to run before this one.
Dmat = pd.read_csv(distpath)
# Echo (rows, columns) of the loaded table as the cell output.
Dmat.shape

(81, 82)

In [265]:
# NOTE(review): `coordpath` is not assigned in any visible cell — define it
# before running this one.
coords = pd.read_csv(coordpath)
# One-off cleanup kept for reference: rename the 'geometry' and
# 'Unnamed: 10' columns to 'lng' and 'lat', then write the file back.
# coords.rename(columns = {'geometry': 'lng', 'Unnamed: 10':'lat'}, inplace = True)
# coords.to_csv(coordpath, index = False)
# coords

## Preprocessing stop information

In [107]:
# CSV read into `secdata` in the "section information" part of the notebook.
sectionpath = './data/Ubike分區供給.csv'
# CSV read into `df` (one row per station); also the default input of
# generate_testcases.
stoppath = './data/Ubike站點供給_v3.csv'

In [283]:
# Load the per-station table and replace every missing value with 0.
# The gaps sit on the last stop, which is very likely an excluded stop.
df = pd.read_csv(stoppath).fillna(0)
# Echo the number of stations as the cell output.
len(df)

81

1. Initial bi: `早九可借車數_pre` - `早九借車人數`.
2. Revised bi: if the initial bi is negative and |bi| exceeds `早九空柱數量_pre`, 
            cap it at -`早九空柱數量_pre`, because at most `早九空柱數量_pre` bikes can be transported to that demand node. 

In [282]:
def get_bi(sup_pre, dem, pole_pre):
    """Return the net supply of a station, capped by its empty docks.

    The raw net supply is ``sup_pre - dem``.  When it is negative and its
    magnitude is larger than ``pole_pre``, ``-pole_pre`` is returned
    instead; in every other case the raw value is returned unchanged.
    """
    net = sup_pre - dem
    # `net < 0` guarantees that -net == abs(net) in the second test.
    shortfall_exceeds_docks = net < 0 and -net > pole_pre
    return -pole_pre if shortfall_exceeds_docks else net
def get_init_bi(sup_pre, dem):
    """Return the uncapped net supply of a station: ``sup_pre - dem``."""
    net = sup_pre - dem
    return net


In [293]:

def make_new_bi(df, 
                idx2stop, 
                ismorn = True,
                n_node = 81):
    """Build the per-node net-supply vectors used by the routing model.

    Parameters
    ----------
    df : pandas.DataFrame
        Station table.  It must contain the column '站位名稱' and, for the
        selected time slot, '<slot>可借車數_pre', '<slot>借車人數' and
        '<slot>空柱數量_pre', where <slot> is '早九' or '晚五'.
        The frame is modified in place: missing values are replaced by 0
        and the columns 'bi' and 'init_bi' are (re)written.
    idx2stop : dict
        Maps a node index to the station name stored in '站位名稱'.
        Every index in 1..n_node must be present.
    ismorn : bool
        True selects the '早九' columns, False the '晚五' columns.
    n_node : int
        Number of stations; only the first ``n_node`` rows of ``df`` are
        used.  Index 0 of the results is the depot.

    Returns
    -------
    (new_bi, new_init_bi) : tuple of list
        Both lists have ``n_node + 1`` entries.  ``new_bi[k]`` is the
        dock-capped net supply of node k and ``new_init_bi[k]`` the
        uncapped one; entry 0 is minus the sum of the other entries.

    Raises
    ------
    KeyError
        If a station named in ``idx2stop`` is not among the first
        ``n_node`` rows of ``df``.
    ValueError
        If some node index in 1..n_node has no entry in ``idx2stop``.
    """
    df.fillna(0, inplace=True)
    time = '早九' if ismorn else '晚五'

    supply = df[f'{time}可借車數_pre']
    demand = df[f'{time}借車人數']
    docks = df[f'{time}空柱數量_pre']

    # Same rule as get_bi / get_init_bi, evaluated on whole columns at once:
    # a shortfall larger than the number of empty docks is capped at -docks.
    init = supply - demand
    over_capacity = (init < 0) & (init.abs() > docks)
    df['bi'] = init.where(~over_capacity, -docks)
    df['init_bi'] = init

    # Pair names and values by row position, so the lookup does not depend
    # on the frame carrying a default 0..n-1 index.
    names = df['站位名稱'].tolist()[:n_node]
    stop2bi = dict(zip(names, zip(df['bi'].tolist(), df['init_bi'].tolist())))

    new_bi = [None] * (n_node + 1)
    new_init_bi = [None] * (n_node + 1)
    for node, stop in idx2stop.items():
        try:
            capped, raw = stop2bi[stop]
        except KeyError:
            raise KeyError(
                f"station {stop!r} (node {node}) is not among the first "
                f"{n_node} rows of df['站位名稱']"
            ) from None
        new_bi[node] = capped
        new_init_bi[node] = raw

    # Fail with a readable message instead of a TypeError inside sum().
    unmapped = [node for node in range(1, n_node + 1) if new_bi[node] is None]
    if unmapped:
        raise ValueError(f'idx2stop has no station for node(s): {unmapped}')

    # The depot (node 0) balances the network.
    new_bi[0] = -sum(new_bi[1:])
    new_init_bi[0] = -sum(new_init_bi[1:])
    return new_bi, new_init_bi 

Modeling the formulation: https://colab.research.google.com/drive/1aKqZKc3tzzFa0wBtboRlZOYfMkwCe_UQ#scrollTo=shXIEUXdUutK

```
- n_node - 整數，租借站個數（不包含depot node）
- n_vehicle - 整數，卡車數量
- d　- 二維list，大小為 (n_node + 1) × (n_node + 1)，d[i][j]為node i到node j的距離，node 0（depot）到所有node的距離為0
- b - 一維list，大小為n_node + 1, b[j]為node j的淨供給，b[0] = -(所有node總和)
- Q - 整數，一輛卡車能承載的腳踏車數
```

In [321]:
import numpy as np
def get_dmat(distpath = './data/mtx_distance.csv' , n_node = 81):
    '''Load the station distance matrix and prepend a zero-distance depot.

    The CSV at ``distpath`` is read with pandas and must yield an
    ``n_node`` x ``n_node`` table.  It is copied into the lower-right
    corner of an ``(n_node + 1)`` x ``(n_node + 1)`` matrix of zeros, so
    row 0 and column 0 (the depot) are all 0.  The padded matrix is
    returned as a nested list of floats.

    TODO: to be revised after mapping is out.
    '''
    raw = pd.read_csv(distpath).to_numpy()
    print(raw.shape)

    side = n_node + 1
    padded = np.zeros((side, side))
    padded[1:, 1:] = raw

    for line in (
        f'(raw) n_node: {n_node} ',
        f'(adding depot) n_node: {n_node+1} ',
        f'final distance matrix size: {padded.shape}',
    ):
        print(line)
    return padded.tolist()
    
    

In [None]:
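# One-off script kept for reference: build the node-index -> station-name
# dict from `dictpath`, patch one station-name typo, and cache the result
# as ./data/idx2stop.pkl (loaded again further down).
# idx2stop = pd.read_csv(dictpath, 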
#                        encoding='cp950', header=None, index_col=0).squeeze("columns").to_dict()

# idx2stop = idx2stop[2]
# from math import isnan
# idx2stop = {int(k):v for k, v in idx2stop.items() if not isnan(k)}
# idx2stop[69] = '捷運科技大樓站(台北教育大學)' # typo 
# joblib.dump(idx2stop, './data/idx2stop.pkl')

In [313]:
# Distance-matrix CSV; also the default input of get_dmat / generate_testcases.
distpath = './data/mtx_distance.csv' 
# Station list CSV; only referenced by the commented-out idx2stop builder.
dictpath = './data/station.csv'
# NOTE(review): this DataFrame is not used by any later visible cell;
# get_dmat re-reads the file itself.
dmat = pd.read_csv(distpath)

In [294]:
import joblib
# Cached node-index -> station-name dict (written by the commented-out
# builder cell).  joblib.load unpickles the file, so only load trusted files.
idx2stop = joblib.load('./data/idx2stop.pkl')
# Morning net-supply vectors (ismorn defaults to True); this call also
# adds the 'bi' and 'init_bi' columns to `df`.
b, ib = make_new_bi(df, idx2stop)    

### outputting

In [324]:
import joblib


def generate_testcases(sd_path = stoppath, 
                      distpath = distpath,
                      ismorn = True):
    """Write one test-case file per (n_vehicle, Q) combination.

    Parameters
    ----------
    sd_path : str
        CSV with one row per station; its row count becomes ``n_node``.
    distpath : str
        CSV holding the station distance matrix, passed to ``get_dmat``.
    ismorn : bool
        True builds the morning ('morn') cases, False the evening
        ('even') cases.  The flag selects both the data columns used by
        ``make_new_bi`` and the tag in the output file names.

    Each file is a joblib dump of a dict with the keys 'n_node', 'd',
    'b', 'init_b', 'n_vehicle' and 'Q'.  Files are written to
    ./data/testcases/testcase_<morn|even>_k<n_vehicle>_Q<Q> for
    n_vehicle in 1..5 and Q in 15, 20, 25, 30.  Relies on the
    module-level ``idx2stop`` mapping.
    """
    import os

    df = pd.read_csv(sd_path)
    n_node = len(df)
    # Size the matrix and the supply vectors from the table actually loaded
    # rather than from the helpers' hard-coded defaults.
    d = get_dmat(distpath, n_node = n_node)
    # Forward `ismorn`: without it the 'even' files would silently carry
    # the morning supply/demand figures.
    b, init_b = make_new_bi(df, idx2stop, ismorn = ismorn, n_node = n_node)
    entime = 'morn' if ismorn else 'even'
    print(f'n_node: {n_node}')
    print(f'd ({len(d)} * {len(d)} list)')
    print(f'b (1 * {len(b)} list)')
    
    dire = './data/testcases'
    # joblib.dump does not create missing directories.
    os.makedirs(dire, exist_ok = True)
    file_prefix = f'{dire}/testcase'
    shared = {'n_node':n_node, 'd':d,'b':b, 'init_b': init_b} 
    # generate n_vehicle and Q 
    for n_vehicle in range(1, 5+1, 1):
        for Q in range(15, 30+1, 5):
            # Fresh dict per file so no state leaks between iterations.
            testcase = {**shared, 'n_vehicle': n_vehicle, 'Q': Q}
            filename = f'{file_prefix}_{entime}_k{n_vehicle}_Q{Q}'
            joblib.dump(testcase, filename)
            # print(f'saving to {filename}')
            # print('--------------')


In [325]:
# Write the 'morn' and then the 'even' test-case files
# (5 vehicle counts x 4 capacities each).
generate_testcases(stoppath, distpath, ismorn = True)
generate_testcases(stoppath, distpath, ismorn = False)

(81, 81)
(raw) n_node: 81 
(adding depot) n_node: 82 
final distance matrix size: (82, 82)
n_node: 81
d (82 * 82 list)
b (1 * 82 list)
(81, 81)
(raw) n_node: 81 
(adding depot) n_node: 82 
final distance matrix size: (82, 82)
n_node: 81
d (82 * 82 list)
b (1 * 82 list)


In [331]:
import joblib
# Sanity check: reload one generated test case and echo its dimensions.
prefix = './data/testcases'
testcase = joblib.load(f'{prefix}/testcase_even_k5_Q30')
n_node = testcase['n_node']
d = testcase['d']
b = testcase['b']
Q = testcase['Q']
n_vehicle = testcase['n_vehicle']
# print(testcase)
# `d` is stored as a nested list (get_dmat returns `.tolist()`), and a list
# has no `.shape`; derive the dimensions from the list lengths instead.
print((len(d), len(d[0])))
print(len(b))
print(n_node) 
print(Q)
print(n_vehicle)

(82, 82)
82
81
30
5


## Preprocessing section information 

In [28]:
# Load the per-section table and preview its first rows as the cell output.
secdata = pd.read_csv(sectionpath)
secdata.head()

Unnamed: 0,所屬區域,區域編號,早九可借車數,早九空柱數量,早九借車人數,早九還車人數,晚五可借車數,晚五空柱數量,晚五借車人數,晚五還車人數
0,捷運公館站周邊,1,7,37.0,86,67,8.0,37.0,24,106
1,水源校區,2,9,15.0,1,0,4.0,19.0,0,10
2,管院區,3,10,3.0,0,11,10.0,3.0,4,26
3,小小福與共同周邊,4,11,4.8,5,11,8.2,7.6,23,7
4,椰林大道前段及女五周邊,5,4,10.0,9,8,4.0,10.0,6,0
