# Youbike 2.0 Data Preprocessing 


sno(站點代號)、sna(中文場站名稱)、tot(場站總停車格)、sbi(可借車位數)、sarea(中文場站區域)、mday(資料更新時間)、lat(緯度)、lng(經度)、ar(中文地址)、sareaen(英文場站區域)、snaen(英文場站名稱)、aren(英文地址)、bemp(可還空位數)、act(場站是否暫停營運)
地區/
https://data.gov.tw/dataset/137993


In [6]:
import pandas as pd 

# Input CSV locations, relative to the notebook's working directory.
# distpath    -> distance matrix (read into Dmat and passed to get_dmat below)
# coordpath   -> station coordinates table
# sectionpath -> per-section supply table
# stoppath    -> per-stop supply table (reassigned to the _v2 file in a later cell)
distpath = './data/mtx_distance.csv'
coordpath = './data/Ubike_coordinates.csv'
sectionpath = './data/Ubike分區供給.csv'
stoppath = './data/Ubike站點供給.csv'

In [38]:
# Show up to 50 rows, and keep 50 rows visible when a frame is truncated.
pd.set_option("display.max_rows", 50)
pd.set_option("display.min_rows", 50)

### Cleaning column names, dropping 

In [11]:
# Load the raw distance matrix and display its (rows, columns) shape.
Dmat = pd.read_csv(distpath)
Dmat.shape

(82, 82)

In [19]:
# Load the station coordinates table and display it.
coords = pd.read_csv(coordpath)
# The two disabled lines below rename 'geometry' -> 'lng' and 'Unnamed: 10' -> 'lat'
# and then overwrite the CSV in place. Presumably they were already run once, since
# the loaded file shows lng/lat columns -- TODO confirm before deleting them.
# coords.rename(columns = {'geometry': 'lng', 'Unnamed: 10':'lat'}, inplace = True)
# coords.to_csv(coordpath, index = False)
coords

Unnamed: 0,sno,sna,sarea,ar,VILLAGE_ID,VILLAGE,TOWN,V_ID,TOWN_ID,lng,lat
0,500101008,YouBike2.0_新生南路三段52號前,大安區,新生南路三段52號,43,大學里,大安區,63000030-043,63000030,121.53407,25.02112
1,500101009,YouBike2.0_新生南路三段66號前,大安區,新生南路三段66號東側,43,大學里,大安區,63000030-043,63000030,121.53384,25.01976
2,500101010,YouBike2.0_新生南路三段82號前,大安區,新生南路三段82號,43,大學里,大安區,63000030-043,63000030,121.53361,25.01894
3,500101012,YouBike2.0_辛亥路一段30號前,大安區,辛亥路一段30號,43,大學里,大安區,63000030-043,63000030,121.52982,25.01986
4,500101014,YouBike2.0_羅斯福路三段311號前,大安區,羅斯福路三段311號,43,大學里,大安區,63000030-043,63000030,121.53202,25.01717
...,...,...,...,...,...,...,...,...,...,...,...
77,500119005,YouBike2.0_臺大水源舍區A棟,臺大專區,汀洲路三段60巷2弄路側(A舍北側),2,富水里,中正區,63000050-002,63000050,121.53044,25.01493
78,500119006,YouBike2.0_臺大卓越研究大樓,臺大專區,臺大水源舍區C南側,2,富水里,中正區,63000050-002,63000050,121.52917,25.01466
79,500119007,YouBike2.0_臺大水源修齊會館,臺大專區,思源街16號之1旁,2,富水里,中正區,63000050-002,63000050,121.52997,25.01411
80,500119008,YouBike2.0_臺大檔案展示館,臺大專區,臺大檔案展示館東北側,2,富水里,中正區,63000050-002,63000050,121.52895,25.01391


## Preprocessing stop information

In [24]:
# Paths used by the stop-level preprocessing below.
# `sectionpath` repeats the value from the first configuration cell; `stoppath` is
# re-pointed to the _v2 file and replaces the value assigned there.
sectionpath = './data/Ubike分區供給.csv'
stoppath = './data/Ubike站點供給_v2.csv'

In [None]:
# Load the per-stop supply table and replace missing values with 0.
# The gaps are expected on the last stop (very likely an excluded stop); note that
# fillna(0) is applied to the whole frame, not only to that row.
df = pd.read_csv(stoppath).fillna(0)
# Preview goes last so the cell actually renders it, and it shows the filled frame.
df.head()

1. Initial bi: `早九可借車數_pre` - `早九借車人數`.
2. Revised bi: if bi is negative and |bi| exceeds `早九空柱數量_pre`, cap it at -`早九空柱數量_pre`,
            because at most `早九空柱數量_pre` bikes can be delivered to that demand node.

In [45]:
def get_bi(sup_pre, dem, pole_pre):
    """Return the net supply of a stop, capped by its number of empty docks.

    Args:
        sup_pre: bikes available at the stop.
        dem: number of people borrowing a bike.
        pole_pre: number of empty docks at the stop.

    Returns:
        ``sup_pre - dem``, except that a deficit larger than ``pole_pre`` is
        replaced by ``-pole_pre``.
    """
    net_supply = sup_pre - dem
    # A deficit can only be served up to the number of empty docks.
    deficit_exceeds_docks = net_supply < 0 and -net_supply > pole_pre
    return - pole_pre if deficit_exceeds_docks else net_supply
def get_init_bi(sup_pre, dem):
    """Return the uncapped net supply of a stop: available bikes minus borrowers."""
    uncapped_net = sup_pre - dem
    return uncapped_net
    
# df['bi'] = df.apply(lambda x: get_bi(x['早九可借車數_pre'], x['早九借車人數'], x['早九空柱數量_pre']), axis = 1)
# df['init_bi'] = df.apply(lambda x: get_init_bi(x['早九可借車數_pre'], x['早九借車人數']), axis = 1)

Modeling the formulation: https://colab.research.google.com/drive/1aKqZKc3tzzFa0wBtboRlZOYfMkwCe_UQ#scrollTo=shXIEUXdUutK

```
- n_node - 整數，租借站個數（不包含depot node）
- n_vehicle - 整數，卡車數量
- d - 二維list，大小為(n_node + 1) × (n_node + 1)，d[i][j]為node i到node j的距離，node 0到所有node距離為0
- b - 一維list，大小為n_node + 1, b[j]為node j的淨供給，b[0] = -(所有node總和)
- Q - 整數，一輛卡車能承載的腳踏車數
```

In [78]:
import numpy as np
def get_dmat(dmatpath):
    '''Load a square distance matrix from CSV and pad it with a depot node.

    The CSV is read with ``pd.read_csv`` (first line treated as the header).
    The returned array has one extra leading row and column for the depot
    (node 0), whose distance to every node is 0.

    NOTE (original author): to be revised after mapping is out.

    Args:
        dmatpath: path of the distance-matrix CSV.

    Returns:
        numpy.ndarray of shape (N + 1, N + 1), where N is the number of rows
        in the CSV.

    Raises:
        ValueError: if the CSV does not hold a square matrix.
    '''
    dmat = pd.read_csv(dmatpath)
    N = len(dmat)
    # Fail early with a readable message instead of a NumPy broadcast error,
    # e.g. when the CSV was saved together with its index column.
    if dmat.shape[1] != N:
        raise ValueError(
            f'distance matrix in {dmatpath!r} must be square, got shape {dmat.shape}; '
            'check for a stray index column such as "Unnamed: 0"'
        )

    # Row 0 / column 0 stay zero: they represent the depot.
    newd = np.zeros((N+1,N+1))
    newd[1:,1:] = dmat.to_numpy(dtype=float)
    print(f'(raw) n_node: {N} ')
    print(f'(adding depot) n_node: {N+1} ')
    print(f'final distance matrix size: {newd.shape}')
    return newd
    

In [79]:
# Build the depot-padded distance matrix from the raw CSV (get_dmat also prints its sizes).
dm = get_dmat(distpath)

(raw) n_node: 82 
(adding depot) n_node: 83 
final distance matrix size: (83, 83)


### Outputting test cases

In [103]:
import joblib


def generate_testcases(sd_path = stoppath, 
                      distpath = distpath,
                      ismorn = True):
    """Build the rebalancing test cases for one time slot and save them with joblib.

    Reads the per-stop table, computes the capped (`bi`) and uncapped
    (`init_bi`) net supply of every stop, prepends the depot entry, and writes
    one file per (n_vehicle, Q) combination to
    ``./data/testcases/testcase_{morn|even}_k{n_vehicle}_Q{Q}``
    for n_vehicle in 1..5 and Q in 15, 20, 25, 30.

    Args:
        sd_path: path of the per-stop supply/demand CSV.
        distpath: path of the distance-matrix CSV (passed to ``get_dmat``).
        ismorn: True selects the '早九' columns and the 'morn' file tag,
            False selects the '晚五' columns and the 'even' tag.

    Raises:
        ValueError: if the number of stops does not match the distance matrix.
    """
    import os  # local import keeps this cell self-contained

    df = pd.read_csv(sd_path).fillna(0)
    time = '早九' if ismorn else '晚五'
    entime = 'morn' if ismorn else 'even'
    sup_col = f'{time}可借車數_pre'
    dem_col = f'{time}借車人數'
    pole_col = f'{time}空柱數量_pre'
    df['bi'] = df.apply(lambda x: get_bi(x[sup_col], x[dem_col], x[pole_col]), axis = 1)
    df['init_bi'] = df.apply(lambda x: get_init_bi(x[sup_col], x[dem_col]), axis = 1)

    n_node = len(df)
    d = get_dmat(distpath)
    # d must be (n_node + 1) x (n_node + 1) and b must have n_node + 1 entries;
    # a mismatch would silently produce an unusable test case.
    if len(d) != n_node + 1:
        raise ValueError(
            f'stop table has {n_node} stops but the distance matrix covers '
            f'{len(d) - 1} nodes; {sd_path!r} and {distpath!r} are out of sync'
        )

    b = df['bi'].tolist()
    init_b = df['init_bi'].tolist()

    # Depot (node 0) gets the negated total, so all net supplies sum to zero.
    # (The original author left an open question here asking why this is required.)
    b.insert(0, - sum(b))
    init_b.insert(0, - sum(init_b))

    print(f'n_node: {n_node}')
    print(f'd ({len(d)} * {len(d)} list)')
    print(f'b (1 * {len(b)} list)')

    dire = './data/testcases'
    # joblib.dump does not create missing directories.
    os.makedirs(dire, exist_ok=True)
    file_prefix = f'{dire}/testcase'
    base_case = {'n_node':n_node, 'd':d,'b':b, 'init_b': init_b}
    # Sweep the fleet size (n_vehicle) and truck capacity (Q).
    for n_vehicle in range(1, 5+1, 1):
        for Q in range(15, 30+1, 5):
            # Fresh dict per file instead of mutating one shared dict.
            testcase = {**base_case, 'n_vehicle': n_vehicle, 'Q': Q}
            filename = f'{file_prefix}_{entime}_k{n_vehicle}_Q{Q}'
            joblib.dump(testcase, filename)


In [104]:
# Write the test-case files for the morning slot (ismorn=True) and the evening slot (ismorn=False).
generate_testcases(stoppath, distpath, ismorn = True)
generate_testcases(stoppath, distpath, ismorn = False)

(raw) n_node: 82 
(adding depot) n_node: 83 
final distance matrix size: (83, 83)
n_node: 82
d (83 * 83 list)
b (1 * 83 list)
(raw) n_node: 82 
(adding depot) n_node: 83 
final distance matrix size: (83, 83)
n_node: 82
d (83 * 83 list)
b (1 * 83 list)


In [95]:
import joblib
# Sanity-check one generated file. generate_testcases names its files
# testcase_{morn|even}_k{n_vehicle}_Q{Q}, so the time-slot tag must be in the path.
# NOTE: joblib.load unpickles the file -- only load files produced by this notebook.
joblib.load('./data/testcases/testcase_morn_k1_Q15')

{'n_node': 82,
 'd': array([[  0.        ,   0.        ,   0.        , ...,   0.        ,
           0.        ,   0.        ],
        [  0.        ,   0.        , 137.93114224, ..., 812.09666912,
         884.29915753, 709.20589394],
        [  0.        , 137.93114224,   0.        , ..., 684.8313661 ,
         762.46049078, 586.53729634],
        ...,
        [  0.        , 812.09666912, 684.8313661 , ...,   0.        ,
         103.94229168, 114.62983905],
        [  0.        , 884.29915753, 762.46049078, ..., 103.94229168,
           0.        , 175.93180497],
        [  0.        , 709.20589394, 586.53729634, ..., 114.62983905,
         175.93180497,   0.        ]]),
 'b': [-526.0,
  -10.0,
  -12.0,
  -26.0,
  -7.0,
  4.0,
  -1.0,
  4.0,
  10.0,
  18.0,
  4.0,
  20.0,
  2.0,
  11.0,
  23.0,
  2.0,
  11.0,
  0.0,
  26.0,
  9.0,
  11.0,
  4.0,
  9.0,
  -1.0,
  -2.0,
  8.0,
  0.0,
  3.0,
  1.0,
  8.0,
  1.0,
  0.0,
  1.0,
  1.0,
  11.0,
  6.0,
  -5.0,
  -5.0,
  -7.0,
  -9.0,
  0.0,

## Preprocessing section information 

In [28]:
# Load the per-section supply/demand table and preview its first rows.
secdata = pd.read_csv(sectionpath)
secdata.head()

Unnamed: 0,所屬區域,區域編號,早九可借車數,早九空柱數量,早九借車人數,早九還車人數,晚五可借車數,晚五空柱數量,晚五借車人數,晚五還車人數
0,捷運公館站周邊,1,7,37.0,86,67,8.0,37.0,24,106
1,水源校區,2,9,15.0,1,0,4.0,19.0,0,10
2,管院區,3,10,3.0,0,11,10.0,3.0,4,26
3,小小福與共同周邊,4,11,4.8,5,11,8.2,7.6,23,7
4,椰林大道前段及女五周邊,5,4,10.0,9,8,4.0,10.0,6,0
