# PageRank

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx

from collections import defaultdict as dd

from pyvis.network import Network
from datetime import datetime
from tqdm import tqdm

In [4]:
# Empty frame that fixes the column layout of the rental records before any
# data is loaded.
body = pd.DataFrame(
    columns=[
        '자전거번호',
        '대여일시',
        '대여 대여소번호',
        '대여 대여소명',
        '대여거치대',
        '반납일시',
        '반납대여소번호',
        '반납대여소명',
        '반납거치대',
        '이용시간',
        '이용거리',
    ]
)

In [5]:
# Load the rental dumps for months 7 and 9 of 2019 (parts 1-3 of each month)
# and stack them under `body`.
# NOTE(review): the paths end in ".csv" but are read with `pd.read_pickle`;
# presumably the files under ../dataset/temp are pickled frames - confirm.
# `pd.read_pickle` must only be used on files from a trusted source.
loaded_frames = []
for month in [7,9]:
    for version in range(1, 4):
        try:
            file_path = f"../dataset/temp/서울특별시 공공자전거 대여정보_20190{month}_{version}.csv"
            # The first row of every file is skipped.
            dataset = pd.read_pickle(file_path)[1:]
            loaded_frames.append(dataset)
        except Exception as e:
            # Best effort: report the problem and carry on with the other files.
            print(e)

# `DataFrame.append` was removed in pandas 2.0 and copied the accumulated frame
# on every call; concatenating once gives the same rows with a fresh 0..n-1 index.
body = pd.concat([body, *loaded_frames], ignore_index=True)

In [48]:
# Show the combined rental frame (the notebook renders a truncated preview).
body

Unnamed: 0,자전거번호,대여일시,대여 대여소번호,대여 대여소명,대여거치대,반납일시,반납대여소번호,반납대여소명,반납거치대,이용시간,이용거리
0,SPB-21789,2019-07-02 08:35:17,00646,장한평역 1번출구 (국민은행앞),4,2019-07-02 08:42:31,00003,중랑센터,2,6,1350.00
1,SPB-22363,2019-07-03 08:35:06,00646,장한평역 1번출구 (국민은행앞),2,2019-07-03 08:42:28,00003,중랑센터,2,7,1160.00
2,SPB-24965,2019-07-03 08:46:07,00646,장한평역 1번출구 (국민은행앞),2,2019-07-03 09:09:44,00003,중랑센터,14,23,1270.00
3,SPB-21930,2019-07-04 08:49:57,00646,장한평역 1번출구 (국민은행앞),4,2019-07-04 09:04:17,00003,중랑센터,14,14,1290.00
4,SPB-17709,2019-07-05 08:45:03,00529,장한평역 8번 출구 앞,12,2019-07-05 09:11:47,00003,중랑센터,14,26,1200.00
...,...,...,...,...,...,...,...,...,...,...,...
4090195,SPB-24072,2019-09-12 08:56:34,00240,문래역 4번출구 앞,9,2019-09-12 09:03:37,99999,영남단말기정비,2,6,720.00
4090196,SPB-16130,2019-09-18 10:13:09,99999,영남단말기정비,1,2019-09-18 11:38:30,99999,영남단말기정비,1,85,40.00
4090197,SPB-03728,2019-09-25 08:00:28,02183,동방1교,7,2019-09-25 08:54:02,99999,영남단말기정비,5,53,12910.00
4090198,SPB-08928,2019-09-30 07:49:27,02183,동방1교,10,2019-09-30 09:42:27,99999,영남단말기정비,7,2,0.00


In [227]:
# Map every station number (as int) to its name (as str), using both the
# rental side and the return side of each record. Later rows overwrite earlier
# ones, and within one row the return station is written after the rental
# station.
# The four columns are walked in lockstep instead of calling `body.loc[index]`
# per row: that lookup materialised a Series for every record and only worked
# while `body` had a 0..n-1 index.
station_dict = {}
station_columns = zip(
    body['대여 대여소번호'],
    body['대여 대여소명'],
    body['반납대여소번호'],
    body['반납대여소명'],
)
for rent_no, rent_name, return_no, return_name in tqdm(station_columns, total=len(body)):
    # Convert both keys before writing so a bad row leaves the dict untouched.
    key_1, key_2 = int(rent_no), int(return_no)
    station_dict[key_1] = str(rent_name)
    station_dict[key_2] = str(return_name)

100%|██████████| 4090200/4090200 [05:57<00:00, 11430.64it/s]


In [231]:
# Nested counter (defaultdict of dicts): inner dicts are created on first access.
pregraph = dd(dict)

def pregraph_gen(graph, df):
    """Accumulate trip counts between stations into ``graph``.

    For every row of ``df`` the value in positional column 2 is read as the
    origin station and the value in positional column 6 as the destination
    station (both converted with ``int``). ``graph[origin][destination]`` is
    then increased by one, starting at 1 the first time a pair is seen.

    ``graph`` is mutated in place and also returned. ``graph[origin]`` must
    exist or be created on access (for example a ``defaultdict(dict)``).
    """
    n_rows = len(df)
    for row in tqdm(range(n_rows)):
        origin = int(df.iloc[row, 2])
        destination = int(df.iloc[row, 6])
        counts = graph[origin]
        counts[destination] = counts.get(destination, 0) + 1

    return graph

# Count the trips for every (rental station, return station) pair in `body`.
pregraph = pregraph_gen(pregraph, body)

# Turn the nested counts into a directed graph; each edge stores the number of
# trips from the rental station to the return station as its `weight`.
G = nx.DiGraph()
for from_key, current in pregraph.items():
    for to_key, weight in current.items():
        G.add_edge(from_key, to_key, weight=weight)

100%|██████████| 4090200/4090200 [02:50<00:00, 23920.07it/s]


In [235]:
import pickle
# Write the station-name lookup and the trip graph to pickle files in the
# current working directory.
with open('station_dict.pkl', 'wb') as f:
    pickle.dump(station_dict, f)
    
with open('station_graph.pkl', 'wb') as f:
    pickle.dump(G, f)

In [7]:
import pickle

# Restore the station-name lookup and the trip graph from the pickle files.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
with open('station_dict.pkl', 'rb') as f:
    station_dict = pickle.load(f)
    
with open('station_graph.pkl', 'rb') as f:
    G = pickle.load(f)  

In [260]:
# Build a pyvis network from the networkx graph G and write it to ex.html.
# NOTE(review): these statements repeat the body of graph_plot(); calling
# graph_plot(G) would avoid the duplication.
g = Network(height=800, width=800, notebook = True, bgcolor="#222222", font_color="white")
g.toggle_hide_edges_on_drag(True)
g.barnes_hut()
g.from_nx(G)  # copy nodes and edges over from the networkx graph
g.show_buttons(filter_=['physics'])
g.show("ex.html")

In [19]:
def graph_plot(graph):
    """Render ``graph`` with pyvis and write the result to ``ex.html``.

    A notebook-mode ``Network`` of 800x800 with a dark background and white
    font is created, ``graph`` is imported through ``from_nx``, the physics
    control buttons are enabled and the network is shown as ``ex.html``.
    Nothing is returned.
    """
    canvas_options = {
        "height": 800,
        "width": 800,
        "notebook": True,
        "bgcolor": "#222222",
        "font_color": "white",
    }
    net = Network(**canvas_options)
    net.toggle_hide_edges_on_drag(True)
    net.barnes_hut()
    net.from_nx(graph)
    net.show_buttons(filter_=['physics'])
    net.show("ex.html")
    
def pageranking(G, alpha = 0.8):
    """Rank the nodes of ``G`` by PageRank score, highest first.

    Parameters
    ----------
    G : graph
        Graph passed straight to ``nx.pagerank``.
    alpha : float, default 0.8
        Damping parameter forwarded to ``nx.pagerank``. It was previously
        ignored in favour of a hard-coded 0.8, so callers passing another
        value silently got the default.

    Returns
    -------
    list of (node, score) tuples
        Sorted by descending score.
    """
    scores = nx.pagerank(G, alpha=alpha)
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [20]:
# Rank every node of the trip graph with the default alpha of pageranking().
pagerank_result = pageranking(G)

In [21]:
# Inspect the ranking: (station number, PageRank score) pairs, highest score first.
pagerank_result

[(502, 0.004353479411194414),
 (207, 0.003949212679946918),
 (2219, 0.003144951437294926),
 (2102, 0.0031012213177459193),
 (113, 0.002990841278043775),
 (907, 0.0028611530681718895),
 (152, 0.002697267473574053),
 (1210, 0.002547390815464556),
 (2701, 0.002502915155400228),
 (1308, 0.002449802893851744),
 (3533, 0.0022825863282344306),
 (1906, 0.0022190885084973117),
 (2177, 0.0022052718338474594),
 (2183, 0.0020973116092620207),
 (565, 0.0020518580085956044),
 (1911, 0.001967675211205042),
 (1160, 0.0019485207831146667),
 (1222, 0.0019084162130191312),
 (1503, 0.0019000602242477882),
 (1608, 0.0018852522698892182),
 (1158, 0.0018647831087418088),
 (602, 0.0018611018312558068),
 (210, 0.0018502333433418482),
 (1153, 0.001822296425312075),
 (272, 0.0017978053842574247),
 (2002, 0.001750857472364907),
 (247, 0.0017398156433849523),
 (2173, 0.0017323564228738423),
 (1044, 0.0017261824402481636),
 (583, 0.001703714375543472),
 (905, 0.001675970072041644),
 (1009, 0.0016634415266339247),
 

In [22]:
# Lookup table: station number -> PageRank score.
pagerank_dict = dict(pagerank_result)

In [32]:
# Compare the PageRank-based estimate with the observed rentals on 2020-03-01
# for the (up to) 20 best-ranked stations.
# NOTE: `twenty` and `all_` are defined by the 2020-03 rental-history cells
# further down in this notebook; those must have been run first.
# The date filter does not depend on the station, so it is applied once here
# instead of rescanning the whole frame for every station.
rentals_on_day = twenty[twenty["대여일시"]=="2020-03-01"]
# Slicing (instead of indexing range(0, 20)) avoids an IndexError when fewer
# than 20 stations were ranked.
for station, _score in pagerank_result[:20]:
    key = int(station)
    print(f'Key {key}: {station_dict[key]} | Pagerank: {pagerank_dict[key]}')

    predicted = all_ * pagerank_dict[key]
    actual = len(rentals_on_day[rentals_on_day["대여 대여소번호"] == key])
    print(f'Predicted: {predicted} Actual: {actual}')
    
    
    
    
    

Key 502: 뚝섬유원지역 1번출구 앞 | Pagerank: 0.004353479411194414
Predicted: 102.58538884538517 Actual: 151
Key 207: 여의나루역 1번출구 앞 | Pagerank: 0.003949212679946918
Predicted: 93.05924759026918 Actual: 130
Key 2219: 고속터미널역 8-1번, 8-2번 출구 사이 | Pagerank: 0.003144951437294926
Predicted: 74.10763566841763 Actual: 75
Key 2102: 봉림교 교통섬 | Pagerank: 0.0031012213177459193
Predicted: 73.07717913136484 Actual: 95
Key 113: 홍대입구역 2번출구 앞 | Pagerank: 0.002990841278043775
Predicted: 70.47618387582351 Actual: 56
Key 907: CJ 드림시티 | Pagerank: 0.0028611530681718895
Predicted: 67.4202108984024 Actual: 98
Key 152: 마포구민체육센터 앞 | Pagerank: 0.002697267473574053
Predicted: 63.558410747298986 Actual: 95
Key 1210: 롯데월드타워(잠실역2번출구 쪽) | Pagerank: 0.002547390815464556
Predicted: 60.026717175606805 Actual: 75
Key 2701: 마곡나루역 5번출구 뒤편 | Pagerank: 0.002502915155400228
Predicted: 58.97869272185097 Actual: 58
Key 1308: 안암로터리 버스정류장 앞 | Pagerank: 0.002449802893851744
Predicted: 57.727155390722494 Actual: 57
Key 3533: 건대입구역 사거리(롯데백화점) | Pa

In [8]:
# Rental history file for 2020.03, decoded as EUC-KR.
twenty = pd.read_csv("../dataset/rent_record/서울특별시 공공자전거 대여이력 정보_2020.03.csv", encoding='euc-kr')
# Reduce the rental timestamp to its calendar date and store it as a string,
# so it can be compared with date literals such as '2020-03-01'.
twenty['대여일시'] = pd.to_datetime(twenty['대여일시']).dt.date.astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# Total number of rental records dated 2020-03-01.
all_ = len(twenty[(twenty['대여일시'] == '2020-03-01')])
all_

In [17]:
# Number of rental records dated 2020-03-01 whose rental station number is 2102.
len(twenty[(twenty['대여일시']=='2020-03-01') & (twenty['대여 대여소번호'] == 2102)])

95

In [23]:
# Estimate for station 502: the day's total rentals scaled by its PageRank score.
all_ * pagerank_dict[502]

102.58538884538517

In [241]:
def ratio(df, date, rent_from):
    """Return the share of rentals on ``date`` that belong to station ``rent_from``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns '대여일시' (compared with ``date``) and
        '대여 대여소번호' (compared with ``rent_from``).
    date : value compared for equality with the '대여일시' column.
    rent_from : value compared for equality with the '대여 대여소번호' column.

    Returns
    -------
    float
        Matching rows for the station divided by all rows for the date.
        Returns 0.0 when the date has no rows at all; this used to raise
        ZeroDivisionError.
    """
    # Filter by date once; both counts are taken from this subset.
    on_date = df[df['대여일시'] == date]
    all_ = len(on_date)
    if all_ == 0:
        return 0.0
    partial = len(on_date[on_date['대여 대여소번호'] == rent_from])

    return partial / all_
    
# Observed share of the 2020-03-01 rentals that belong to station 502.
print(ratio(twenty, '2020-03-01', 502))

0.006408080122220336


In [243]:
def real_value(df, date, rate):
    """Scale the number of rows dated ``date`` by ``rate``.

    Counts the rows of ``df`` whose '대여일시' equals ``date``, multiplies that
    count by ``rate`` and converts the product with ``int`` (which truncates
    toward zero).
    """
    same_day = df['대여일시'] == date
    total_rentals = len(df[same_day])
    scaled = total_rentals * rate
    return int(scaled)

# Scaling the day's total by station 502's observed share gives back its rental count (truncated by int()).
real_value(twenty, '2020-03-01', ratio(twenty, '2020-03-01', 502))

151

In [272]:
def partial_real(df, date):
    """Pair observed rental counts with PageRank-based estimates per station.

    Every distinct value of the '대여 대여소번호' column that is also a key of
    the module-level ``pagerank_dict`` contributes one entry to each list:

    * ``partials`` - number of rows dated ``date`` for that station;
    * ``reals`` - ``int(total rows dated date * pagerank_dict[station])``.

    Stations missing from ``pagerank_dict`` are skipped. Both lists follow the
    iteration order of ``set(df['대여 대여소번호'])``.

    The date filter and the per-station counts are computed once up front.
    Previously the whole frame was rescanned twice for every station, and the
    local total ``all_`` was calculated but never used.

    Returns
    -------
    (partials, reals) : tuple of two lists of int, equal in length.
    """
    station = list(set(df['대여 대여소번호']))
    on_date = df[df['대여일시'] == date]
    all_ = len(on_date)
    # A plain dict keeps the lookup purely by value; Series.get on an integer
    # key can fall back to positional access in some pandas versions.
    counts_by_station = on_date['대여 대여소번호'].value_counts().to_dict()

    partials = []
    reals = []

    for rent_from in station:
        if rent_from not in pagerank_dict:
            continue
        rate = pagerank_dict[rent_from]
        partial = int(counts_by_station.get(rent_from, 0))
        real = int(all_ * rate)

        partials.append(partial)
        reals.append(real)

    return partials, reals

In [273]:
# part: observed per-station counts on 2020-03-01; real: the PageRank-scaled values for the same stations.
part, real = partial_real(twenty, '2020-03-01')

In [277]:
# List every station whose observed count differs from its PageRank-scaled
# value, printed as "<observed> <scaled>".
for i, observed in enumerate(part):
    scaled = real[i]
    if observed != scaled:
        print(observed, scaled)

0 3
0 3
12 10
47 29
32 22
17 19
18 11
60 35
17 29
19 18
30 20
17 10
32 17
18 17
56 70
32 29
23 15
32 20
62 30
16 15
13 16
5 8
8 12
33 22
23 14
12 15
14 6
23 21
8 15
17 20
36 25
33 23
12 10
7 9
6 4
7 6
8 9
5 9
14 7
5 8
37 30
11 15
19 26
14 25
42 18
20 16
32 22
95 63
22 18
40 25
18 15
13 17
5 9
3 5
21 7
8 6
23 6
4 7
9 7
31 19
6 4
5 8
17 14
10 9
32 24
24 26
12 9
16 12
13 15
21 10
3 5
42 24
38 27
28 19
16 14
65 21
17 14
8 12
8 9
5 7
9 6
6 8
16 15
30 32
11 18
10 14
12 14
24 17
130 93
7 10
12 24
24 13
20 15
4 11
10 14
0 17
6 9
9 12
21 10
15 17
14 26
7 15
8 10
9 11
39 22
43 29
25 18
35 34
46 29
9 5
11 12
9 11
20 14
20 16
14 19
15 13
21 20
14 15
23 17
19 15
19 20
77 40
29 26
39 21
7 8
23 11
14 17
14 16
3 13
13 16
14 13
35 29
19 12
30 24
11 10
32 25
31 21
9 12
12 10
10 9
20 15
29 16
4 8
5 6
72 42
39 34
33 23
22 15
22 14
34 17
26 18
16 15
7 11
5 7
49 21
54 29
4 9
9 16
20 18
9 8
9 12
14 13
14 16
10 8
26 21
6 12
11 10
25 20
12 14
7 10
8 16
5 12
7 16
1 9
1 10
21 19
7 20
11 13
15 21
0 8
4 13
7 11
1 

In [261]:
# Number of rental records dated 2020-03-01 whose rental station number is 3.
len(twenty[(twenty['대여일시'] == '2020-03-01') & (twenty['대여 대여소번호'] == 3)])

0

In [275]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the observed per-station counts (`part`) and the
# PageRank-scaled values (`real`).
mean_squared_error(part, real)

56.006523157208086

In [276]:
# Inspect the observed per-station counts returned by partial_real().
part

[0,
 0,
 12,
 47,
 32,
 17,
 18,
 60,
 17,
 19,
 30,
 17,
 32,
 18,
 56,
 32,
 23,
 32,
 62,
 16,
 13,
 5,
 9,
 8,
 33,
 23,
 12,
 22,
 14,
 23,
 8,
 17,
 36,
 33,
 12,
 7,
 6,
 7,
 8,
 6,
 5,
 14,
 5,
 16,
 37,
 11,
 19,
 14,
 42,
 20,
 32,
 95,
 22,
 40,
 18,
 13,
 13,
 5,
 3,
 21,
 8,
 23,
 4,
 9,
 14,
 31,
 6,
 5,
 17,
 9,
 10,
 32,
 24,
 12,
 16,
 13,
 21,
 3,
 42,
 38,
 28,
 16,
 65,
 17,
 8,
 8,
 19,
 5,
 9,
 6,
 10,
 9,
 16,
 30,
 11,
 10,
 12,
 24,
 130,
 7,
 43,
 12,
 24,
 20,
 4,
 10,
 14,
 0,
 6,
 9,
 21,
 15,
 14,
 13,
 7,
 8,
 9,
 39,
 43,
 25,
 35,
 10,
 46,
 9,
 11,
 9,
 20,
 20,
 14,
 15,
 21,
 14,
 10,
 23,
 19,
 19,
 77,
 29,
 39,
 7,
 23,
 14,
 14,
 3,
 13,
 14,
 35,
 19,
 30,
 11,
 32,
 31,
 9,
 12,
 10,
 20,
 29,
 4,
 5,
 72,
 39,
 33,
 22,
 22,
 34,
 26,
 16,
 7,
 5,
 49,
 54,
 4,
 9,
 20,
 9,
 9,
 14,
 14,
 10,
 26,
 6,
 11,
 25,
 12,
 7,
 8,
 5,
 13,
 7,
 1,
 1,
 21,
 7,
 11,
 15,
 0,
 4,
 7,
 1,
 7,
 6,
 7,
 10,
 16,
 7,
 5,
 14,
 9,
 6,
 4,
 6,
 16,
 18,
 18,

In [223]:
# Observed share of the 2020-03-01 rentals that belong to station 907.
print(ratio(twenty, '2020-03-01', 907))

0.004158886436937702
