In [1]:
import sys
import pickle
from pprint import pprint 
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from data.make_casting_graph import oneway_to_bidirected_graph
from scipy.sparse import csc_matrix
import time
from pagerank import pagerank
from sklearn.preprocessing import normalize
from pyvis.network import Network
from pagerank import pagerank

In [2]:
# Build the lookup table: movie index -> number of comments.
# The first line of ratings.csv is skipped as a header; every remaining line
# is split on ',' and must yield exactly two fields: (count, movie index).
with open('./data/ratings.csv', encoding='utf-8') as f:
    docs = list(map(lambda raw: raw.strip().split(','), list(f)[1:]))
    _idx2numcomments = dict((movie_idx, int(num)) for num, movie_idx in docs)

In [3]:
# Load the pre-computed casting-weight graph from disk.
# Presumably graph[movie][actor] = weight, as written by main() further down
# in this notebook -- verify against the file that is actually on disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that you produced yourself.
with open('./data/casting_graph.pkl', mode='rb') as graph_file:
    graph = pickle.load(graph_file)

In [4]:
# Build the lookup table: actor index -> actor name.
with open('./data/actors.csv', encoding='utf-8') as f:
    # Skip the header row exactly once. The previous version called next(f)
    # AND sliced readlines()[1:], which silently dropped the first actor.
    # The None default avoids StopIteration on an empty file.
    next(f, None)
    # Drop the line terminator so the last field does not carry a '\n'.
    docs = [line.rstrip('\r\n').split(',') for line in f]
    # Column 0 is used as the actor index and column 1 as the name.
    # NOTE(review): an earlier comment said "English name if exist else Korean
    # name", but only column 1 is ever read -- confirm the CSV column layout.
    # Rows with fewer than two fields (e.g. blank lines) are ignored.
    _idx2actor = {doc[0]: doc[1] for doc in docs if len(doc) >= 2}

In [5]:
# Build the lookup table: movie index -> movie title.
with open('./data/movies.csv', encoding='utf-8') as f:
    # Skip the header row exactly once. The previous version called next(f)
    # AND sliced readlines()[1:], which silently dropped the first movie.
    # The None default avoids StopIteration on an empty file.
    next(f, None)
    # Drop the line terminator so the last field does not carry a '\n'.
    docs = [line.rstrip('\r\n').split(',') for line in f]
    # Guard each row individually: the old filter tested len(docs) (the whole
    # list), which never filtered anything and let short rows raise IndexError.
    # NOTE(review): a plain split(',') truncates titles that contain a comma;
    # if the file uses quoted fields, the csv module would be the safer parser.
    _idx2movie = {doc[0]: doc[1] for doc in docs if len(doc) >= 2}

In [6]:
def idx2movie(idx):
    """Return the movie title stored for ``idx``, or 'Unknown' when absent."""
    return _idx2movie.get(idx, 'Unknown')


def idx2actor(idx):
    """Return the actor name stored for ``idx``, or 'Unknown' when absent."""
    return _idx2actor.get(idx, 'Unknown')


def idx2numcomments(idx):
    """Return the number of comments stored for ``idx``, or 0 when absent."""
    return _idx2numcomments.get(idx, 0)

In [7]:
# Expand the one-way graph loaded above into a bi-directed graph `g`.
# On a fresh kernel this call resolves to the helper imported from
# data.make_casting_graph; presumably it matches the copy defined later in
# this notebook (nodes named 'movie <idx>' / 'actor <idx>') -- TODO confirm.
g = oneway_to_bidirected_graph(graph)

### [실습1] 리뷰가 많은 영화 순위

In [8]:
# Print the ten movies with the most comments, most-commented first.
for movie_idx, num_comments in sorted(
        _idx2numcomments.items(), key=lambda pair: pair[1], reverse=True)[:10]:
    print(idx2movie(movie_idx), num_comments)

기생충 40
극한직업 15
마약왕 15
인터스텔라 14
어벤져스: 엔드게임 12
걸캅스 12
마녀 12
택시운전사 11
배심원들 11
신과함께-죄와 벌 11


### [실습2] Dict를 이용한 PageRank

In [10]:
def make_graph(casting_csv_path):
    """Build a weighted movie -> actor graph from a tab-separated casting file.

    The first line of the file is skipped as a header. Every other line must
    have exactly two tab-separated fields, ``movie`` and a whitespace-separated
    list of actors in casting order; lines of any other shape are ignored, as
    are movies whose actor list is empty.

    With ``n`` actors, the actor at 0-based position ``i`` gets the weight
    ``(n - i) ** 2 / sum((n - j) ** 2 for j in range(n))``, so the weights of
    one movie sum to 1 and earlier billing weighs more.

    Arguments
    ---------
    casting_csv_path: path of the casting file

    Returns
    -------
    dict of dict: graph[movie][actor] = weight
    """
    # Parse the file: movie -> list of actors in casting order
    casting = {}
    with open(casting_csv_path, encoding='utf-8') as f:
        next(f)
        for line in f:
            fields = line.split('\t')
            if len(fields) != 2:
                continue
            casting[fields[0]] = fields[1].strip().split()

    # Turn each casting order into normalized quadratic weights
    weighted = {}
    for movie, actors in casting.items():
        if not actors:
            continue
        n = len(actors)
        raw = [(n - position) ** 2 for position in range(n)]
        total = sum(raw)
        weighted[movie] = {actor: w / total for actor, w in zip(actors, raw)}
    return weighted

def oneway_to_bidirected_graph(graph):
    """Add reverse (actor -> movie) links to a one-way movie -> actor graph.

    Arguments
    ---------
    graph: dict of dict
        graph[movie][actor] = weight

    Returns
    -------
    dict of dict keyed by 'movie <movie>' and 'actor <actor>' node names.
    Movie nodes keep the input weights towards their actors; an actor node
    links back to each of its movies with the input weight divided by the
    total weight that actor has over all movies.
    """
    # Total weight per actor, accumulated over every movie
    totals = {}
    for actors in graph.values():
        for actor, weight in actors.items():
            totals[actor] = totals.get(actor, 0) + weight

    # Emit both directions under prefixed node names
    bidirected = {}
    for movie, actors in graph.items():
        movie_node = 'movie {}'.format(movie)
        bidirected[movie_node] = {'actor {}'.format(a): w for a, w in actors.items()}
        for actor, weight in actors.items():
            actor_node = 'actor {}'.format(actor)
            bidirected.setdefault(actor_node, {})[movie_node] = weight / totals[actor]
    return bidirected

def main():
    """Build the casting graph from ./data/casting.txt and pickle it to disk."""
    import pickle

    source_path = './data/casting.txt'
    target_path = './data/casting_graph.pkl'

    casting_graph = make_graph(source_path)
    with open(target_path, mode='wb') as out:
        pickle.dump(casting_graph, out)


# Inside a notebook __name__ is '__main__', so executing this cell rebuilds
# and overwrites the pickle file.
if __name__ == '__main__':
    main()

In [11]:
def pagerank(G, bias=None, df=0.15,
             max_iter=50, converge_error=0.001,verbose=0):
    """
    Arguments
    ---------
    G: Inbound graph, dict of dict
        G[to_node][from_node] = weight (float)
    df: damping factor, float. default 0.15
    """
    
    A, nodes_dict = _normalize(G)
    N = len(nodes_dict) # number of nodes
    sr = 1 - df # survival rate (1 -  damping factor)
    ir = 1 / N # initial rank

    # Initialization
    rank_dict = {n:ir for n in nodes_dict}

    # Initialization of bias
    if not bias:
        bias = {node:ir for node in nodes_dict}

    # Iteration
    for _iter in range(1, max_iter + 1):
        rank_dict_new = {}

        # t: to node, f: from node, w: weight
        for t in nodes_dict:
            f_dict = A.get(t, {})
            rank_dict_t = sum((w*rank_dict[f] for f, w in f_dict.items())) if f_dict else 0
            rank_dict_t = sr * rank_dict_t + df * bias.get(t, 0)
            rank_dict_new[t] = rank_dict_t

        # convergence check
        diff = sum((abs(rank_dict[n] - rank_dict_new[n]) for n in nodes_dict))
        if diff < converge_error:
            if verbose:
                print('Early stopped at iter = {}'.format(_iter))
            break

        if verbose:
            sum_ = sum(rank_dict_new.values())
            print('Iteration = {}, diff = {}, sum = {}'.format(_iter, diff, sum_))

        rank_dict = rank_dict_new

    return rank_dict


def _normalize(G):
    """It returns outbound normalized graph
    Arguments
    ---------
    G: inbound graph dict of dict
    """
    # Sum of outbound weight
    # t: to node, f: from node, w: weight
    W_sum = {}    
    for t, f_dict in G.items():
        for f, w in f_dict.items():
            W_sum[f] = W_sum.get(f, 0) + w
    A = {t:{f:w/W_sum[f] for f,w in f_dict.items()} for t, f_dict in G.items()}    
    nodes_dict = set(G.keys())
    nodes_dict.update(W_sum)
    return A, nodes_dict

In [12]:
# Bias vector for the damping term: a movie node (name starts with 'm') is
# weighted by its number of comments, every other node gets 0.
bias = {}
for node in g:
    if node[0] == 'm':
        bias[node] = idx2numcomments(node.split()[1])
    else:
        bias[node] = 0

# Scale the bias so that it sums to 1
_sum = sum(bias.values())
bias = {node: b / _sum for node, b in bias.items()}

rank_dict = pagerank(
    g,
    bias=bias,
    df=0.15,
    max_iter=30,
    converge_error=0.001,
    verbose=1,
)

Iteration = 1, diff = 0.674593559403865, sum = 1.0000000000000044
Iteration = 2, diff = 0.5133755765513087, sum = 1.000000000000009
Iteration = 3, diff = 0.40708434710253083, sum = 1.0000000000000075
Iteration = 4, diff = 0.3288114569044877, sum = 1.0000000000000036
Iteration = 5, diff = 0.2690000626169722, sum = 1.000000000000007
Iteration = 6, diff = 0.22172923044566514, sum = 0.9999999999999898
Iteration = 7, diff = 0.18372765496993038, sum = 0.9999999999999948
Iteration = 8, diff = 0.15290648077655603, sum = 1.0000000000000073
Iteration = 9, diff = 0.12756391624362093, sum = 0.9999999999999956
Iteration = 10, diff = 0.10676563571706428, sum = 0.9999999999999949
Iteration = 11, diff = 0.08947335545631496, sum = 1.0000000000000067
Iteration = 12, diff = 0.07517014319662851, sum = 1.0000000000000093
Iteration = 13, diff = 0.06318528811144798, sum = 0.9999999999999916
Iteration = 14, diff = 0.05320609097840669, sum = 0.9999999999999908
Iteration = 15, diff = 0.044830477927067174, sum =

### [실습3] Numpy를 이용한 PageRank

In [None]:
import numpy as np
from scipy.sparse import csc_matrix

In [14]:
# Fix a deterministic node order so ranks and bias can live in dense vectors.
nodes = set(g.keys())
idx2node = list(sorted(nodes))
node2idx = {node: idx for idx, node in enumerate(idx2node)}
n_nodes = len(idx2node)

# Convert the bias dict into a vector aligned with idx2node.
# The isinstance guard keeps this cell re-runnable: after the first run `bias`
# is already an ndarray and has no .items(), which used to raise on re-execution.
# Nodes without a bias entry get 0 so the vector always has n_nodes entries.
if isinstance(bias, dict):
    bias = np.asarray([bias.get(node, 0) for node in idx2node])
print(bias.shape)

# Collect the entries of the sparse matrix in coordinate form:
# A[node2idx[x], node2idx[y]] = g[x][y]
rows = []
cols = []
data = []

for from_node, to_dict in g.items():
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items():
        to_idx = node2idx[to_node]
        rows.append(from_idx)
        cols.append(to_idx)
        data.append(weight)

# Pass the shape explicitly; otherwise it is inferred from the largest index
# seen and can come out smaller than the node count (or fail on empty input).
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
print(A.shape)

(6154,)
(6154, 6154)


In [17]:
# Power iteration on the sparse matrix A built above.
max_iter = 30
# Here df is the share kept from the propagated rank; (1 - df) goes to bias.
df = 0.85

# Start from the uniform vector
ir = 1 / A.shape[0]
rank = np.full(A.shape[0], ir)

for n_iter in range(1, max_iter + 1):
    # Propagate, rescale to unit L1 norm, then mix in the bias vector
    rank_new = normalize(A.dot(rank).reshape(1, -1), norm='l1').ravel()
    rank_new = df * rank_new + (1 - df) * bias
    diff = np.abs(rank - rank_new).sum()
    rank = rank_new
    print('iter {} : diff = {}'.format(n_iter, diff))

iter 1 : diff = 0.1685245368865779
iter 2 : diff = 0.123534416788289
iter 3 : diff = 0.11717242074154521
iter 4 : diff = 0.08676250638774644
iter 5 : diff = 0.08106650827175174
iter 6 : diff = 0.06044614044638538
iter 7 : diff = 0.05589952786903922
iter 8 : diff = 0.04188475454126574
iter 9 : diff = 0.038452782327255894
iter 10 : diff = 0.0289095171904886
iter 11 : diff = 0.026405522194198443
iter 12 : diff = 0.01994486388644759
iter 13 : diff = 0.01811137289916391
iter 14 : diff = 0.013753287448751986
iter 15 : diff = 0.012408911428306675
iter 16 : diff = 0.009469243738374537
iter 17 : diff = 0.008494000468005527
iter 18 : diff = 0.006511648928942716
iter 19 : diff = 0.005809774127703195
iter 20 : diff = 0.004473307017566352
iter 21 : diff = 0.0039712967053357525
iter 22 : diff = 0.0030704578506105173
iter 23 : diff = 0.0027152845982687866
iter 24 : diff = 0.002106149459828414
iter 25 : diff = 0.0018577039374234091
iter 26 : diff = 0.0014438021951808503
iter 27 : diff = 0.001270456142