In [24]:
import numpy as np
from scipy.sparse import csc_matrix
import time

# Hyper-parameters 
beta = 0.8

# Load IDs of page link origins and targets and all unqiue IDS
ID_From = np.load('ID_From.npy')
ID_To = np.load('ID_To.npy')

# Find unique page IDs
# These pages will be both the rows and columns of our matrix M
ID_Unique = np.unique(np.concatenate((ID_From, ID_To), axis=0))

# Count nonzero outgoing links
# This will serve as the denominator for the columns of our matrix M
# ie, assuming even probability 1 / (the total outgoing count) is the 
# proability of going to the page of a row from the page of a column
ID_Outgoing_Nonzero, ID_Outgoing_Count = np.unique(ID_From, return_counts=True)

# Create our matrix
M = csc_matrix((np.ones(len(ID_From)), (ID_To, ID_From)), shape=(len(ID_Unique), len(ID_Unique)))

# Normalize dividing columns by total count 
# Will use 1 if dead end - will still be 0 - no entry in sparse - but avoids nans
Norm = np.ones(len(ID_Unique))
Norm[ID_Outgoing_Nonzero] = ID_Outgoing_Count
val = np.repeat(Norm, M.getnnz(axis=0)) # This keeps the sparse matrix a sparse matrix
M.data /= val



In [36]:
# Create v with even inital distribution
start_time = time.time()
v_old = np.ones(len(ID_Unique))/len(ID_Unique)
iterations = 2
MSE = np.zeros(iterations)
for iteration in range(iterations):
    v_new = (np.ones(len(ID_Unique)) - beta)/len(ID_Unique)
    Mv = np.zeros(len(ID_Unique))
    for i in range(M.shape[1]):
        indice = csc_matrix.nonzero(M[:,i])
        d_i = csc_matrix.getnnz(M[:,i])
        for j in range(d_i):
            Mv[indice[0][j]] += beta * v_old[i]/d_i
            #v_new[indice[0][j]] += beta * v_old[i]/d_i
    MSE[iteration] = sum((v_old - Mv)**2)
    v_old = Mv + v_new
print(time.time() - start_time)

170.19312953948975


In [16]:
csc_matrix.getnnz(M[:,0])

193

In [20]:
csc_matrix.nonzero(M[:,0])[0]

array([   234,    240,   1365,   1707,   1910,   2446,   3072,   3182,
         3282,   3391,   3549,   4632,   4638,   4640,   4833,   4918,
         4956,   5688,   5745,   5747,   6074,   6086,   6287,   6949,
         7524,   7529,   8228,   8259,   8541,   9382,   9488,   9744,
        11205,  11470,  12197,  12210,  12244,  12245,  12616,  12935,
        13459,  13839,  13884,  13981,  14355,  14783,  15053,  16070,
        16362,  16450,  16495,  17081,  17256,  17257,  17595,  17631,
        17653,  17727,  17742,  17794,  17905,  18033,  18369,  18373,
        18381,  18382,  18402,  18413,  18421,  18425,  18459,  18461,
        18477,  18478,  18487,  18493,  18501,  18502,  18516,  18568,
        18591,  18592,  18643,  19179,  20289,  20379,  20426,  23394,
        23857,  24998,  25535,  25907,  25908,  26903,  27798,  28316,
        28321,  30898,  31416,  32427,  32969,  34899,  36029,  36895,
        37647,  38796,  39006,  41443,  43189,  43770,  46252,  47092,
      

In [37]:
MSE

array([0.00131206, 0.0005548 ])