In [6]:
import math
import pandas as pd

In [3]:
inlinks_file = "../Resources/wt2g_inlinks.txt"
inlinks_dict = {}   # page -> list of pages that link to it
inlinks_len = {}    # page -> number of in-links listed for it
outlinks_dict = {}  # page -> list of pages it links to (derived below)
outlinks_len = {}   # page -> number of out-links (0 when none were derived)

# Each non-empty line is read as: <doc_id> <inlink_1> <inlink_2> ...
with open(inlinks_file, "r") as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            # Blank line: there is no doc_id to index, so skip it instead of
            # failing with IndexError on tokens[0].
            continue
        doc_id, inlinks = tokens[0], tokens[1:]
        inlinks_dict[doc_id] = inlinks
        inlinks_len[doc_id] = len(inlinks)

# Invert the in-link map: if `linking_page` appears in the in-links of
# `linked_page`, then `linked_page` is one of `linking_page`'s out-links.
for linked_page, linking_pages in inlinks_dict.items():
    for linking_page in linking_pages:
        outlinks_dict.setdefault(linking_page, []).append(linked_page)

# Out-degree for every page that has its own line in the file.
for page in inlinks_dict:
    outlinks_len[page] = len(outlinks_dict.get(page, ()))

In [4]:
def page_rank(inlinks_dict, outlinks_dict, outlinks_len, max_iter, d=0.85):
    """Compute PageRank scores by power iteration.

    Args:
        inlinks_dict: maps each page to the list of pages linking to it.
            Its keys define the set of pages that receive a score.
        outlinks_dict: maps a page to the list of pages it links to.
        outlinks_len: maps a page to its number of out-links.
        max_iter: upper bound on the number of iterations.
        d: damping factor.

    Returns:
        dict mapping every key of ``inlinks_dict`` to its score.

    Iteration stops early once the integer part of the perplexity
    (2 ** entropy of the score distribution) has stayed the same for four
    consecutive iterations. One progress line is printed per iteration.
    """
    # A list (not a set) keeps iteration order, and therefore floating-point
    # summation order, identical from run to run.
    P = list(inlinks_dict)
    N = len(P)
    if N == 0:
        return {}

    L = outlinks_len
    PR = {page: 1 / N for page in P}

    # A sink is any page with no usable out-links. Checking the out-degree as
    # well as key presence keeps this consistent with the contribution loop
    # below, which skips pages whose out-degree is 0; otherwise the rank of
    # such a page would silently leak out of the distribution.
    S = [page for page in P if page not in outlinks_dict or not L.get(page)]

    perplexity = 0
    no_change_in_row = 0
    for i in range(max_iter):
        sink_PR = sum(PR[page] for page in S)

        # Teleportation share plus the evenly redistributed sink mass; this is
        # the same for every page within one iteration.
        base = (1 - d) / N + d * sink_PR / N

        new_PR = {}
        for page in P:
            rank = base
            for q in inlinks_dict[page]:
                out_degree = L.get(q, 0)
                # In-links from pages that are not scored, or that have no
                # recorded out-degree, contribute nothing.
                if q in PR and out_degree:
                    rank += d * PR[q] / out_degree
            new_PR[page] = rank
        PR = new_PR

        # Zero-probability pages add nothing to the entropy (0 * log 0 := 0);
        # skipping them avoids a math domain error, e.g. when d == 1.
        entropy = -sum(p * math.log2(p) for p in PR.values() if p > 0)
        new_perplexity = 2 ** entropy

        print("Iteration: ", i, " Perplexity: ", new_perplexity)
        # Check for convergence
        if int(perplexity) == int(new_perplexity):
            no_change_in_row += 1
        else:
            no_change_in_row = 0
        perplexity = new_perplexity
        if no_change_in_row > 3:
            break

    return PR

def display_top_n(PR, n):
    """Print a header line followed by the n highest-scoring pages.

    Pages are listed in descending score order, one "<page> : <score>" line
    each; if PR holds fewer than n entries, all of them are printed.
    """
    ranked = sorted(PR.items(), key=lambda item: item[1], reverse=True)
    print("Top ", n, " pages:")
    for doc_id, score in ranked[:n]:
        print(doc_id, ":", score)

def save_top_n(PR, n, filename):
    """Write the n highest-scoring pages to ``filename``, one per line.

    Each line is "<page> <score> <out-link count> <in-link count>", with the
    counts looked up in the module-level ``outlinks_len`` and ``inlinks_len``
    dicts. Pages are written in descending score order.

    The file is opened once in write mode, so calling this again (for example
    on a notebook re-run) replaces the previous contents instead of appending
    a duplicate set of rows.
    """
    sorted_PR = sorted(PR.items(), key=lambda x: x[1], reverse=True)[:n]
    with open(filename, "w") as f:
        for page, score in sorted_PR:
            f.write(f"{page} {score} {outlinks_len[page]} {inlinks_len[page]}\n")

# Score every page in inlinks_dict, allowing at most 1000 iterations; the
# damping factor keeps page_rank's default value.
PR = page_rank(inlinks_dict, outlinks_dict, outlinks_len, max_iter=1000)

Iteration:  0  Perplexity:  79669.92319571749
Iteration:  1  Perplexity:  86267.67410240651
Iteration:  2  Perplexity:  72260.35360671108
Iteration:  3  Perplexity:  75132.4076592715
Iteration:  4  Perplexity:  68932.60291311225
Iteration:  5  Perplexity:  71197.83341083827
Iteration:  6  Perplexity:  67782.53778454349
Iteration:  7  Perplexity:  69379.57741406407
Iteration:  8  Perplexity:  67383.70755889159
Iteration:  9  Perplexity:  68477.80188343812
Iteration:  10  Perplexity:  67207.18479625342
Iteration:  11  Perplexity:  68004.15388367185
Iteration:  12  Perplexity:  67138.95537949666
Iteration:  13  Perplexity:  67708.25939079946
Iteration:  14  Perplexity:  67131.663934586
Iteration:  15  Perplexity:  67524.47691368092
Iteration:  16  Perplexity:  67132.11109106631
Iteration:  17  Perplexity:  67413.71012184795
Iteration:  18  Perplexity:  67138.8498145003
Iteration:  19  Perplexity:  67339.82543897966
Iteration:  20  Perplexity:  67149.7850061852
Iteration:  21  Perplexity: 

In [5]:
# Print the 500 highest-scoring pages, then save the same 500 to wt2g_PageRank.txt.
display_top_n(PR, 500)
save_top_n(PR, 500, "wt2g_PageRank.txt")

Top  500  pages:
WT21-B37-76 : 0.0026794094272144424
WT21-B37-75 : 0.0015259166438427877
WT25-B39-116 : 0.0014694947334658922
WT23-B21-53 : 0.0013723234635210201
WT24-B40-171 : 0.0012449987603104697
WT23-B39-340 : 0.0012403968885748439
WT23-B37-134 : 0.0012052153871083624
WT08-B18-400 : 0.0011435407139305793
WT13-B06-284 : 0.0011247805165849726
WT24-B26-46 : 0.001085045664876557
WT13-B06-273 : 0.0010447001198702235
WT01-B18-225 : 0.000988443620473868
WT04-B27-720 : 0.0009364071908723378
WT23-B19-156 : 0.0008942304358025193
WT04-B30-12 : 0.0008164407175334259
WT24-B26-10 : 0.0008074275567873409
WT25-B15-307 : 0.0008043822032741523
WT07-B18-256 : 0.0007748821192032926
WT24-B26-2 : 0.0007713413346801238
WT14-B03-220 : 0.0007163920205376202
WT24-B40-167 : 0.0007074602423228843
WT14-B03-227 : 0.0006849553116296611
WT18-B31-240 : 0.0006601893167221334
WT04-B40-202 : 0.000658703105894229
WT08-B19-222 : 0.0006434323149586116
WT27-B28-203 : 0.0006270012895766568
WT13-B15-160 : 0.000621296493314

In [10]:
# Read the saved ranking file (space-separated, no header row) into a
# DataFrame, name its four columns, and multiply the scores by 10000
# (presumably so the small values are easier to read in the table).
df = (
    pd.read_csv("wt2g_PageRank.txt", sep=" ", header=None)
    .set_axis(["URL", "PageRank", "Outlinks", "Inlinks"], axis=1)
    .assign(PageRank=lambda frame: frame["PageRank"] * 10000)
)
df.head(20)

Unnamed: 0,URL,PageRank,Outlinks,Inlinks
0,WT21-B37-76,26.794094,5,2568
1,WT21-B37-75,15.259166,1,1704
2,WT25-B39-116,14.694947,1,169
3,WT23-B21-53,13.723235,1,198
4,WT24-B40-171,12.449988,209,270
5,WT23-B39-340,12.403969,396,274
6,WT23-B37-134,12.052154,2,208
7,WT08-B18-400,11.435407,0,1011
8,WT13-B06-284,11.247805,2,454
9,WT24-B26-46,10.850457,6,187
