In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u422-b05-1~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u422-b05-1~22.04) ...
Sel

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials obtained by that authentication.
# NOTE(review): `drive` is not referenced by any later cell visible in this
# notebook (the graph files are read straight from /content/) — confirm
# whether this cell is still required.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, DenseMatrix

In [15]:
# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Get the SparkContext. `getOrCreate` returns the already-running context when
# this cell is executed again, whereas calling the SparkContext constructor a
# second time in the same kernel raises an error. `conf` is only applied when
# a new context actually has to be created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Get (or create) the SparkSession; it attaches to the context obtained above.
spark = SparkSession.builder.getOrCreate()

In [16]:
# Edge lists are read lazily as text; each line is parsed below as two
# tab-separated integer node ids (source, destination).
small_data = sc.textFile('/content/graph-small.txt')
full_data = sc.textFile('/content/graph-full.txt')

BETA = 0.8        # weight applied to the link-following term of the iteration
NUM_NODES = 1000  # matrix dimension; must equal the length of the 1000-entry
                  # rank/teleport vectors this matrix is multiplied with


def _parse_edge(line):
    """Parse one 'source<TAB>dest' line into a 0-based (source, dest) tuple.

    The line is split once (the original expression split it twice) and both
    ids are shifted by -1 so they can be used directly as matrix indices.
    """
    fields = line.split('\t')
    return int(fields[0]) - 1, int(fields[1]) - 1


# Unique directed edges; `distinct()` collapses repeated lines so a duplicated
# edge contributes only once to the adjacency and out-degree counts.
source_dest_pair = full_data.map(_parse_edge).distinct()

# Adjacency entries stored as (row=dest, col=source, value=1).
edges = source_dest_pair.map(lambda x: (x[1], x[0], 1))

# Diagonal entries (source, source, 1 / out-degree of source).
degrees = (
    source_dest_pair
    .map(lambda x: (x[0], 1))
    .reduceByKey(lambda x, y: x + y)
    .map(lambda x: (x[0], x[0], 1 / x[1]))
)

# Dimensions are passed explicitly. When omitted, CoordinateMatrix sizes itself
# from the largest index that occurs in the entries, so a highest-numbered node
# with no incoming (or no outgoing) edge would yield a matrix smaller than
# NUM_NODES and the later products with the 1000-entry vectors would fail.
edge_matrix = CoordinateMatrix(edges, NUM_NODES, NUM_NODES).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees, NUM_NODES, NUM_NODES).toBlockMatrix()

# Column-normalised link matrix: M[dest, source] = 1 / out-degree(source).
M = edge_matrix.multiply(degree_inverse_matrix)

In [17]:
# (row, col, value) entries for the three fixed operands of the iteration,
# each sized for a graph of 1000 nodes.
node_count = 1000

# Starting rank vector (single column 0): every node gets 1 / node_count.
r_init = [(node, 0, 1 / node_count) for node in range(node_count)]

# Diagonal matrix carrying BETA on every diagonal position.
beta_init = [(node, node, BETA) for node in range(node_count)]

# Teleport vector (single column 0): (1 - BETA) / node_count for every node.
teleport_init = [(node, 0, (1 - BETA) / node_count) for node in range(node_count)]

In [18]:
def _as_block_matrix(entries):
    """Distribute a list of (row, col, value) tuples and return it as a BlockMatrix."""
    distributed_entries = sc.parallelize(entries)
    return CoordinateMatrix(distributed_entries).toBlockMatrix()


# Distributed versions of the entry lists prepared in the previous cell.
r = _as_block_matrix(r_init)
beta = _as_block_matrix(beta_init)
teleport = _as_block_matrix(teleport_init)

In [19]:
NUM_ITERATIONS = 40  # number of update passes applied to r
TOP_K = 5            # how many lowest / highest ranked rows to keep

# beta * M does not change between passes, so it is built once here instead of
# once per iteration. Caching it stops Spark from recomputing the product (and
# everything upstream of it) every time it is reused in the lineage of r.
damped_M = beta.multiply(M)
damped_M.cache()

# Iterative update: r <- teleport + (beta * M) * r.
# NOTE: `r` is overwritten, so executing this cell again continues from the
# current r rather than restarting from the initial vector.
for iteration in range(NUM_ITERATIONS):
  r = teleport.add(damped_M.multiply(r))

# Collect the result to the driver as a NumPy array.
r_numpy = np.array(r.toLocalMatrix().toArray())

# Row indices sorted along axis 0: the first TOP_K rows of the ascending sort
# are the lowest scores; sorting the negated scores gives the highest first.
min_args = np.argsort(r_numpy, axis = 0)[:TOP_K]
max_args = np.argsort(-r_numpy, axis = 0)[:TOP_K]

In [20]:
# Report the highest-scoring rows; 1 is added to each row index so the printed
# id undoes the -1 shift applied when the edge list was parsed.
print("The top 5 node ids with the PageRank scores:")
for row in max_args[:, 0]:
  node_id = row + 1
  score = r_numpy[row, 0]
  print("Node id: {}, PageRank score: {}".format(node_id, score))

The top 5 node ids with the PageRank scores:
Node id: 263, PageRank score: 0.002020291181518219
Node id: 537, PageRank score: 0.00194334157145315
Node id: 965, PageRank score: 0.0019254478071662631
Node id: 243, PageRank score: 0.0018526340162417312
Node id: 285, PageRank score: 0.0018273721700645142


In [21]:
# Report the lowest-scoring rows; 1 is added to each row index so the printed
# id undoes the -1 shift applied when the edge list was parsed.
print("The bottom 5 node ids with the PageRank scores:")
for row in min_args[:, 0]:
  node_id = row + 1
  score = r_numpy[row, 0]
  print("Node id: {}, PageRank score: {}".format(node_id, score))

The bottom 5 node ids with the PageRank scores:
Node id: 558, PageRank score: 0.0003286018525215297
Node id: 93, PageRank score: 0.0003513568937516577
Node id: 62, PageRank score: 0.00035314810510596274
Node id: 424, PageRank score: 0.00035481538649301454
Node id: 408, PageRank score: 0.00038779848719291705


In [22]:
# Shut down the SparkContext now that the results have been collected and
# printed. Earlier cells that use `sc` cannot be re-run after this until the
# context has been created again.
sc.stop()