In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.6 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u422-b05-1~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u422-b05-1~22.04) ...
Sel

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current user through google.colab's auth helper.
auth.authenticate_user()
# Build a PyDrive client that uses the application-default Google credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# NOTE(review): `drive` is not referenced by any later cell visible here — confirm it is still needed.



In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, DenseMatrix

In [16]:
# Serve the Spark web UI on a fixed port.
conf = SparkConf().set("spark.ui.port", "4050")

# Reuse the active context when this cell is executed again: constructing a
# second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [17]:
# Raw edge lists, one "source<TAB>destination" pair per line with 1-based ids.
small_data = sc.textFile('/content/graph-small.txt')
full_data = sc.textFile('/content/graph-full.txt')

# NOTE(review): LAMBDA and NU are not referenced by any cell visible here.
LAMBDA = 1
NU = 1
# Number of nodes in the graph; sizes the link matrices and the initial hub vector.
NUM_NODES = 1000


def _parse_edge(line):
  """Convert one "source<TAB>destination" line into a 0-based (source, destination) pair."""
  fields = line.split('\t')
  return (int(fields[0]) - 1, int(fields[1]) - 1)


# Skip blank lines (they would make int() fail) and collapse duplicate edges.
source_dest_pair = full_data.filter(lambda line: line.strip()).map(_parse_edge).distinct()
edges = source_dest_pair.map(lambda x: (x[0], x[1], 1))
edges_transpose = source_dest_pair.map(lambda x: (x[1], x[0], 1))
# Give the dimensions explicitly. When they are omitted CoordinateMatrix infers
# them from the largest index present, so a highest-numbered node without an
# out-edge (or in-edge) would shrink the matrix and break later multiplications.
L = CoordinateMatrix(edges, NUM_NODES, NUM_NODES).toBlockMatrix()
L_transpose = CoordinateMatrix(edges_transpose, NUM_NODES, NUM_NODES).toBlockMatrix()

# Initial hubbiness vector: a single column of ones, as (row, col, value) entries.
h_init = [(i, 0, 1) for i in range(NUM_NODES)]

In [18]:
# Number of alternating authority/hubbiness updates to run.
NUM_ITERATIONS = 40


def _scaled_to_unit_max(vector):
  """Return `vector` rescaled so that its largest entry equals 1.

  Args:
    vector: a single-column BlockMatrix of scores.

  Returns:
    A new single-column BlockMatrix with the same number of rows.

  Raises:
    ValueError: if the largest entry is 0, which would otherwise turn every
      score into inf/NaN without any error.
  """
  values = np.array(vector.toLocalMatrix().toArray())
  peak = np.max(values)
  if peak == 0:
    raise ValueError("cannot normalize scores: the largest entry is 0")
  # The vector is already on the driver, so scale it here instead of building
  # an n x n diagonal matrix and running another distributed multiply.
  # Multiplying by the reciprocal mirrors the diagonal-matrix product.
  scaled = (values * (1 / peak)).ravel().tolist()
  entries = [(row, 0, value) for row, value in enumerate(scaled)]
  # Rebuilding from a local list also means the next product does not depend
  # on the whole chain of earlier products.
  return CoordinateMatrix(sc.parallelize(entries), vector.numRows(), 1).toBlockMatrix()


h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()
a = None

for _ in range(NUM_ITERATIONS):
  # Authority update: a = L^T h, rescaled so the largest authority is 1.
  a = _scaled_to_unit_max(L_transpose.multiply(h))
  # Hubbiness update: h = L a, rescaled so the largest hubbiness is 1.
  h = _scaled_to_unit_max(L.multiply(a))

In [19]:
# Collect the final hubbiness and authority vectors onto the driver as arrays.
h_numpy, a_numpy = (np.array(scores.toLocalMatrix().toArray()) for scores in (h, a))
# Row indices of the five smallest scores (ascending sort along the rows).
h_min_args, a_min_args = (np.argsort(scores, axis = 0)[:5] for scores in (h_numpy, a_numpy))
# Row indices of the five largest scores (ascending sort of the negated scores).
h_max_args, a_max_args = (np.argsort(-scores, axis = 0)[:5] for scores in (h_numpy, a_numpy))

In [20]:
# Report the best hubs; node ids are 1-based, array rows are 0-based.
print("The 5 node ids with the highest hubbiness scores:")
for node_index in h_max_args[:, 0]:
  node_id = node_index + 1
  score = h_numpy[node_index, 0]
  print("Node id: {}, hubbiness score: {}".format(node_id, score))

The 5 node ids with the highest hubbiness scores:
Node id: 840, hubbiness score: 1.0
Node id: 155, hubbiness score: 0.9499618624906543
Node id: 234, hubbiness score: 0.8986645288972264
Node id: 389, hubbiness score: 0.863417110184379
Node id: 472, hubbiness score: 0.8632841092495217


In [21]:
# Report the weakest hubs; node ids are 1-based, array rows are 0-based.
print("The 5 node ids with the lowest hubbiness scores:")
for node_index in h_min_args[:, 0]:
  node_id = node_index + 1
  score = h_numpy[node_index, 0]
  print("Node id: {}, hubbiness score: {}".format(node_id, score))

The 5 node ids with the lowest hubbiness scores:
Node id: 23, hubbiness score: 0.042066854890936534
Node id: 835, hubbiness score: 0.05779059354433016
Node id: 141, hubbiness score: 0.06453117646225179
Node id: 539, hubbiness score: 0.06602659373418492
Node id: 889, hubbiness score: 0.07678413939216454


In [22]:
# Report the best authorities; node ids are 1-based, array rows are 0-based.
print("The 5 node ids with the highest authority scores:")
for args in a_max_args:
  # The values come from a_numpy, so label them as authority scores
  # (the label previously said "hubbiness score").
  print("Node id: {}, authority score: {}".format(args[0] + 1, a_numpy[args][0][0]))

The 5 node ids with the highest authority scores:
Node id: 893, hubbiness score: 1.0
Node id: 16, hubbiness score: 0.9635572849634398
Node id: 799, hubbiness score: 0.9510158161074016
Node id: 146, hubbiness score: 0.9246703586198444
Node id: 473, hubbiness score: 0.899866197360405


In [23]:
# Report the weakest authorities; node ids are 1-based, array rows are 0-based.
print("The 5 node ids with the lowest authority scores:")
for args in a_min_args:
  # The values come from a_numpy, so label them as authority scores
  # (the label previously said "hubbiness score").
  print("Node id: {}, authority score: {}".format(args[0] + 1, a_numpy[args][0][0]))

The 5 node ids with the lowest authority scores:
Node id: 19, hubbiness score: 0.05608316377607618
Node id: 135, hubbiness score: 0.06653910487622794
Node id: 462, hubbiness score: 0.07544228624641902
Node id: 24, hubbiness score: 0.08171239406816946
Node id: 910, hubbiness score: 0.08571673456144878


In [24]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()