## **Configurations**

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=7a08f4de2200e85eeb52cdd6bd51170777e9b798544c39cfb86a5a60ef0feba4
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
import os

# Make the JDK 8 install directory visible (via JAVA_HOME) to this process and to
# any child process started from this kernel.
os.environ.update({"JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64"})

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client (`drive`):
# the application-default credentials are assigned to the GoogleAuth object
# directly, and that object is passed to GoogleDrive.
# NOTE(review): `drive` is not used in the cells visible here — confirm it is still needed.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [6]:
# Spark configuration: run the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Reuse the already-running context when this cell is executed again; constructing
# a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Session (DataFrame API entry point); reuses an existing session if there is one.
spark = SparkSession.builder.getOrCreate()

## **Q2: HITS hubbiness and authority scores**

In [10]:
# Input file, read as an RDD of text lines.
# Swap in "graph-small.txt" to run on the other dataset.
graph_path = "graph-full.txt"
rdd = sc.textFile(graph_path)

In [None]:
# Parse every non-blank line into an (int, int) edge taken from its first two
# whitespace-separated fields, and drop duplicate edges.
# The result is cached because the iteration cell traverses `edges` on every
# iteration; without caching each traversal re-reads and re-parses the input file.
edges = (
    rdd.map(lambda line: line.split())
    .filter(lambda fields: fields)  # a blank line splits to [], which would fail on fields[0]
    .map(lambda fields: (int(fields[0]), int(fields[1])))
    .distinct()
    .cache()
)

# edges.take(5)  # uncomment to inspect a sample

In [12]:
n = 1000            # page ids run from 1 to n
max_iteration = 40  # number of passes made by the update loop

# h: one (h_page, h_score) pair per page, every score starting at 1
h = [(page, 1) for page in range(1, n + 1)]

In [13]:
# Alternating authority / hubbiness updates over `edges`, treating each edge as a
# (source, destination) pair.
# `h` enters and leaves every iteration as a plain list of (h_page, h_score) pairs;
# `a` is left as an RDD of (a_page, a_score) pairs.
for _ in range(max_iteration):
    # `h` already lives on the driver, so build the lookup table directly instead
    # of parallelizing the list only to collect it straight back.
    h_dict = dict(h)

    # Authority update: sum the hubbiness of the source of every edge that ends at
    # a page, then divide by the largest sum so the top score is 1.
    update_a = edges.map(lambda edge: (edge[1], h_dict[edge[0]])).reduceByKey(lambda x, y: x + y)
    max_a = update_a.map(lambda pair: pair[1]).max()
    a = update_a.map(lambda pair: (pair[0], pair[1] / max_a))  # a: (a_page, a_score)

    # Hubbiness update: sum the authority of the destination of every edge that
    # starts at a page, scaled the same way.
    a_dict = a.collectAsMap()
    update_h = edges.map(lambda edge: (edge[0], a_dict[edge[1]])).reduceByKey(lambda x, y: x + y)
    max_h = update_h.map(lambda pair: pair[1]).max()
    h = update_h.map(lambda pair: (pair[0], pair[1] / max_h)).collect()  # h: (h_page, h_score)

In [14]:
# Report the five highest- and five lowest-scoring (page, score) pairs for both
# hubbiness (`h`) and authority (`a`).
# The iteration cell leaves `h` as a plain list. Convert it to an RDD only while it
# still is a list, so that re-running this cell does not call parallelize() on an RDD.
if isinstance(h, list):
    h = sc.parallelize(h)

sorted_h = h.sortBy(lambda x: x[1], ascending=False)
top_five_h = sorted_h.take(5)
bottom_five_h = sorted_h.takeOrdered(5, key=lambda x: x[1])

sorted_a = a.sortBy(lambda x: x[1], ascending=False)
top_five_a = sorted_a.take(5)
bottom_five_a = sorted_a.takeOrdered(5, key=lambda x: x[1])

print("the 5 node ids with the highest hubbiness score: ", "\n", top_five_h)
print("the 5 node ids with the lowest hubbiness score: ", "\n", bottom_five_h)
print("the 5 node ids with the highest authority score: ", "\n", top_five_a)
print("the 5 node ids with the lowest authority score: ", "\n", bottom_five_a)

the 5 node ids with the highest hubbiness score:  
 [(840, 1.0), (155, 0.9499618624906541), (234, 0.8986645288972266), (389, 0.8634171101843793), (472, 0.8632841092495219)]
the 5 node ids with the lowest hubbiness score:  
 [(23, 0.042066854890936534), (835, 0.05779059354433016), (141, 0.0645311764622518), (539, 0.06602659373418493), (889, 0.07678413939216454)]
the 5 node ids with the highest authority score:  
 [(893, 1.0), (16, 0.9635572849634398), (799, 0.9510158161074017), (146, 0.9246703586198444), (473, 0.8998661973604051)]
the 5 node ids with the lowest authority score:  
 [(19, 0.05608316377607618), (135, 0.06653910487622795), (462, 0.07544228624641901), (24, 0.08171239406816945), (910, 0.08571673456144878)]
