In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
# Colab's user authentication runs first; the application-default credentials
# fetched below are presumably the ones that flow makes available — TODO confirm.
auth.authenticate_user()
gauth = GoogleAuth()
# Attach the application-default Google credentials to the PyDrive auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by any later cell visible here.
drive = GoogleDrive(gauth)



In [70]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import pandas as pd

# create the Spark Session
# NOTE(review): going by its name, getOrCreate() is expected to reuse a session
# that already exists in this kernel — confirm against the PySpark docs.
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
# `sc` is what the later cells call textFile() and parallelize() on, and what
# the last cell shuts down with sc.stop().
sc = spark.sparkContext

In [71]:
def extract_friends(lines):
  """Turn raw adjacency-list lines into (user, [friend, ...]) pairs.

  Every line is split on whitespace. Only lines that yield exactly two
  tokens (a user id and a comma-separated friend list) are kept, so a line
  holding just a user id, or an empty line, is dropped.

  Args:
    lines: RDD-like collection of strings supporting map() and filter().

  Returns:
    The transformed collection of (user_id, list_of_friend_ids) tuples;
    all ids remain strings.
  """
  tokenized = lines.map(lambda raw: raw.split())
  # Exactly two tokens means "<user> <friend,friend,...>".
  well_formed = tokenized.filter(lambda tokens: len(tokens) == 2)
  return well_formed.map(lambda tokens: (tokens[0], tokens[1].split(",")))

In [72]:
def create_map(friends):
  """Emit one weighted record per (user, other) pair.

  Two kinds of records are produced and concatenated:
    * ((user, friend), -999999) for every entry in a user's friend list. The
      large negative weight drives the pair's summed score below zero, so
      the `> 0` filter in get_sort_recommendation discards existing friends.
    * ((a, b), 1) for every ordered pair of different positions in one
      user's friend list, i.e. one shared friend between a and b.

  Args:
    friends: RDD-like collection of (user_id, [friend_id, ...]) tuples
      supporting flatMap() and union().

  Returns:
    The union of both record sets. No aggregation happens here.
  """
  # Imported locally: no visible cell imports itertools, so on a fresh kernel
  # the original lambda failed with NameError as soon as it was evaluated.
  import itertools

  already_friends_weight = -999999

  def direct_records(entry):
    # One penalty record per existing friendship of this user.
    return [((entry[0], friend), already_friends_weight) for friend in entry[1]]

  def mutual_records(entry):
    # Any two people in the same friend list share this user as a friend.
    return [(pair, 1) for pair in itertools.permutations(entry[1], 2)]

  direct_connections = friends.flatMap(direct_records)
  mutual_connections = friends.flatMap(mutual_records)
  return direct_connections.union(mutual_connections)

In [73]:
def get_sort_recommendation(combined_connections):
  """Rank candidates per user and return each user's top ten.

  Weights are summed per (user, other) pair and only pairs with a positive
  total are kept. Each user's remaining candidates are ordered by total
  (highest first), ties broken by the candidate id compared as an integer,
  and cut to the first ten.

  Args:
    combined_connections: RDD-like collection of ((user, other), weight)
      records supporting reduceByKey(), filter(), map(), groupByKey(),
      mapValues() and collect().

  Returns:
    A list of (user, [candidate_id, ...]) tuples, collected to the driver.

  Raises:
    ValueError: if a candidate id cannot be converted with int().
  """
  def by_count_then_id(candidate):
    shared, other = candidate
    return (-shared, int(other))

  def top_ten_ids(candidates):
    ranked = sorted(candidates, key=by_count_then_id)
    return [other for _, other in ranked[:10]]

  pair_scores = combined_connections.reduceByKey(lambda left, right: left + right)
  positive = pair_scores.filter(lambda scored: scored[1] > 0)
  # Re-key by the user being advised; carry (score, candidate) as the value.
  per_user = positive.map(lambda scored: (scored[0][0], (scored[1], scored[0][1])))
  grouped = per_user.groupByKey().mapValues(list)
  recommendations = grouped.map(lambda kv: (kv[0], top_ten_ids(kv[1])))
  return recommendations.collect()

In [74]:
# End-to-end run: parse the adjacency list, score candidate pairs, rank them,
# then write one "user<TAB>rec1,rec2,..." line per user.
lines = sc.textFile("/content/soc-LiveJournal1Adj.txt")
friends = extract_friends(lines)
combined_connections = create_map(friends)
# get_sort_recommendation() collects, so this is a plain list on the driver.
recommendation = get_sort_recommendation(combined_connections)
# The misspelled name is kept as-is because the next cell reads it.
recommendation_ouput = [
    str(entry[0]) + "\t" + ",".join(str(rec_id) for rec_id in entry[1])
    for entry in recommendation
]
# repartition(1) puts every line into a single partition before saving.
(
    sc.parallelize(recommendation_ouput)
    .repartition(1)
    .saveAsTextFile("output_complete_recommendation")
)

In [75]:
# Spot-check: print the recommendations for a fixed set of user ids.
user_ids = [
    '11', '8997', '2791', '4985', '8961', '4049',
    '5060', '739', '1724', '9550', '3151',
]
recommendation_rdd = sc.parallelize(recommendation_ouput)
# Keep only the lines whose leading (pre-tab) field is a requested id.
filtered_active_rdd = recommendation_rdd.filter(
    lambda record: record.split("\t")[0] in user_ids
)
filtered_active = filtered_active_rdd.collect()
# Lines print in the order collect() returns them, not in user_ids order.
for rec in filtered_active:
    user_id, friends = rec.split("\t")
    print(f"{user_id}\t{friends.replace(',', ', ')}\n")
# Stop the SparkContext; cells that use `sc` cannot be re-run after this
# without recreating it.
sc.stop()

9550	9554, 9533, 9544, 9558, 153, 1220, 1421, 1436, 1951, 2413

8997	8998, 8987, 8992, 9001, 9003, 9009, 4849, 7174, 7279, 7364

4985	79, 577, 4839, 4984, 4986, 4987, 4988, 4989, 4990, 4991

4049	4871, 4875, 4889, 8492, 8685, 439, 660, 1100, 1137, 1156

2791	21185, 8783, 13280, 18359, 18363, 23667, 35740, 2204, 2786, 5996

3151	3161, 43162, 3230, 3450, 8692, 161, 2036, 3136, 3137, 3162

1724	1711, 1663, 1712, 1718, 1662, 1697, 1700, 1715, 1716, 1658

5060	5052, 5057, 5086, 14271, 98, 364, 575, 596, 611, 622

11	27552, 7785, 27573, 27574, 27589, 27590, 27600, 27617, 27620, 27667

8961	12241, 8973, 8965, 8963, 8966, 8967, 7174, 8969, 12243, 7177

739	732, 367, 381, 336, 21526, 28064, 677, 704, 728, 736

