## **Configurations**

In [1]:
# Environment setup for this notebook: PySpark, PyDrive and a headless OpenJDK 8
# (presumably the JVM that PySpark will use; JAVA_HOME is pointed at it in a later cell).
# NOTE(review): no versions are pinned; the captured output below shows pyspark 3.3.2 being resolved.
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 281.4/281.4 MB 4.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 KB 15.5 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=2e625cbcd9c83b156e7a580a05aa6e263348c11a499673da6878ef2c0bfa5a25
  Stored in directory: /root/.cache/pip/wheels/b1/59/a0/a1a0624b5e865fd389919c1a10f53aec9b12195d6747710baf
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
import os
# Export the location of the Java 8 runtime through the JAVA_HOME environment
# variable (presumably read by PySpark when it starts its JVM).
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JAVA_HOME"] = JAVA_HOME

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
# Sign the current Colab user in first; the credential lookup below presumably depends on it.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by any later cell visible here; confirm whether it is still needed.
drive = GoogleDrive(gauth)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

## **People You Might Know**

In [6]:
# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running. Calling the
# SparkContext constructor a second time in the same kernel raises an error,
# which made this cell impossible to re-run.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the session that wraps the active context.
spark = SparkSession.builder.getOrCreate()

In [7]:
def _parse_edge(line):
  """Turn one whitespace-separated "a b" text line into an (int, int) pair."""
  fields = line.split()  # split once rather than once per endpoint
  return (int(fields[0]), int(fields[1]))

# Raw edge list: one whitespace-separated pair of user ids per line.
rdd = spark.sparkContext.textFile("ego-facebook.txt")
# Pairs exactly as written in the file; blank lines are skipped instead of
# raising IndexError inside the parser.
rdd1 = rdd.filter(lambda line: line.strip()).map(_parse_edge)
# The same pairs with the two ids swapped.
rdd2 = rdd1.map(lambda x: (x[1], x[0]))
# Both directions together, so every id appears as a key for each of its edges.
rdd3 = rdd1.union(rdd2)

In [8]:
# Adjacency list: gather the values of each key into a list, then order the
# records by key (ascending) inside a single partition.
rdd4 = (
  rdd3.groupByKey()
  .mapValues(list)
  .sortByKey(ascending=True, numPartitions=1)
)

In [9]:
def sort_friends(elmt):
  """Return ``(person, friends)`` with the friend ids in ascending order.

  Args:
    elmt: A ``(person, friends)`` pair; ``friends`` is an iterable of ids.

  Returns:
    A 2-tuple of the unchanged person id and a new, sorted list of friends.
    The input collection is not modified.
  """
  ordered = list(elmt[1])
  ordered.sort()
  return (elmt[0], ordered)

In [10]:
# Each element of rdd4 becomes (person, [friends]) with the friend ids listed
# in ascending order. The name rdd4 is rebound to the mapped RDD; re-running
# this cell only re-sorts lists that are already sorted.
rdd4 = rdd4.map(sort_friends) 

In [11]:
# Driver-side list (all_people) of every id that appears on either side of a
# pair in rdd1, de-duplicated and in ascending order.
rdd_keys, rdd_values = rdd1.keys(), rdd1.values()
rdd_all = rdd_keys.union(rdd_values)
all_people = sorted(rdd_all.distinct().collect())

In [12]:
def not_friends(elmt, people=None):
  """Attach to a person the list of everyone they are not yet friends with.

  Args:
    elmt: A ``(person, friends)`` pair, ``friends`` being a list of ids.
    people: Optional iterable of all known ids, in the order in which the
      strangers should be reported. Defaults to the notebook-level
      ``all_people`` list.

  Returns:
    ``(person, friends, strangers)`` where ``strangers`` holds every id from
    ``people`` that is neither ``person`` nor one of ``friends``, in the
    order of ``people``. ``people`` itself is left untouched.
  """
  person = elmt[0]
  friends = elmt[1]  # friends of this person: list
  if people is None:
    people = all_people

  # One set lookup per candidate instead of one list.remove() per friend:
  # linear rather than quadratic, and a repeated friend id, a self-loop or an
  # id missing from `people` no longer raises ValueError.
  excluded = set(friends)
  excluded.add(person)
  strangers = [candidate for candidate in people if candidate not in excluded]

  return (person, friends, strangers)

In [13]:
# Each element of rdd5: (person, [friends], [strangers]), where the strangers
# are taken from all_people and exclude the person and their friends.
rdd5 = rdd4.map(not_friends) 

In [None]:
# rdd5.collect()

In [15]:
# Bring the whole adjacency list, as (person, [friends]) records, into a plain
# Python list; mutual_friends reads this module-level name to find the friends
# of each stranger.
rdd4_list = rdd4.collect()

In [16]:
def mutual_friends(elmt, friend_lists=None):
  """Count the friends a person shares with each of their strangers.

  Args:
    elmt: A ``(person, friends, strangers)`` triple.
    friend_lists: Optional adjacency data, either a mapping
      ``{person: friends}`` or an iterable of ``(person, friends)`` pairs.
      Defaults to the notebook-level ``rdd4_list``.

  Returns:
    ``(person, [(mutual_count, stranger), ...])`` listing, in the order of
    ``strangers``, only the strangers that share at least one friend with
    ``person``.
  """
  person = elmt[0]
  friends = elmt[1]  # friends of this person: list
  strangers = elmt[2]

  if friend_lists is None:
    friend_lists = rdd4_list
  # Index the adjacency data once per call so each stranger costs one hash
  # lookup instead of a full scan over every record.
  if isinstance(friend_lists, dict):
    lookup = friend_lists
  else:
    lookup = dict(friend_lists)

  own_friends = set(friends)  # loop-invariant, so built once

  mutual_strangers = []
  for stranger in strangers:
    # A stranger absent from the adjacency data is treated as having no
    # friends, rather than silently reusing the previous stranger's list.
    shared = own_friends.intersection(lookup.get(stranger, ()))
    if shared:
      mutual_strangers.append((len(shared), stranger))

  return (person, mutual_strangers)

In [17]:
# Each element of rdd6:
# (person, [(number of mutual friends, stranger), ...]); strangers that share
# no friend with the person are left out of the list.
rdd6 = rdd5.map(mutual_friends)

In [20]:
def recommendation(elmt):
  """Format the ten strangers with the most mutual friends as one text line.

  Args:
    elmt: ``(person, [(mutual_count, stranger), ...])``.

  Returns:
    The person id, a tab character, then up to ten stranger ids joined by
    commas, ordered by descending mutual count. Candidates with equal counts
    keep their relative input order.
  """
  ranked = sorted(elmt[1], key=lambda pair: pair[0], reverse=True)
  top_ids = [str(pair[1]) for pair in ranked[:10]]
  return str(elmt[0]) + "\t" + ','.join(top_ids)

In [21]:
# Each element of rdd7: one text line, the person id followed by a tab and
# the comma-separated recommended ids.
rdd7 = rdd6.map(recommendation)

In [None]:
# rdd7.collect()

In [22]:
# Write the recommendation lines as text under the "output" path.
# NOTE(review): Spark normally refuses to write to a path that already exists;
# confirm, and clear "output" before re-running this cell.
rdd7.saveAsTextFile("output")