# Install Spark

In [0]:
# Install latest version of spark. If error, check the latest and replace "spark-2.4.4"
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
import findspark
findspark.init()

In [8]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Part 1. Friendship Recommendation
Modify friends.py provided

In [0]:
from pyspark import SparkConf, SparkContext
import pyspark
import sys
from collections import defaultdict

In [0]:
# Finished. Return RDD
def getData(sc, filename):
    """
    Load data from raw text file into RDD and transform.
    Hint: transfromation you will use: map(<lambda function>).
    Args:
        sc (SparkContext): spark context.
        filename (string): hw2.txt cloud storage URI.
    Returns:
        RDD: RDD list of tuple of (<User>, [friend1, friend2, ... ]),
        each user and a list of user's friends
    """
    # read text file into RDD
    data = sc.textFile(filename)

    # TODO: implement your logic here
    data = data.map(lambda line: np.array([str(x) for x in line.split('\t')]))
    data = data.map(lambda p:(int(p[0]), p[1].split(',')))

    return data

## mapFriends 对间接的启用双向，那直接的需要吗？！还有，会有重复的吗？～

In [0]:
def mapFriends(line):
    """
    List out every pair of mutual friends, also record direct friends.
    Hint:
    For each <User>, record direct friends into a list:
    [(<User>, (friend1, 0)),(<User>, (friend2, 0)), ...],
    where 0 means <User> and friend are already direct friend,
    so you don't need to recommand each other.

    For friends in the list, each of them has a friend <User> in common,
    so for each of them, record mutual friend in both direction:
    (friend1, (friend2, 1)), (friend2, (friend1, 1)),
    where 1 means friend1 and friend2 has a mutual friend <User> in this "line"

    There are possibly multiple output in each input line,
    we applied flatMap to flatten them when using this function.
    Args:
        line (tuple): tuple in data RDD
    Yields:
        RDD: rdd like a list of (A, (B, 0)) or (A, (C, 1))
    """
    friends = line[1]
    user = line[0]
    for i in range(len(friends)):
        # Direct friend
        # TODO: implement your logic here
        yield((user,(int(friends[i]),0)))

        for j in range(i+1, len(friends)):
            # Mutual friend in both direction
            # TODO: implement your logic here
            yield((int(friends[i]), (int(friends[j]),1)))
            yield((int(friends[j]), (int(friends[i]),1)))

In [0]:
def findMutual(line):
    """
    Find top 10 mutual friend for each person.
    Hint: For each <User>, input is a list of tuples of friend relations,
    whether direct friend (count = 0) or has friend in common (count = 1)

    Use friendDict to store the number of mutual friend that the current <User>
    has in common with each other <User> in tuple.
    Input:(User1, [(User2, 1), (User3, 1), (User2, 1), (User3, 0), (User2, 1)])
    friendDict stores: {User2:3, User3:1}
    directFriend stores: User3

    If a user has many mutual frineds and is not a direct frined, we recommend
    them to be friends.

    Args:
        line (tuple): a tuple of (<User1>, [(<User2>, 0), (<User3>, 1)....])
    Returns:
        RDD of tuple (line[0], returnList),
        returnList is a list of recommended friends
    """
    # friendDict, Key: user, value: count of mutual friends
    friendDict = defaultdict(int)
    # set of direct friends
    directFriend = set()
    # initialize return list
    returnList = []

    # TODO: Iterate through input to aggregate counts
    # save to friendDict and directFriend
    user = line[0]
    friends = line[1]
    for i in range(len(friends)):
      if friends[i][1] == 0:
        directFriend.add(friends[i][0])
      else:
        friendDict[friends[i][0]] = friendDict.get(friends[i][0],0) + 1
        

    # TODO: Formulate output


    return (line[0], returnList)

In [0]:
#def main():
# Configure Spark
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
# The directory for the file
filename = "/content/gdrive/My Drive/BigData/q1.txt"

In [0]:
# Get data in proper format
data = getData(sc, filename)

In [17]:
# test mapFriends()
# data.flatMap(mapFriends).take(5)

[(0, (1, 0)), (1, (2, 1)), (2, (1, 1)), (1, (3, 1)), (3, (1, 1))]

In [8]:
# Get set of all mutual friends
mapData = data.flatMap(mapFriends).groupByKey()
mapData.take(5)

[('17061', <pyspark.resultiterable.ResultIterable at 0x7f8530d916a0>),
 ('16643', <pyspark.resultiterable.ResultIterable at 0x7f85313355c0>),
 ('23869', <pyspark.resultiterable.ResultIterable at 0x7f8530546b38>),
 ('14871', <pyspark.resultiterable.ResultIterable at 0x7f8530d91400>),
 ('43832', <pyspark.resultiterable.ResultIterable at 0x7f85304e4b70>)]

In [0]:




# For each person, get top 10 mutual friends
getFriends = mapData.map(findMutual)

# Only save the ones we want
wanted = [924, 8941, 8942, 9019, 49824, 13420, 44410, 8974, 5850, 9993]
result = getFriends.filter(lambda x: x[0] in wanted).collect()

sc.stop()