# Install Spark

In [0]:
# Install latest version of spark. If error, check the latest and replace "spark-2.4.4"
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Part 1. Friendship Recommendation
Modify friends.py provided

In [0]:
from pyspark import SparkConf, SparkContext
import pyspark
import sys
from collections import defaultdict

In [0]:
# Finished. Return RDD
def getData(sc, filename):
    """
    Load data from raw text file into RDD and transform.
    Hint: transfromation you will use: map(<lambda function>).
    Args:
        sc (SparkContext): spark context.
        filename (string): hw2.txt cloud storage URI.
    Returns:
        RDD: RDD list of tuple of (<User>, [friend1, friend2, ... ]),
        each user and a list of user's friends
    """
    # read text file into RDD
    data = sc.textFile(filename)

    # TODO: implement your logic here
    data = data.map(lambda line: np.array([str(x) for x in line.replace('\n','').split('\t')]))
    data = data.map(lambda p:(int(p[0]), p[1].split(',')))

    return data

In [0]:
def mapFriends(line):
    """
    List out every pair of mutual friends, also record direct friends.
    Hint:
    For each <User>, record direct friends into a list:
    [(<User>, (friend1, 0)),(<User>, (friend2, 0)), ...],
    where 0 means <User> and friend are already direct friend,
    so you don't need to recommand each other.

    For friends in the list, each of them has a friend <User> in common,
    so for each of them, record mutual friend in both direction:
    (friend1, (friend2, 1)), (friend2, (friend1, 1)),
    where 1 means friend1 and friend2 has a mutual friend <User> in this "line"

    There are possibly multiple output in each input line,
    we applied flatMap to flatten them when using this function.
    Args:
        line (tuple): tuple in data RDD
    Yields:
        RDD: rdd like a list of (A, (B, 0)) or (A, (C, 1))
    """
    friends = line[1]
    user = line[0]

    if friends != ['']:
        for i in range(len(friends)):
            # Direct friend
            # TODO: implement your logic here
            yield((user,(int(friends[i]),0)))
            yield((int(friends[i]),(user,0)))

            for j in range(i+1, len(friends)):
                # Mutual friend in both direction
                # TODO: implement your logic here
                yield((int(friends[i]), (int(friends[j]),1)))
                yield((int(friends[j]), (int(friends[i]),1)))

In [0]:
def findMutual(line):
    """
    Find top 10 mutual friend for each person.
    Hint: For each <User>, input is a list of tuples of friend relations,
    whether direct friend (count = 0) or has friend in common (count = 1)

    Use friendDict to store the number of mutual friend that the current <User>
    has in common with each other <User> in tuple.
    Input:(User1, [(User2, 1), (User3, 1), (User2, 1), (User3, 0), (User2, 1)])
    friendDict stores: {User2:3, User3:1}
    directFriend stores: User3

    If a user has many mutual frineds and is not a direct frined, we recommend
    them to be friends.

    Args:
        line (tuple): a tuple of (<User1>, [(<User2>, 0), (<User3>, 1)....])
    Returns:
        RDD of tuple (line[0], returnList),
        returnList is a list of recommended friends
    """
    # friendDict, Key: user, value: count of mutual friends
    friendDict = defaultdict(int)
    # set of direct friends
    directFriend = set()
    # initialize return list
    returnList = []

    # TODO: Iterate through input to aggregate counts
    # save to friendDict and directFriend
    user = line[0]
    friends = list(line[1])
    for i in range(len(friends)):
        len(friends[i])
        if friends[i][1] == 0:
            directFriend.add(friends[i][0])
        else:
            friendDict[friends[i][0]] = friendDict.get(friends[i][0],0) + 1

    # TODO: Formulate output
    sorted_friendDict = sorted(friendDict.items(), key = lambda x:x[1], reverse=True)
    for i in sorted_friendDict:
        if len(returnList) == 10:
            break
        elif i[0] in directFriend:
            continue
        else:
            returnList.append(i[0])

    return (line[0], returnList)

In [0]:
#def main():
# Configure Spark
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
# The directory for the file
filename = "/content/gdrive/My Drive/BigData/q1.txt"

In [0]:
# Get data in proper format
data = getData(sc, filename)

In [0]:
data.take(5)

In [85]:
# test mapFriends()
data.flatMap(mapFriends).take(5)

[(0, (1, 0)), (1, (0, 0)), (1, (2, 1)), (2, (1, 1)), (1, (3, 1))]

In [0]:
# Get set of all mutual friends
mapData = data.flatMap(mapFriends).groupByKey()
# mapData.take(5)

In [87]:
# For each person, get top 10 mutual friends
getFriends = mapData.map(findMutual)
getFriends.take(5)

[(43030, [439, 14064, 29688, 14207, 18163, 7476, 2017, 1689, 45235, 5670]),
 (42548, [42547, 14319, 35875, 17478, 4736, 12438, 12715, 10680, 5811, 13621]),
 (13420, [7651, 14264, 4736, 10469, 8508, 8711, 8810, 10500, 10532, 24140]),
 (48868,
  [48837, 48892, 48816, 48833, 48834, 48849, 48852, 48924, 48818, 48823]),
 (7166, [45981, 45974, 45975, 2986, 3703, 18344, 25747, 25801, 41778, 8959])]

In [0]:
# Only save the ones we want
wanted = [924, 8941, 8942, 9019, 49824, 13420, 44410, 8974, 5850, 9993]
result = getFriends.filter(lambda x: x[0] in wanted).collect()

In [89]:
for i in result:
    print(i)

(8942, [8939, 8940, 8943, 8944])
(44410, [4231, 44462, 351, 6318, 9095, 10328, 10462, 12210, 13238, 14052])
(8974, [8960, 12241, 8774, 6973, 8969, 8980, 8982, 8984, 8978, 8979])
(5850, [5819, 5805, 5811, 5815, 5828, 5831, 5836, 219, 5804, 5806])
(924, [6995, 439, 2409, 11860, 15416, 43748, 45881])
(49824, [49846, 43382, 41581, 49786, 49788, 49789, 49814, 49819, 49834, 53])
(13420, [7651, 14264, 4736, 10469, 8508, 8711, 8810, 10500, 10532, 24140])
(8941, [8943, 8944, 8940])
(9993, [9991, 13134, 13478, 13877, 34299, 34485, 34642, 37941])
(9019, [9022, 317, 9023])


In [0]:
sc.stop()

5850
13420
44410
49824

# test split

In [0]:
textdata = []
for line in open(filename,'r').readlines():
    tmp = line.replace('\n','')
    tmp = tmp.split('\t')
    i0 = int(tmp[0])
    i1 = tmp[1].split(',')
    if i1 == ['']:
        continue
    else:
        try:
            i1 = [int(x) for x in i1]
        except ValueError:
            print("value error, input", i0,',friends:', i1,"len,",len(i[1]))
    textdata.append((i0,i1))

In [70]:
np.shape(textdata)

(1, 2)

In [71]:
textdata

[(0,
  [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   83,
   84,
   85,
   86,
   87,
   88,
   89,
   90,
   91,
   92,
   93,
   94])]