## **Configurations**

In [1]:
# Environment setup: install PySpark and PyDrive with pip, and a headless Java 8 JDK with apt.
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u362-ga-0ubuntu1~20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.


In [2]:
import os
# Point JAVA_HOME at the Java 8 OpenJDK directory for this process and its children.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced again in the visible cells — confirm it is still needed.
drive = GoogleDrive(gauth)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

## **(c) Association rules from frequent item pairs**

In [6]:
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Get the context idempotently: constructing a second SparkContext in the same
# kernel raises an error, so re-running this cell must reuse the running one.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Get (or create) the SparkSession on top of that context.
spark = SparkSession.builder.getOrCreate()

In [7]:
# Load the input file as an RDD of text lines; each line is later split on whitespace into items.
rdd = sc.textFile("browsing.txt")

In [8]:
# One basket per input line: the whitespace-separated tokens, de-duplicated within the line.
item_distinct = rdd.map(lambda line: list(set(line.split())))

# Flatten the baskets into (item, 1) records.
item_distinct_ = (
    item_distinct
    .flatMap(lambda basket: basket)
    .map(lambda item: (item, 1))
)

In [9]:
# Count, for every item, the number of baskets that contain it (baskets are
# de-duplicated, so each basket contributes at most 1), and keep items whose
# count is at least 100.
frequent_items_counts = (
    item_distinct
    .flatMap(lambda basket: basket)
    .map(lambda item: (item, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda item_count: item_count[1] >= 100)
)

In [10]:
# Collect the frequent-item counts to the driver as a dict: item -> count.
frequent_items_counts_dict = frequent_items_counts.collectAsMap()

In [11]:
# Driver-side list of the frequent items (the keys of the (item, count) records).
frequent_items_list = (
    frequent_items_counts
    .map(lambda item_count: item_count[0])
    .collect()
)

In [12]:
import itertools
def filter_pairs(basket, frequent, k):
    """Return all k-item combinations of the basket items that appear in `frequent`.

    The items shared by `basket` and `frequent` are sorted before combining,
    so every combination is a tuple in ascending order and the returned list
    follows the order produced by itertools.combinations.
    """
    shared_items = set(frequent).intersection(basket)
    return [combo for combo in itertools.combinations(sorted(shared_items), k)]

In [13]:
# Count the baskets containing each pair of frequent items, and keep the pairs
# whose count is at least 100.
frequent_pairs_counts = (
    item_distinct
    .flatMap(lambda basket: filter_pairs(basket, frequent_items_list, 2))
    .map(lambda pair: (tuple(pair), 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair_count: pair_count[1] >= 100)
)

In [14]:
# Driver-side list of the frequent pairs (the keys of the (pair, count) records).
frequent_pairs_list = (
    frequent_pairs_counts
    .map(lambda pair_count: pair_count[0])
    .collect()
)

In [15]:
# Build the lookup set once on the driver so it is not reconstructed from the
# list for every basket inside the lambda.
frequent_pairs_set = frozenset(frequent_pairs_list)

# For each basket: the sorted list of its item pairs that are frequent pairs.
frequent_pairs_basket = (
    item_distinct
    .map(lambda basket: filter_pairs(basket, frequent_items_list, 2))
    .map(lambda pairs: sorted(frequent_pairs_set.intersection(pairs)))
)

In [16]:
# Turn every frequent pair {A, B} into the two rules A -> B and B -> A, keyed
# as (lhs, rhs), with confidence = count(A, B) / count(lhs).
# Sort by confidence descending and break ties on the (lhs, rhs) rule itself,
# so tied rules come out in a deterministic, lexicographic order.
answer_c = (
    frequent_pairs_counts
    .flatMap(lambda pair_count: (
        ((pair_count[0][0], pair_count[0][1]), pair_count[1]),
        ((pair_count[0][1], pair_count[0][0]), pair_count[1]),
    ))
    .map(lambda rule: (rule[0], rule[1] / frequent_items_counts_dict[rule[0][0]]))
    .sortBy(lambda rule: (-rule[1], rule[0]))
)
answer_c.take(5)

[(('DAI93865', 'FRO40251'), 1.0),
 (('GRO85051', 'FRO40251'), 0.999176276771005),
 (('GRO38636', 'FRO40251'), 0.9906542056074766),
 (('ELE12951', 'FRO40251'), 0.9905660377358491),
 (('DAI88079', 'FRO40251'), 0.9867256637168141)]

## **(d) Association rules from frequent item triples**

In [17]:
# Collect the frequent-pair counts to the driver as a dict: pair tuple -> count.
frequent_pairs_counts_dict = frequent_pairs_counts.collectAsMap()

In [18]:
def combine_frequent_pairs(lst, frequent_pairs=None):
    """Build the candidate triples obtainable from a list of item pairs.

    Two pairs from `lst` that together cover exactly three items are merged
    into a triple, but only when the remaining pair (the two items the pairs
    do not share) is itself a frequent pair.

    Args:
        lst: sequence of item pairs (tuples).
        frequent_pairs: optional collection of sorted pair tuples used to
            validate the remaining pair. When omitted, the notebook-level
            `frequent_pairs_list` is used.

    Returns:
        A sorted list of unique triples, each one a sorted tuple.
    """
    triples = set()
    known_pairs = None  # built lazily: many inputs never produce a candidate

    for pair_a, pair_b in itertools.combinations(lst, 2):
        items_a, items_b = set(pair_a), set(pair_b)
        merged = items_a | items_b
        if len(merged) != 3:
            continue

        if known_pairs is None:
            source = frequent_pairs_list if frequent_pairs is None else frequent_pairs
            # Hash lookup instead of scanning a list for every candidate.
            known_pairs = source if isinstance(source, (set, frozenset)) else set(source)

        # The items not shared by the two pairs form the third pair of the triple.
        remaining_pair = tuple(sorted(items_a ^ items_b))
        if remaining_pair in known_pairs:
            triples.add(tuple(sorted(merged)))

    # Sorted so the result does not depend on set iteration order.
    return sorted(triples)

In [None]:
# Count the baskets containing each candidate triple, and keep the triples
# whose count is at least 100.
# combine_frequent_pairs always returns a list and flatMap contributes nothing
# for an empty one, so no separate map/filter stage is needed before flattening.
frequent_triples_counts = (
    frequent_pairs_basket
    .flatMap(lambda pairs: combine_frequent_pairs(pairs))
    .map(lambda triple: (triple, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda triple_count: triple_count[1] >= 100)
)

In [20]:
# Expand every frequent triple (a, b, c) into its three rules (pair) -> item,
# carrying the triple's count, then divide by the pair's count to get the
# rule confidence. Ordered by confidence descending, then pair, then item.
answer_d = (
    frequent_triples_counts
    .flatMap(lambda triple_count: (
        ((triple_count[0][0], triple_count[0][1]), triple_count[0][2], triple_count[1]),
        ((triple_count[0][0], triple_count[0][2]), triple_count[0][1], triple_count[1]),
        ((triple_count[0][1], triple_count[0][2]), triple_count[0][0], triple_count[1]),
    ))
    .map(lambda rule: (
        rule[0],
        rule[1],
        float(rule[2]) / float(frequent_pairs_counts_dict[rule[0]]),
    ))
    .sortBy(lambda rule: (-rule[2], rule[0], rule[1]))
)

answer_d.take(5)

[(('DAI23334', 'ELE92920'), 'DAI62779', 1.0),
 (('DAI31081', 'GRO85051'), 'FRO40251', 1.0),
 (('DAI55911', 'GRO85051'), 'FRO40251', 1.0),
 (('DAI62779', 'DAI88079'), 'FRO40251', 1.0),
 (('DAI75645', 'GRO85051'), 'FRO40251', 1.0)]