In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=64e021db8a3ebdf468952dc03b565db316220ae6f29f578306b0f64929d7f687
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Presumably authenticate_user() has to run first so that the application
# default credentials fetched below exist -- confirm before reordering.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by any cell visible in this notebook;
# confirm whether the Drive client is still needed.
drive = GoogleDrive(gauth)



In [48]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import pandas as pd
import itertools

# Obtain a Spark session (getOrCreate returns an existing one when available).
spark = (
    SparkSession.builder
    .getOrCreate()
)

# The session's SparkContext, used below for the RDD API.
sc = spark.sparkContext

In [49]:
# Mining parameters.
support_threshold = 120  # minimum support for an item / itemset to be kept
top_n = 5                # how many rules are printed per section

# One text line per browsing session; each line is split on whitespace into
# its list of item tokens.
data = sc.textFile("/content/browsing.txt")
session = data.map(lambda line: line.split())

In [50]:
def compute_frequent_items(session, support_threshold):
    """Count item supports and keep the items that reach the threshold.

    Support is the number of baskets containing the item, so every basket
    contributes at most one count per item even when the item is repeated
    inside that basket.

    Args:
        session: RDD of baskets, each an iterable of hashable items.
        support_threshold: Minimum support an item needs to be kept.

    Returns:
        dict mapping each frequent item to its support, with keys inserted
        in sorted order.
    """
    freq_items_support = (session
        # set(basket) removes in-basket repeats so support counts baskets,
        # not occurrences.
        .flatMap(lambda basket: [(item, 1) for item in set(basket)])
        .reduceByKey(lambda count1, count2: count1 + count2)
        .filter(lambda item: item[1] >= support_threshold)
        .sortByKey())

    return dict(freq_items_support.collect())

def build_pairs(basket, freq_items):
    """Emit every pair of distinct frequent items that co-occur in a basket.

    Args:
        basket: Iterable of items from one session.
        freq_items: Mapping from frequent item to its support.

    Returns:
        A list of ``((item_a, item_b), (support_a, support_b, 1))`` records
        with ``item_a < item_b``; the trailing ``1`` is this basket's
        contribution to the pair count. Empty when the basket holds fewer
        than two distinct frequent items.
    """
    # Deduplicate so a repeated item can neither form a self-pair (A, A) nor
    # count the same pair twice for one basket; dropping infrequent items up
    # front keeps the number of candidate pairs small. Sorting makes every
    # combination come out already ordered as (smaller, larger).
    candidates = sorted({item for item in basket if item in freq_items})
    return [
        ((first, second), (freq_items[first], freq_items[second], 1))
        for first, second in itertools.combinations(candidates, 2)
    ]

def compute_pair_support(session, freq_items):
    """Aggregate per-basket pair records into per-pair support counts.

    Args:
        session: RDD of baskets.
        freq_items: Mapping from frequent item to its support.

    Returns:
        RDD of ``(pair, (s1, s2, pair_count))`` where ``pair_count`` is the
        sum of the per-basket counts emitted by ``build_pairs``.
    """
    def merge(left, right):
        # Keep the two item supports from the left value; only the pair
        # count (index 2) accumulates.
        return (left[0], left[1], left[2] + right[2])

    candidate_pairs = session.flatMap(
        lambda basket: build_pairs(basket, freq_items)
    )
    return candidate_pairs.reduceByKey(merge)

def compute_pair_confidence(pair_support):
    """Turn pair-support records into confidence scores for both rule directions.

    Args:
        pair_support: RDD of ``((i1, i2), (s1, s2, s12))`` records.

    Returns:
        RDD of ``((antecedent, consequent), confidence)`` with two entries
        per input record: ``i1 -> i2`` scored ``s12 / s1`` and ``i2 -> i1``
        scored ``s12 / s2``.
    """
    def rules_for(record):
        items, supports = record
        left, right = items
        left_support, right_support, joint_support = supports
        forward = ((left, right), joint_support / left_support)
        backward = ((right, left), joint_support / right_support)
        return [forward, backward]

    return pair_support.flatMap(rules_for)

# Pass 1: frequent single items.
freq_items = compute_frequent_items(session, support_threshold)
print("Number of frequent item sets:", len(freq_items))

# Pass 2: pair supports and the confidence of each pair rule.
pair_support = compute_pair_support(session, freq_items)
conf_pair = compute_pair_confidence(pair_support)

# Rank rules by descending confidence, breaking ties on the antecedent.
sorted_pairs = conf_pair.collect()
sorted_pairs.sort(key=lambda rule: (-rule[1], rule[0][0]))

for (a, b), conf in sorted_pairs[:top_n]:
    print(f"{a} -> {b} = {conf:.10f}")

Number of frequent item sets: 553
DAI93865 -> FRO40251 = 1.0000000000
GRO85051 -> FRO40251 = 0.9991762768
DAI88079 -> FRO40251 = 0.9867256637
FRO92469 -> FRO40251 = 0.9835100118
DAI43868 -> SNA82528 = 0.9729729730


In [51]:
def compute_frequent_pair_support(pair_support, support_threshold):
    """Keep only the pair records whose pair count reaches the threshold.

    Args:
        pair_support: RDD of ``(pair, (s1, s2, pair_count))`` records.
        support_threshold: Minimum pair count for a record to be kept.

    Returns:
        The filtered RDD, records unchanged.
    """
    def is_frequent(record):
        supports = record[1]
        return supports[2] >= support_threshold

    return pair_support.filter(is_frequent)

def build_triples(basket, freq_items, freq_pairs):
    """Emit every candidate triple of distinct frequent items in a basket.

    A triple is emitted only when all three of its items are in
    ``freq_items`` and all three of its 2-item subsets are in ``freq_pairs``.

    Args:
        basket: Iterable of items from one session.
        freq_items: Container of frequent items (membership is all that is used).
        freq_pairs: Mapping from a sorted frequent pair ``(a, b)`` to its support.

    Returns:
        A list of ``((i1, i2, i3), (s12, s13, s23, 1))`` records with
        ``i1 < i2 < i3``. The pair supports are laid out in the order
        ``compute_triple_confidence`` unpacks them, and the trailing ``1`` is
        this basket's contribution to the triple count.
    """
    # Deduplicate so a repeated item cannot count a triple more than once
    # per basket, and drop infrequent items before the cubic enumeration.
    candidates = sorted({item for item in basket if item in freq_items})

    triples = []
    for triple in itertools.combinations(candidates, 3):
        # combinations() on the sorted triple yields (i1,i2), (i1,i3), (i2,i3),
        # i.e. exactly the (s12, s13, s23) order the consumer expects.
        pairs = list(itertools.combinations(triple, 2))
        if all(pair in freq_pairs for pair in pairs):
            supports = tuple(freq_pairs[pair] for pair in pairs)
            triples.append((triple, supports + (1,)))
    return triples

def compute_triple_confidence(rdd):
    """Expand one triple-support record into its three association rules.

    Args:
        rdd: Despite the name, a single ``((i1, i2, i3), (s12, s13, s23, s123))``
            record, as passed per element by ``flatMap``.

    Returns:
        A list of three ``((x, y, z), confidence)`` entries, each read as the
        rule ``x, y -> z`` and scored as ``s123`` divided by the support
        unpacked for the antecedent pair.
    """
    items, supports = rdd
    i1, i2, i3 = items
    s12, s13, s23, s123 = supports

    # (antecedent pair, consequent, support unpacked for that pair)
    candidates = (
        ((i1, i2), i3, s12),
        ((i1, i3), i2, s13),
        ((i2, i3), i1, s23),
    )
    return [
        (antecedent + (consequent,), s123 / antecedent_support)
        for antecedent, consequent, antecedent_support in candidates
    ]

# Keep only the pairs that reach the support threshold. The original passed an
# undefined name `s` here, which raises NameError on a fresh kernel; the
# notebook's configured `support_threshold` is used instead.
freq_pair_support = compute_frequent_pair_support(pair_support, support_threshold)
# Plain dict {pair: pair_count} so it can be captured by the lambda below.
freq_pairs = {x[0]: x[1][2] for x in freq_pair_support.collect()}

conf_triple = (session
    .flatMap(lambda basket: build_triples(basket, freq_items, freq_pairs))
    # Pair supports (indices 0-2) are kept from the left value; only the
    # triple count (index 3) accumulates.
    .reduceByKey(lambda x, y: (x[0], x[1], x[2], x[3] + y[3]))
    .flatMap(compute_triple_confidence))

# Rank by descending confidence, then by the two antecedent items.
sorted_triples = sorted(conf_triple.collect(), key=lambda r: (-r[1], r[0][0], r[0][1]))
for rel in sorted_triples[:top_n]:
    (a, b, c), conf = rel
    line = f"{a}, {b} -> {c} = {conf:.10f}"
    print(line)
# Stops the SparkContext: earlier Spark cells must be re-run before this one
# can be executed again.
sc.stop()

DAI23334, ELE92920 -> DAI62779 = 1.0000000000
DAI55911, GRO85051 -> FRO40251 = 1.0000000000
DAI75645, GRO85051 -> FRO40251 = 1.0000000000
ELE17451, GRO85051 -> FRO40251 = 1.0000000000
ELE20847, FRO92469 -> FRO40251 = 1.0000000000
