In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -q

Reading package lists...
Building dependency tree...
Reading state information...
openjdk-8-jdk-headless is already the newest version (8u382-ga-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [None]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from operator import itemgetter


In [None]:
# Make Google Drive visible to this runtime so files stored there can be
# read through ordinary filesystem paths.
from google.colab import drive

# Mounting asks for authorization the first time it runs in a session.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input location and support threshold, hoisted so they are easy to find and tune.
DATA_PATH = "/content/drive/My Drive/Maschine Learning/browsing.txt"
MIN_SUPPORT = 100  # minimum number of transactions an item must appear in

# getOrCreate() reuses a live context, so re-running this cell does not fail
# because a SparkContext is already running in the kernel.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
data = sc.textFile(DATA_PATH)


# Step 1: Split each line into the distinct items of one transaction.
# Duplicates inside a line are dropped so that support counts transactions,
# not occurrences; sorting keeps the per-transaction order deterministic.
# The RDD is cached because the pair-counting step reads it again.
items = data.map(lambda line: sorted(set(line.split()))).cache()

# Step 2: Flatten the list of items within each transaction
flattened_items = items.flatMap(lambda item_list: item_list)

# Step 3: Count the support for each item.
# countByValue() returns the counts to the driver as a dict {item: count}.
item_support = flattened_items.countByValue()

# Step 4: Keep only the items that meet the support threshold
frequent_items = [item for item, support in item_support.items() if support >= MIN_SUPPORT]

# Output for Step 4
print(f"Frequent Items (Support >= {MIN_SUPPORT}):")
for item in frequent_items:
    print(item)



In [None]:
# Step 5: Create candidate pairs and compute their support
MIN_PAIR_SUPPORT = 100  # minimum number of transactions a pair must appear in

# A pair can only reach the threshold if both of its items do, so pairs are
# built from frequent items only. This keeps the dict that countByValue()
# returns to the driver small without dropping any frequent pair.
frequent_item_set = set(frequent_items)


def candidate_pairs(item_list):
    """Return each unordered pair of distinct frequent items in one transaction.

    Items are de-duplicated first so a transaction contributes at most one
    count per pair. Every pair is emitted once as (smaller, larger) in
    lexicographic order.
    """
    basket = sorted(set(item_list) & frequent_item_set)
    return [(first, second) for idx, first in enumerate(basket) for second in basket[idx + 1:]]


pairs = items.flatMap(candidate_pairs)
pair_support = pairs.countByValue()

# Step 6: Keep only the pairs that meet the support threshold
frequent_pairs = [(pair, support) for pair, support in pair_support.items() if support >= MIN_PAIR_SUPPORT]

# Output for Step 6
print(f"\nFrequent Item Pairs (Support >= {MIN_PAIR_SUPPORT}):")
for pair, support in frequent_pairs:
    print(f"Pair: {pair}, Support: {support}")

In [None]:
# Step 7: Compute and sort association rules
TOP_N_RULES = 10  # number of highest-confidence rules to print

# Each frequent pair {X, Y} yields two rules. A rule is stored as
# ((antecedent, consequent), confidence), where
# confidence(X => Y) = support({X, Y}) / support(X).
association_rules = []

for (item1, item2), support in frequent_pairs:
    # Confidence for X => Y
    confidence_X_to_Y = support / item_support[item1]
    association_rules.append(((item1, item2), confidence_X_to_Y))

    # Confidence for Y => X
    confidence_Y_to_X = support / item_support[item2]
    association_rules.append(((item2, item1), confidence_Y_to_X))

# Sort by confidence, highest first. Ties are broken lexicographically on
# (antecedent, consequent) so the listing does not depend on the iteration
# order of the dicts the rules were built from.
sorted_rules = sorted(association_rules, key=lambda rule_conf: (-rule_conf[1], rule_conf[0]))

# Output for Step 7
print("\nTop Association Rules (Sorted by Confidence):")
for idx, (rule, confidence) in enumerate(sorted_rules[:TOP_N_RULES]):
    item_X, item_Y = rule
    print(f"Rule {idx + 1}: If a customer browses '{item_X}', they will also browse '{item_Y}' with confidence: {confidence:.3f}")

# Release the Spark resources. Cells that use `sc` need the context to be
# created again before they can be re-run.
sc.stop()


Top Association Rules (Sorted by Confidence):
Rule 1: If a customer browses 'DAI93865', they will also browse 'FRO40251' with confidence: 1.000
Rule 2: If a customer browses 'GRO85051', they will also browse 'FRO40251' with confidence: 0.999
Rule 3: If a customer browses 'GRO38636', they will also browse 'FRO40251' with confidence: 0.991
Rule 4: If a customer browses 'ELE12951', they will also browse 'FRO40251' with confidence: 0.991
Rule 5: If a customer browses 'DAI88079', they will also browse 'FRO40251' with confidence: 0.987
Rule 6: If a customer browses 'FRO92469', they will also browse 'FRO40251' with confidence: 0.984
Rule 7: If a customer browses 'DAI43868', they will also browse 'SNA82528' with confidence: 0.973
Rule 8: If a customer browses 'DAI23334', they will also browse 'DAI62779' with confidence: 0.955
Rule 9: If a customer browses 'ELE92920', they will also browse 'DAI62779' with confidence: 0.733
Rule 10: If a customer browses 'DAI53152', they will also browse 'FRO40