In [None]:
import pandas as pd
from itertools import combinations  # Added missing import
import warnings
warnings.filterwarnings('ignore')

# Load the raw transaction file. The first line is a header and is dropped.
try:
    with open("/content/drive/MyDrive/DataWarehouseDataMining/space.txt", "r") as f:
        lines = f.readlines()[1:]  # skip header line
except FileNotFoundError:
    # Name the file that is actually opened above so the hint is not misleading.
    print("Error: 'space.txt' not found. Please check the file path.")
    raise

# Parse each comma-separated line: the first field is the transaction ID and
# is discarded; the remaining non-blank fields are the purchased items.
transactions_list = []
for line in lines:
    parts = line.strip().split(",")
    items = [item.strip() for item in parts[1:] if item.strip()]
    if items:  # rows that contain no items are skipped
        transactions_list.append(items)

# Check a few parsed transactions
print("Sample transactions:", transactions_list[:5])



Sample transactions: [['Robotic Arm', 'Food Packets'], ['Sleeping Bag', 'Treadmill', 'Food Packets'], ['Robotic Arm', 'Space Suit', '3D Printer'], ['Food Packets', 'Carbon Dioxide Scrubbers'], ['Sleeping Bag', 'Space Suit']]


In [None]:
def get_support(item, transactions):
    """Return the fraction of transactions that contain every element of ``item``.

    Args:
        item: iterable of items forming the itemset to look for.
        transactions: sized iterable of transactions (each an iterable of items).

    Returns:
        Matching transactions divided by ``len(transactions)``, or ``0`` when
        ``transactions`` is empty.
    """
    matches = 0
    for basket in transactions:
        # Both sides are turned into sets so order and repeats are irrelevant.
        if set(item) <= set(basket):
            matches += 1
    if not transactions:
        return 0
    return matches / len(transactions)

def apriori(transactions, min_support):
    """Mine all frequent itemsets with a level-wise Apriori search.

    Level 1 holds the frequent single items. Each later level k is built by
    joining pairs of frequent (k-1)-itemsets that share their first k-2 items,
    discarding candidates with an infrequent (k-1)-subset, and keeping the
    candidates whose support reaches ``min_support``.

    Args:
        transactions: sized iterable of transactions (each an iterable of
            hashable items). Items must be mutually orderable as soon as two
            or more of them are frequent, because itemsets are kept sorted.
        min_support: minimum fraction of transactions an itemset must occur in.

    Returns:
        A list of ``(itemset, support)`` tuples where ``itemset`` is a sorted
        list. Results are grouped by itemset size and ordered lexicographically
        within each size, so the output is identical from run to run.
    """
    total = len(transactions)
    # Build each transaction's set once instead of once per candidate.
    baskets = [set(transaction) for transaction in transactions]

    def _support(itemset):
        # Fraction of transactions containing every item of `itemset`.
        wanted = set(itemset)
        return sum(1 for basket in baskets if wanted <= basket) / total

    itemsets = []
    single_items = {item for basket in baskets for item in basket}

    # Step 1: frequent 1-itemsets.
    level = []
    for item in single_items:
        support = _support([item])
        if support >= min_support:
            level.append(([item], support))
    if not level:
        return itemsets
    # Set iteration order is arbitrary; sort so the result is reproducible.
    level.sort(key=lambda pair: pair[0])
    itemsets.extend(level)

    # Step 2: grow k-itemsets from the frequent (k-1)-itemsets.
    k = 2
    prev_frequent = [items for items, _ in level]
    while prev_frequent:
        # Tuple set gives O(1) membership tests for the prune step.
        prev_lookup = {tuple(items) for items in prev_frequent}
        candidates = []
        seen = set()
        for i, first in enumerate(prev_frequent):
            for second in prev_frequent[i + 1:]:
                # Join only itemsets that agree on their first k-2 items.
                if first[:k - 2] != second[:k - 2]:
                    continue
                candidate = sorted(set(first + second))
                if len(candidate) != k:
                    continue
                key = tuple(candidate)
                if key in seen:
                    continue
                # Prune: every (k-1)-subset must itself be frequent.
                # `candidate` is sorted, so each combination is sorted too.
                if all(sub in prev_lookup for sub in combinations(candidate, k - 1)):
                    seen.add(key)
                    candidates.append(candidate)

        if not candidates:
            break

        # Keep the candidates that reach the support threshold.
        level = []
        for candidate in candidates:
            support = _support(candidate)
            if support >= min_support:
                level.append((candidate, support))

        if not level:
            break
        level.sort(key=lambda pair: pair[0])
        itemsets.extend(level)
        prev_frequent = [items for items, _ in level]
        k += 1

    return itemsets



In [None]:
def calc_confidence_lift(frequent_itemsets, transactions):
    """Print the association rules derivable from ``frequent_itemsets``.

    Every itemset with at least two items is split into each non-empty proper
    antecedent and its complementary consequent. For each split the rule is
    printed with the itemset support, the confidence
    (itemset support / antecedent support) and the lift
    (confidence / consequent support).

    Args:
        frequent_itemsets: iterable of ``(itemset, support)`` pairs.
        transactions: transactions forwarded to ``get_support`` to look up the
            antecedent and consequent supports.

    Returns:
        None. All output is printed.
    """
    print("\n Association Rules:")
    rules_found = False
    for itemset, support in frequent_itemsets:
        n_items = len(itemset)
        if n_items < 2:
            continue  # a rule needs at least one item on each side
        all_items = set(itemset)
        for lhs_size in range(1, n_items):
            for lhs in combinations(itemset, lhs_size):
                antecedent = list(lhs)
                consequent = sorted(all_items - set(lhs))
                antecedent_support = get_support(antecedent, transactions)
                consequent_support = get_support(consequent, transactions)
                if not (antecedent_support > 0 and consequent_support > 0):
                    continue  # both supports are used as divisors below
                confidence = support / antecedent_support
                lift = confidence / consequent_support
                print(f"{antecedent} => {consequent} | support={support:.2f}, confidence={confidence:.2f}, lift={lift:.2f}")
                rules_found = True
    if not rules_found:
        print("No association rules found. Try lowering min_support or check dataset.")



In [None]:
# Run Apriori algorithm
min_support = 0.1
frequent_itemsets = apriori(transactions_list, min_support)

# Print frequent itemsets. The threshold in the heading is taken from
# min_support so the label cannot drift from the value actually used.
print(f"\n📌 Frequent Itemsets (Support >= {min_support}):")
for itemset, support in frequent_itemsets:
    print(f"{itemset} - support: {support:.2f}")

# Calculate and print association rules
calc_confidence_lift(frequent_itemsets, transactions_list)


📌 Frequent Itemsets (Support >= 0.2):
['Space Suit'] - support: 0.32
['Robotic Arm'] - support: 0.34
['Carbon Dioxide Scrubbers'] - support: 0.24
['Treadmill'] - support: 0.28
['Food Packets'] - support: 0.40
['3D Printer'] - support: 0.28
['Sleeping Bag'] - support: 0.32
['Robotic Arm', 'Space Suit'] - support: 0.10
['3D Printer', 'Sleeping Bag'] - support: 0.10
['Food Packets', 'Sleeping Bag'] - support: 0.10
['Robotic Arm', 'Sleeping Bag'] - support: 0.10
['Food Packets', 'Treadmill'] - support: 0.12
['Food Packets', 'Space Suit'] - support: 0.10
['Food Packets', 'Robotic Arm'] - support: 0.14

 Association Rules:
['Robotic Arm'] => ['Space Suit'] | support=0.10, confidence=0.29, lift=0.92
['Space Suit'] => ['Robotic Arm'] | support=0.10, confidence=0.31, lift=0.92
['3D Printer'] => ['Sleeping Bag'] | support=0.10, confidence=0.36, lift=1.12
['Sleeping Bag'] => ['3D Printer'] | support=0.10, confidence=0.31, lift=1.12
['Food Packets'] => ['Sleeping Bag'] | support=0.10, confidence=

# **QUE 2**

In [None]:
from collections import defaultdict

# Simplified FP-Growth algorithm
def fpgrowth(transactions, min_support):
    """Mine frequent itemsets by enumerating per-item conditional pattern bases.

    No FP-tree is built. For every frequent item, the transactions containing
    it are reduced to the items that are at least as frequent; every
    combination of those items is counted, and the combinations that reach
    ``min_support`` (together with the item itself) are reported. Because all
    combinations are enumerated, the cost grows quickly with the number of
    frequent items per transaction.

    An item is counted at most once per transaction, so repeating an item
    inside a transaction does not inflate any support.

    Args:
        transactions: sized iterable of transactions (each an iterable of
            hashable items). Items with equal counts must be mutually
            orderable because they are used as sort tie-breakers.
        min_support: minimum fraction of transactions an itemset must occur in.

    Returns:
        A list of ``(itemset, support)`` tuples where ``itemset`` is a sorted
        list; multi-item patterns come first, followed by the 1-itemsets.
    """
    total = len(transactions)

    # Step 1: count the number of transactions each item occurs in.
    item_counts = defaultdict(int)
    for transaction in transactions:
        for item in set(transaction):  # set(): one count per transaction
            item_counts[item] += 1

    # Step 2: keep frequent items, most frequent first (ties broken by item).
    frequent_items = [
        (item, count / total)
        for item, count in item_counts.items()
        if count / total >= min_support
    ]
    frequent_items.sort(key=lambda pair: (-pair[1], pair[0]))
    frequent_itemset = {item for item, _ in frequent_items}

    # Step 3: reduce each transaction to its distinct frequent items in one
    # global order. The shared order matters: combinations() below must emit
    # the same tuple for the same set of items in every transaction.
    ordered_transactions = []
    for transaction in transactions:
        ordered = sorted(
            {item for item in transaction if item in frequent_itemset},
            key=lambda x: (-item_counts[x], x),
        )
        if ordered:
            ordered_transactions.append(ordered)

    # Step 4: generate frequent patterns (simulating conditional pattern base).
    patterns = []
    for item, _ in frequent_items:
        # Transactions containing `item`, restricted to items that are at
        # least as frequent as it.
        conditional_transactions = [
            [other for other in t if item_counts[other] >= item_counts[item] and other != item]
            for t in ordered_transactions if item in t
        ]
        longest = max((len(t) for t in conditional_transactions), default=0)
        for k in range(1, longest + 1):
            candidate_counts = defaultdict(int)
            for t in conditional_transactions:
                for combo in combinations(t, k):
                    candidate_counts[combo] += 1

            for combo, count in candidate_counts.items():
                pattern_support = count / total
                if pattern_support >= min_support:
                    # Add the current item to the pattern.
                    patterns.append((sorted(combo + (item,)), pattern_support))

    # Include 1-itemsets
    patterns.extend(([item], support) for item, support in frequent_items)

    # Remove duplicates: items with equal counts appear in each other's
    # conditional base, so the same pattern can be produced more than once.
    unique_patterns = {}
    for pattern, support in patterns:
        key = tuple(sorted(pattern))
        if key not in unique_patterns or unique_patterns[key] < support:
            unique_patterns[key] = support

    return [(list(key), support) for key, support in unique_patterns.items()]

# Association rules calculation (reused)
def calc_confidence_lift(frequent_itemsets, transactions):
    """Print association rules with support, confidence and lift.

    For each ``(itemset, support)`` pair holding two or more items, every
    non-empty proper subset is used as the antecedent and the remaining items
    (sorted) as the consequent. Confidence is the itemset support divided by
    the antecedent support; lift is the confidence divided by the consequent
    support. Both divisor supports are obtained from ``get_support``.

    Args:
        frequent_itemsets: iterable of ``(itemset, support)`` pairs.
        transactions: transactions forwarded to ``get_support``.

    Returns:
        None. Rules, or a fallback message when there are none, are printed.
    """
    print("\n Association Rules:")
    rules_printed = 0
    for itemset, support in frequent_itemsets:
        if len(itemset) < 2:
            continue  # single items cannot be split into a rule
        members = set(itemset)
        for split in range(1, len(itemset)):
            for chosen in combinations(itemset, split):
                antecedent = list(chosen)
                consequent = sorted(members.difference(chosen))
                antecedent_support = get_support(antecedent, transactions)
                consequent_support = get_support(consequent, transactions)
                if antecedent_support > 0 and consequent_support > 0:
                    confidence = support / antecedent_support
                    lift = confidence / consequent_support
                    print(f"{antecedent} => {consequent} | support={support:.2f}, confidence={confidence:.2f}, lift={lift:.2f}")
                    rules_printed += 1
    if rules_printed == 0:
        print("No association rules found. Try lowering min_support or check dataset.")



In [None]:
# Mine the parsed transactions with the simplified FP-Growth implementation,
# using the same min_support as the Apriori run.
print("\n=== FP-Growth Results ===")
frequent_patterns = fpgrowth(transactions_list, min_support)

# List every frequent pattern with its support to two decimals.
print(" Frequent Itemsets from FP-Growth:")
for itemset, support in frequent_patterns:
    print("{} - support: {:.2f}".format(itemset, support))

# Print the association rules derived from those patterns.
calc_confidence_lift(frequent_patterns, transactions_list)




=== FP-Growth Results ===
 Frequent Itemsets from FP-Growth:
['Food Packets', 'Robotic Arm'] - support: 0.14
['Food Packets', 'Sleeping Bag'] - support: 0.10
['Robotic Arm', 'Sleeping Bag'] - support: 0.10
['Robotic Arm', 'Space Suit'] - support: 0.10
['Food Packets', 'Space Suit'] - support: 0.10
['3D Printer', 'Sleeping Bag'] - support: 0.10
['Food Packets', 'Treadmill'] - support: 0.12
['Food Packets'] - support: 0.40
['Robotic Arm'] - support: 0.34
['Sleeping Bag'] - support: 0.32
['Space Suit'] - support: 0.32
['3D Printer'] - support: 0.28
['Treadmill'] - support: 0.28
['Carbon Dioxide Scrubbers'] - support: 0.24

 Association Rules:
['Food Packets'] => ['Robotic Arm'] | support=0.14, confidence=0.35, lift=1.03
['Robotic Arm'] => ['Food Packets'] | support=0.14, confidence=0.41, lift=1.03
['Food Packets'] => ['Sleeping Bag'] | support=0.10, confidence=0.25, lift=0.78
['Sleeping Bag'] => ['Food Packets'] | support=0.10, confidence=0.31, lift=0.78
['Robotic Arm'] => ['Sleeping Bag

In [None]:
# Report how many frequent itemsets each hand-written implementation produced.
print("\n Apriori found {} itemsets".format(len(frequent_itemsets)))
print(" FP-Growth found {} itemsets".format(len(frequent_patterns)))
print(" In real-world, FP-Growth is faster because it avoids re-scanning data by building a tree. Apriori does many passes.")


 Apriori found 14 itemsets
 FP-Growth found 14 itemsets
 In real-world, FP-Growth is faster because it avoids re-scanning data by building a tree. Apriori does many passes.


# Testing using the prebuilt algorithms (mlxtend)

In [None]:
#Testing using the pre build algorithm

!pip install mlxtend



In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
import warnings
warnings.filterwarnings('ignore')

# # Simulated dataset (replace with file loading in Colab)
# sample_data = [
#     ["cricket bat", "cricket ball", "gloves"],
#     ["football", "water bottle", "juice"],
#     ["cricket bat", "cricket ball", "ice cream"],
#     ["football", "juice"],
#     ["cricket bat", "gloves", "water bottle"],
#     ["football", "ice cream", "juice"],
#     ["cricket ball", "gloves"],
#     ["cricket bat", "football", "juice"]
# ]
# transactions_list = sample_data

# Uncomment to load from file in Colab

# Read the transaction file again, dropping the header row.
try:
    with open("/content/drive/MyDrive/DataWarehouseDataMining/space.txt", "r") as f:
        lines = list(f)[1:]  # everything after the header line
except FileNotFoundError:
    print("Error: 'space.txt' not found. Please check the file path.")
    raise

# Turn each row into a list of item names: the leading transaction ID is
# dropped and blank fields are ignored.
transactions_list = []
for line in lines:
    parts = line.strip().split(",")
    items = [field for field in map(str.strip, parts[1:]) if field]
    if not items:
        continue  # rows without any item are not kept
    transactions_list.append(items)

# Check a few parsed transactions
print("Sample transactions:", transactions_list[:5])

# Prepare one-hot encoded DataFrame for mlxtend
def prepare_data(transactions):
    """One-hot encode transactions into a boolean DataFrame.

    Args:
        transactions: iterable of transactions (each an iterable of hashable,
            mutually orderable items).

    Returns:
        A ``pandas.DataFrame`` with one row per transaction and one column per
        distinct item (columns sorted). A cell is ``True`` when the item
        occurs in that transaction. Boolean cells are used instead of 0/1
        integers because mlxtend deprecates non-bool input.
    """
    # Build each transaction's set once so membership tests are O(1).
    baskets = [set(transaction) for transaction in transactions]
    unique_items = sorted({item for basket in baskets for item in basket})
    rows = [[item in basket for item in unique_items] for basket in baskets]
    return pd.DataFrame(rows, columns=unique_items, dtype=bool)

# One-hot encode the parsed transactions for mlxtend.
df = prepare_data(transactions_list)

# Run mlxtend Apriori with a low threshold, then derive rules whose
# confidence is at least 0.5.
min_support = 0.01
print("\n=== mlxtend Apriori Results ===")
frequent_itemsets_apriori = apriori(df, use_colnames=True, min_support=min_support)
print("Frequent Itemsets:")
print(frequent_itemsets_apriori)
rules_apriori = association_rules(frequent_itemsets_apriori, min_threshold=0.5, metric="confidence")
print("\n Association Rules:")
if rules_apriori.empty:
    print("No association rules found.")
else:
    print(rules_apriori[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Run mlxtend FP-Growth on the same encoded data with the same thresholds.
print("\n=== mlxtend FP-Growth Results ===")
frequent_itemsets_fpgrowth = fpgrowth(df, use_colnames=True, min_support=min_support)
print("Frequent Itemsets:")
print(frequent_itemsets_fpgrowth)
rules_fpgrowth = association_rules(frequent_itemsets_fpgrowth, min_threshold=0.5, metric="confidence")
print("\nAssociation Rules:")
if rules_fpgrowth.empty:
    print("No association rules found.")
else:
    print(rules_fpgrowth[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Compare how many frequent itemsets each mlxtend algorithm returned.
print("\n mlxtend Apriori found {} itemsets".format(len(frequent_itemsets_apriori)))
print(" mlxtend FP-Growth found {} itemsets".format(len(frequent_itemsets_fpgrowth)))

Sample transactions: [['Robotic Arm', 'Food Packets'], ['Sleeping Bag', 'Treadmill', 'Food Packets'], ['Robotic Arm', 'Space Suit', '3D Printer'], ['Food Packets', 'Carbon Dioxide Scrubbers'], ['Sleeping Bag', 'Space Suit']]

=== mlxtend Apriori Results ===
Frequent Itemsets:
    support                                           itemsets
0      0.28                                       (3D Printer)
1      0.24                         (Carbon Dioxide Scrubbers)
2      0.40                                     (Food Packets)
3      0.34                                      (Robotic Arm)
4      0.32                                     (Sleeping Bag)
5      0.32                                       (Space Suit)
6      0.28                                        (Treadmill)
7      0.08             (3D Printer, Carbon Dioxide Scrubbers)
8      0.04                         (3D Printer, Food Packets)
9      0.08                          (3D Printer, Robotic Arm)
10     0.10                   