In [None]:
# Mount Google Drive (Colab only) so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import math
import time
from itertools import combinations

import pandas as pd

In [None]:
# Transaction datasets to benchmark; uncomment a path to include it in the run.
datasets = [
        # "/content/sample_dataset.csv",
        # "/content/drive/MyDrive/Big Data/datasets/retail.csv",
        # "/content/drive/MyDrive/Big Data/datasets/chess.csv",
        "/content/drive/MyDrive/Big Data/datasets/mushroom.csv",
        # "/content/drive/MyDrive/Big Data/datasets/T10I4D100K.csv",
        # "/content/drive/MyDrive/Big Data/datasets/pumsb.csv",
]
# Minimum support thresholds, expressed as fractions of the transaction count;
# every dataset is benchmarked once per ratio.
min_support_ratios = [0.5, 0.4, 0.3, 0.2]
# Minimum confidence an association rule must reach to be kept.
min_confidence = 0.3

## Apriori Linear

### Utility Functions

#### Load transaction data from a CSV file.

In [None]:
def load_data(filename):
    """Read a header-less CSV and return one set of items per row.

    Every CSV row is treated as one transaction. Cells that pandas reports
    as missing (NaN) and cells equal to the empty string are dropped, so the
    padding of short rows does not show up as an item.

    Args:
        filename: Path of the CSV file to read (no header row).

    Returns:
        list[set]: The distinct items of each transaction, in file order.
    """
    frame = pd.read_csv(filename, header=None)

    def present_items(row):
        # Collect the cells of one row that actually hold an item.
        kept = []
        for cell in row:
            if pd.isna(cell) or cell == "":
                continue
            kept.append(cell)
        return kept

    item_lists = frame.apply(present_items, axis=1)
    return [set(items) for items in item_lists]

#### Generate the 1-itemsets from the transactions.

In [None]:
def generate_1_itemsets(transactions):
    """Count how many times each single item occurs across the transactions.

    Args:
        transactions: Iterable of item collections (one per transaction).

    Returns:
        dict: Maps ``frozenset([item])`` to its occurrence count, keyed in
        order of first appearance.
    """
    counts = {}
    for basket in transactions:
        for element in basket:
            singleton = frozenset((element,))
            if singleton in counts:
                counts[singleton] += 1
            else:
                counts[singleton] = 1
    return counts

#### Generate all frequent itemsets.

##### Utility function: generate candidate k-itemsets from the previous level's itemsets.

In [None]:
def generate_candidates(prev_itemsets, k):
    """Build candidate k-itemsets from the frequent (k-1)-itemsets.

    Join step: every pair of previous itemsets whose union has exactly ``k``
    items proposes that union as a candidate.
    Prune step: a candidate is kept only if each of its (k-1)-item subsets is
    itself in ``prev_itemsets``. An itemset with an infrequent subset cannot
    be frequent, so dropping it here avoids counting its support later and
    does not change the final set of frequent itemsets.

    Args:
        prev_itemsets: Iterable of frozensets, the frequent itemsets of the
            previous level (size ``k - 1``).
        k: Size of the candidates to generate (expected to be >= 2).

    Returns:
        set[frozenset]: The surviving candidate k-itemsets.
    """
    previous = list(prev_itemsets)
    frequent_previous = set(previous)
    candidates = set()
    for first, second in combinations(previous, 2):
        union_set = first | second
        # The set already deduplicates; the membership test only skips the
        # repeated prune work for a union produced by an earlier pair.
        if len(union_set) != k or union_set in candidates:
            continue
        if all(frozenset(subset) in frequent_previous
               for subset in combinations(union_set, k - 1)):
            candidates.add(union_set)
    return candidates


##### Utility function: filter candidates by support count.

In [None]:
def filter_candidates(transactions, candidates, min_support):
    """Count each candidate's support and keep those meeting the threshold.

    Args:
        transactions: Iterable of transactions (sets of items).
        candidates: Collection of frozenset candidates to count.
        min_support: Minimum absolute number of transactions that must
            contain a candidate for it to be kept.

    Returns:
        dict: Maps every candidate whose count is >= ``min_support`` to that
        count.
    """
    candidate_counts = dict.fromkeys(candidates, 0)

    for transaction in transactions:
        for candidate in candidates:
            # Built-in subset test instead of a Python-level membership loop.
            if candidate.issubset(transaction):
                candidate_counts[candidate] += 1

    return {
        itemset: count
        for itemset, count in candidate_counts.items()
        if count >= min_support
    }



##### Generating all frequent itemsets...

In [None]:
def apriori_gen(itemsets, transactions, min_support):
    """Grow frequent itemsets level by level until a level comes back empty.

    Args:
        itemsets: dict of frequent 1-itemsets (frozenset -> support count).
        transactions: The transactions used to count candidate support.
        min_support: Minimum absolute support count for an itemset to be kept.

    Returns:
        dict: Every frequent itemset found (all sizes, the given 1-itemsets
        included) mapped to its support count.
    """
    # Seed the result with the 1-itemsets supplied by the caller.
    frequent_itemsets = dict(itemsets)

    current_level = itemsets
    k = 2
    while current_level:
        print(f"Generating {k}-itemset...")
        proposals = generate_candidates(current_level.keys(), k)
        current_level = filter_candidates(transactions, proposals, min_support)
        frequent_itemsets.update(current_level)
        print(f"Generated {k}-itemset.")
        k += 1

    return frequent_itemsets

#### Generate association rules from frequent itemsets.

In [None]:
def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules (antecedent -> consequent) from frequent itemsets.

    For every frequent itemset with more than one item, each subset returned
    by ``find_subsets`` is tried as the antecedent and the remaining items
    form the consequent. Confidence is the itemset's support count divided by
    the antecedent's support count looked up in ``frequent_itemsets``.

    Args:
        frequent_itemsets: dict mapping frozenset itemsets to support counts.
        transactions: Not used by this function; kept in the signature for
            existing callers.
        min_confidence: Minimum confidence a rule needs to be reported.

    Returns:
        list[dict]: One dict per rule with the keys ``'rule'`` (a tuple of
        antecedent set and consequent set), ``'support'`` and ``'confidence'``.
    """
    rules = []
    for itemset, support_count in frequent_itemsets.items():
        if len(itemset) <= 1:
            continue
        for antecedent in find_subsets(itemset):
            consequent = itemset - antecedent
            if not consequent:
                continue
            antecedent_support = frequent_itemsets.get(antecedent, 0)
            # An antecedent with no recorded support cannot yield a confidence.
            if not antecedent_support > 0:
                continue
            confidence = support_count / antecedent_support
            if not confidence >= min_confidence:
                continue
            rules.append({
                'rule': (set(antecedent), set(consequent)),
                'support': support_count,
                'confidence': confidence
            })
    return rules

In [None]:
def find_subsets(itemset):
    """Return every non-empty proper subset of ``itemset`` as a frozenset.

    Subsets are listed by increasing size (1 up to ``len(itemset) - 1``); the
    empty set and the full itemset are not included.
    """
    return [
        frozenset(members)
        for size in range(1, len(itemset))
        for members in combinations(itemset, size)
    ]

### Main Function

In [None]:
def _min_support_count(min_support_ratio, num_transactions):
    """Convert a support ratio into the smallest qualifying absolute count."""
    # Round up so an itemset is kept only when count / n >= ratio. The small
    # tolerance stops float noise (0.3 * 10 == 3.0000000000000004) from
    # pushing an exact product up to the next integer.
    count = math.ceil(min_support_ratio * num_transactions - 1e-9)
    # Never go below 1: with a threshold of 0 every candidate passes the
    # support filter and the level-wise search would enumerate every
    # combination of items.
    return max(1, count)


def apriori(filename, min_support_ratio, min_confidence):
    """Run Apriori on one CSV file and report progress on stdout.

    Loads the transactions, derives the absolute support threshold from
    ``min_support_ratio``, mines all frequent itemsets and then generates the
    association rules that reach ``min_confidence``.

    Args:
        filename: Path of the header-less transaction CSV.
        min_support_ratio: Minimum support as a fraction of the number of
            transactions.
        min_confidence: Minimum confidence for a rule to be kept.

    Returns:
        tuple: ``(frequent_itemsets, rules)`` where ``frequent_itemsets`` maps
        frozenset itemsets to support counts and ``rules`` is the list
        produced by ``generate_rules``.
    """
    print("Loading data...")
    transactions = load_data(filename)
    num_transactions = len(transactions)
    print(f"Loaded {num_transactions} transactions.")

    # Absolute number of transactions an itemset must appear in.
    min_support = _min_support_count(min_support_ratio, num_transactions)
    print(f"Minimum Support Count: {min_support}")

    print("Generating 1-itemsets...")
    one_itemsets = generate_1_itemsets(transactions)
    one_itemsets = {itemset: count for itemset, count in one_itemsets.items() if count >= min_support}
    print(f"Total {len(one_itemsets)} 1-itemsets generated")

    print(" 1-itemsets:")
    for itemset, count in one_itemsets.items():
        print(f"{set(itemset)}: {count}")

    print("Generating all frequent itemsets...")
    frequent_itemsets = apriori_gen(one_itemsets, transactions, min_support)
    print(f"Total {len(frequent_itemsets)} frequent itemsets generated")

    print("\nGenerating association rules...")
    rules = generate_rules(frequent_itemsets, transactions, min_confidence)
    print(f"Total {len(rules)} association rules generated")

    # Hand the results back so callers can inspect or export them instead of
    # only seeing the printed totals.
    return frequent_itemsets, rules

In [None]:
def benchmark(datasets, min_support_ratios, min_confidence):
    """Time the Apriori run for every dataset / support-ratio combination.

    Args:
        datasets: Iterable of CSV paths to mine.
        min_support_ratios: Iterable of minimum-support ratios to try on each
            dataset.
        min_confidence: Minimum confidence passed through to ``apriori``.

    Returns:
        list[dict]: One record per run with the keys ``'Dataset'``,
        ``'Min Support'``, ``'Min Confidence'`` and ``'Execution Time'``
        (seconds).
    """
    results = []

    for dataset in datasets:
        for min_support_ratio in min_support_ratios:
            print(f"\nDataset: {dataset} | Min Support: {min_support_ratio} | Min Confidence: {min_confidence}")
            # perf_counter is monotonic and high-resolution, so the measured
            # interval cannot be skewed by system clock adjustments.
            start_time = time.perf_counter()

            # The timed region covers the whole run, including loading the CSV.
            apriori(dataset, min_support_ratio, min_confidence)

            elapsed_time = time.perf_counter() - start_time
            results.append({
                'Dataset': dataset,
                'Min Support': min_support_ratio,
                'Min Confidence': min_confidence,
                'Execution Time': elapsed_time,
            })

            print(f"Execution Time: {elapsed_time:.2f} seconds")

    # Return results for further analysis or export
    return results

In [None]:
# Run benchmark
# One timing record is collected per (dataset, min support ratio) pair.
results = benchmark(datasets, min_support_ratios, min_confidence)

# Export results to a CSV for analysis
# The relative path means the file is written to the current working directory.
results_df = pd.DataFrame(results)
results_df.to_csv("apriori_benchmark_results.csv", index=False)
print("\nBenchmark results saved to apriori_benchmark_results.csv")