# DSA 2040 Practical Exam - Section 2, Task 4
## Association Rules - Market Basket Analysis

**Student:** Monaheng218  
**Date:** August 13, 2025  
**Total Marks:** 10

### Task Requirements:
1. Perform market basket analysis using association rules
2. Find frequent itemsets and strong association rules
3. Analyze support, confidence, and lift metrics
4. Identify product associations and cross-selling opportunities
5. Provide business recommendations for product placement and marketing

In [None]:
# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import networkx as nx
from collections import Counter
import sqlite3
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so every later random draw is reproducible
np.random.seed(42)

# Plot appearance: apply the matplotlib style first, then the seaborn palette
# (the palette has to come second so the style does not override it)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Status banner confirming the environment is ready
for banner in ("✅ Association rules libraries imported successfully",
               "🛒 Ready for market basket analysis"):
    print(banner)

In [None]:
# =============================================================================
# LOAD AND PREPARE TRANSACTION DATA
# =============================================================================
#
# Builds `transactions`: a list of baskets, each basket being a list of product
# names.  Baskets are read from the SQLite data warehouse when the file exists
# and can be queried; otherwise a reproducible synthetic data set is generated.

import os  # local to this cell: only needed to test for the warehouse file

print("\n" + "="*60)
print("STEP 1: LOADING AND PREPARING TRANSACTION DATA")
print("="*60)

warehouse_path = '../Section1_DataWarehousing/retail_dw.db'

# One row per sold product, labelled "<Category>_<ProductName>"
query = """
    SELECT
        sf.SaleID,
        sf.CustomerID,
        pd.Category || '_' || pd.ProductName as Product
    FROM SalesFact sf
    JOIN ProductDim pd ON sf.ProductID = pd.ProductID
    ORDER BY sf.SaleID, sf.CustomerID
    """

conn = None
transaction_data = None

if os.path.isfile(warehouse_path):
    # sqlite3.connect() creates an empty database when the file is missing, so
    # existence is checked explicitly instead of waiting for an exception.
    try:
        conn = sqlite3.connect(warehouse_path)
        print("✅ Connected to data warehouse")
        transaction_data = pd.read_sql_query(query, conn)
    except (sqlite3.Error, pd.errors.DatabaseError) as exc:
        # Unreadable file, missing tables, etc.: fall back to sample data
        print(f"⚠️ Data warehouse could not be read ({exc}), creating sample transaction data...")
        transaction_data = None
    finally:
        # Always release the connection, even when the query failed
        if conn is not None:
            conn.close()
else:
    print("⚠️ Data warehouse not found, creating sample transaction data...")

if transaction_data is not None and transaction_data.empty:
    # An empty result would break the basket statistics below
    print("⚠️ Data warehouse returned no sales, creating sample transaction data...")
    transaction_data = None

if transaction_data is not None:
    print(f"📊 Loaded {len(transaction_data)} transaction records")
    print(f"   Unique transactions: {transaction_data['SaleID'].nunique()}")
    print(f"   Unique customers: {transaction_data['CustomerID'].nunique()}")
    print(f"   Unique products: {transaction_data['Product'].nunique()}")

    # Group by transaction to create baskets
    transactions = transaction_data.groupby('SaleID')['Product'].apply(list).tolist()

    print(f"\n🛒 Transaction Statistics:")
    basket_sizes = [len(basket) for basket in transactions]
    print(f"   Average basket size: {np.mean(basket_sizes):.2f}")
    print(f"   Median basket size: {np.median(basket_sizes):.0f}")
    print(f"   Max basket size: {max(basket_sizes)}")
    print(f"   Min basket size: {min(basket_sizes)}")
else:
    # Create sample transaction data for demonstration
    np.random.seed(42)
    products = [
        'Electronics_Laptop', 'Electronics_Phone', 'Electronics_Tablet',
        'Clothing_Shirt', 'Clothing_Pants', 'Clothing_Shoes',
        'Home_Furniture', 'Home_Decor', 'Home_Kitchen',
        'Books_Fiction', 'Books_NonFiction', 'Books_Technical',
        'Sports_Equipment', 'Sports_Clothing', 'Sports_Accessories'
    ]

    # Pre-split the catalogue for the two "preferred category" shopper types
    category_split = {
        prefix: (
            [p for p in products if p.startswith(prefix)],
            [p for p in products if not p.startswith(prefix)],
        )
        for prefix in ('Electronics', 'Clothing')
    }

    transactions = []
    for _ in range(2000):
        basket_size = np.random.choice([1, 2, 3, 4, 5], p=[0.3, 0.3, 0.2, 0.15, 0.05])

        # The order of the random draws is kept exactly as before so the
        # generated baskets stay the same for seed 42.
        if np.random.random() < 0.3:
            preferred = 'Electronics'
        elif np.random.random() < 0.3:
            preferred = 'Clothing'
        else:
            preferred = None

        if preferred is None:
            # Random basket across the whole catalogue
            basket = np.random.choice(products, basket_size, replace=False).tolist()
        else:
            category_products, other_products = category_split[preferred]
            # Fill from the preferred category first ...
            basket = np.random.choice(
                category_products, min(basket_size, len(category_products)), replace=False
            ).tolist()
            # ... then top up from the other categories if the basket is larger
            if len(basket) < basket_size:
                additional = np.random.choice(
                    other_products, basket_size - len(basket), replace=False
                )
                # .tolist() keeps every basket item a plain Python str
                basket.extend(additional.tolist())

        transactions.append(basket)

    print(f"✅ Created {len(transactions)} sample transactions")
    conn = None

# Show the first five baskets as a quick sanity check
print(f"\n📋 Sample Transactions:")
for position, basket in enumerate(transactions[:5], start=1):
    print(f"   Transaction {position}: {basket}")

# Flatten all baskets into one item list and count product occurrences
all_products = []
for basket in transactions:
    all_products.extend(basket)
product_frequency = Counter(all_products)
top_products = product_frequency.most_common(10)
total_items = len(all_products)

print(f"\n📊 Top 10 Most Frequent Products:")
for product, count in top_products:
    # Share of this product among all purchased items (not among baskets)
    print(f"   {product}: {count} ({count / total_items * 100:.1f}%)")

# Two-panel figure: basket-size histogram and top-10 product bar chart
basket_sizes = [len(basket) for basket in transactions]
top_products_df = pd.DataFrame(top_products, columns=['Product', 'Frequency'])
bar_positions = range(len(top_products_df))

fig, (size_ax, product_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: one bin per basket size
size_ax.hist(basket_sizes, bins=range(1, max(basket_sizes)+2), alpha=0.7, edgecolor='black')
size_ax.set_title('Distribution of Basket Sizes', fontsize=14, fontweight='bold')
size_ax.set_xlabel('Number of Items in Basket')
size_ax.set_ylabel('Number of Transactions')
size_ax.grid(True, alpha=0.3)

# Right panel: horizontal bars, most frequent product at the top
product_ax.barh(bar_positions, top_products_df['Frequency'], alpha=0.8)
product_ax.set_yticks(bar_positions)
product_ax.set_yticklabels(top_products_df['Product'])
product_ax.set_title('Top 10 Most Frequent Products', fontsize=14, fontweight='bold')
product_ax.set_xlabel('Frequency')
product_ax.invert_yaxis()
product_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print("\n✅ Transaction data loaded and analyzed successfully")

In [None]:
# =============================================================================
# CREATE BINARY TRANSACTION MATRIX
# =============================================================================

print("\n" + "="*60)
print("STEP 2: CREATING BINARY TRANSACTION MATRIX")
print("="*60)

# One-hot encode the baskets: one row per transaction, one column per product
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)

n_transactions, n_products = transaction_df.shape
print(f"📊 Binary transaction matrix created:")
print(f"   Shape: {transaction_df.shape}")
print(f"   Transactions: {n_transactions}")
print(f"   Unique products: {n_products}")

# Support of a single product = fraction of transactions that contain it
product_support = transaction_df.mean().sort_values(ascending=False)
print(f"\n📈 Product Support (Top 10):")
for product, support in product_support.head(10).items():
    print(f"   {product}: {support:.3f} ({support*100:.1f}%)")

# Preview of the encoded matrix
print(f"\n📋 Sample of Binary Transaction Matrix:")
print(transaction_df.head().to_string())

# Sparsity = share of cells that are False (product not in the basket)
total_elements = n_transactions * n_products
non_zero_elements = transaction_df.sum().sum()
sparsity = 1 - (non_zero_elements / total_elements)
avg_items_per_transaction = non_zero_elements / n_transactions

print(f"\n📊 Matrix Statistics:")
print(f"   Total elements: {total_elements:,}")
print(f"   Non-zero elements: {non_zero_elements:,}")
print(f"   Sparsity: {sparsity:.3f} ({sparsity*100:.1f}%)")
print(f"   Average items per transaction: {avg_items_per_transaction:.2f}")

# Two-panel figure describing how support is spread across products
support_values = product_support.values
ranked_support = sorted(support_values, reverse=True)
ranks = range(1, len(product_support)+1)

fig, (hist_ax, rank_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of per-product support
hist_ax.hist(support_values, bins=20, alpha=0.7, edgecolor='black')
hist_ax.set_title('Distribution of Product Support', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Support (Frequency)')
hist_ax.set_ylabel('Number of Products')
hist_ax.grid(True, alpha=0.3)

# Right panel: support plotted against product rank (highest support first)
rank_ax.plot(ranks, ranked_support, marker='o', linewidth=2, markersize=4)
rank_ax.set_title('Product Support Ranking (Pareto Analysis)', fontsize=14, fontweight='bold')
rank_ax.set_xlabel('Product Rank')
rank_ax.set_ylabel('Support')
rank_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print("\n✅ Binary transaction matrix created successfully")

In [None]:
# =============================================================================
# GENERATE FREQUENT ITEMSETS
# =============================================================================

print("\n" + "="*60)
print("STEP 3: GENERATING FREQUENT ITEMSETS")
print("="*60)

# Start at 1% support; the fallbacks below are tried in order only while the
# previous run produced no itemsets at all.
min_support = 0.01
support_fallbacks = (
    (0.005, "⚠️ No frequent itemsets found with current threshold. Reducing threshold..."),
    (0.001, "⚠️ Still no frequent itemsets. Using very low threshold..."),
)
print(f"🎯 Minimum support threshold: {min_support} ({min_support*100}%)")

# First Apriori pass at the initial threshold
print("\n🔍 Running Apriori algorithm...")
frequent_itemsets = apriori(transaction_df, min_support=min_support, use_colnames=True, verbose=1)

for lower_support, notice in support_fallbacks:
    if len(frequent_itemsets) != 0:
        break  # something was found, keep the current threshold
    print(notice)
    min_support = lower_support
    frequent_itemsets = apriori(transaction_df, min_support=min_support, use_colnames=True, verbose=1)

print(f"\n📊 Frequent Itemsets Analysis:")
print(f"   Total frequent itemsets found: {len(frequent_itemsets)}")
print(f"   Final minimum support used: {min_support} ({min_support*100}%)")

# Summarise, plot and export the frequent itemsets found above.
def _itemset_label(items, limit=None):
    """Return the items of an itemset as a comma-separated string.

    Items are sorted so the text does not depend on frozenset iteration
    order, which can change between interpreter sessions because string
    hashes are randomized.  When `limit` is given, only the first `limit`
    items are listed and '...' is appended if any were left out.
    """
    ordered = sorted(items)
    if limit is None or len(ordered) <= limit:
        return ', '.join(ordered)
    return ', '.join(ordered[:limit]) + '...'


if len(frequent_itemsets) > 0:
    # Number of items in each itemset
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

    # How many itemsets exist for each size
    itemset_length_counts = frequent_itemsets['length'].value_counts().sort_index()
    print(f"\n📊 Itemsets by Length:")
    for length, count in itemset_length_counts.items():
        print(f"   {length}-itemsets: {count}")

    # Display top frequent itemsets
    print(f"\n🏆 Top 20 Frequent Itemsets:")
    top_itemsets = frequent_itemsets.nlargest(20, 'support')
    for _, row in top_itemsets.iterrows():
        itemset_str = _itemset_label(row['itemsets'])
        print(f"   {itemset_str}: {row['support']:.3f} ({row['support']*100:.1f}%)")

    # Visualize frequent itemsets (2x2 grid)
    plt.figure(figsize=(15, 10))

    # Panel 1: support distribution
    plt.subplot(2, 2, 1)
    plt.hist(frequent_itemsets['support'], bins=20, alpha=0.7, edgecolor='black')
    plt.title('Distribution of Itemset Support', fontsize=14, fontweight='bold')
    plt.xlabel('Support')
    plt.ylabel('Number of Itemsets')
    plt.grid(True, alpha=0.3)

    # Panel 2: itemset count per length
    plt.subplot(2, 2, 2)
    itemset_length_counts.plot(kind='bar', alpha=0.8)
    plt.title('Number of Itemsets by Length', fontsize=14, fontweight='bold')
    plt.xlabel('Itemset Length')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.grid(True, alpha=0.3)

    # Panel 3: support of every itemset, grouped by its length
    plt.subplot(2, 2, 3)
    for length in sorted(frequent_itemsets['length'].unique()):
        length_data = frequent_itemsets[frequent_itemsets['length'] == length]
        plt.scatter([length] * len(length_data), length_data['support'],
                    alpha=0.6, s=50, label=f'{length}-itemsets')
    plt.title('Support vs Itemset Length', fontsize=14, fontweight='bold')
    plt.xlabel('Itemset Length')
    plt.ylabel('Support')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 4: ten itemsets with the highest support (labels show at most 2 items)
    plt.subplot(2, 2, 4)
    top_10 = frequent_itemsets.nlargest(10, 'support')
    itemset_labels = [_itemset_label(itemsets, limit=2) for itemsets in top_10['itemsets']]
    plt.barh(range(len(top_10)), top_10['support'], alpha=0.8)
    plt.yticks(range(len(top_10)), itemset_labels)
    plt.title('Top 10 Frequent Itemsets', fontsize=14, fontweight='bold')
    plt.xlabel('Support')
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Save frequent itemsets; frozensets are flattened to sorted strings so
    # the CSV content is identical from run to run
    frequent_itemsets_export = frequent_itemsets.copy()
    frequent_itemsets_export['itemsets_str'] = frequent_itemsets_export['itemsets'].apply(
        _itemset_label
    )
    frequent_itemsets_export[['itemsets_str', 'support', 'length']].to_csv(
        'frequent_itemsets.csv', index=False
    )
    print(f"\n💾 Frequent itemsets saved to 'frequent_itemsets.csv'")

else:
    print("❌ No frequent itemsets found. Consider reducing the minimum support threshold.")

print("\n✅ Frequent itemsets generation completed")

In [None]:
# =============================================================================
# GENERATE ASSOCIATION RULES
# =============================================================================
#
# Derives association rules from `frequent_itemsets`, relaxing the confidence
# threshold (0.5 -> 0.3 -> 0.1) until at least one rule is produced, then
# prints, plots and exports them.  `rules` is always defined afterwards (an
# empty DataFrame when no rules could be generated).

print("\n" + "="*60)
print("STEP 4: GENERATING ASSOCIATION RULES")
print("="*60)


def _format_items(items):
    """Return the items of a rule side as a comma-separated, sorted string.

    Sorting removes the dependence on frozenset iteration order, which can
    change between interpreter sessions because string hashes are randomized.
    """
    return ', '.join(sorted(items))


if len(frequent_itemsets) > 0:
    # Start at 50% confidence; fallbacks are tried only while no rule exists
    min_confidence = 0.5
    confidence_fallbacks = (
        (0.3, "⚠️ No rules found with current confidence. Reducing threshold..."),
        (0.1, "⚠️ Still no rules. Using very low threshold..."),
    )
    print(f"🎯 Minimum confidence threshold: {min_confidence} ({min_confidence*100}%)")

    print("\n🔍 Generating association rules...")
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    for lower_confidence, notice in confidence_fallbacks:
        if len(rules) > 0:
            break
        print(notice)
        min_confidence = lower_confidence
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    print(f"\n📊 Association Rules Analysis:")
    print(f"   Total rules generated: {len(rules)}")
    print(f"   Final minimum confidence used: {min_confidence} ({min_confidence*100}%)")

    if len(rules) > 0:
        # Additional metrics, computed explicitly so the formulas are visible:
        #   leverage   = support(A and C) - support(A) * support(C)
        #   conviction = (1 - support(C)) / (1 - confidence)
        # Conviction is infinite for rules with confidence 1.
        rules['leverage'] = rules['support'] - (rules['antecedent support'] * rules['consequent support'])
        rules['conviction'] = (1 - rules['consequent support']) / (1 - rules['confidence'])

        # Rank by confidence, breaking ties by lift and then support
        rules_sorted = rules.sort_values(['confidence', 'lift', 'support'], ascending=False)

        # Display rule statistics
        has_positive_lift = rules['lift'] > 1
        print(f"\n📊 Rule Statistics:")
        print(f"   Average confidence: {rules['confidence'].mean():.3f}")
        print(f"   Average lift: {rules['lift'].mean():.3f}")
        print(f"   Average support: {rules['support'].mean():.3f}")
        print(f"   Rules with lift > 1: {has_positive_lift.sum()} ({has_positive_lift.mean()*100:.1f}%)")

        # Display top rules as a fixed-width table
        print(f"\n🏆 Top 20 Association Rules (by Confidence):")
        print("=" * 120)
        print(f"{'Antecedent':<30} {'Consequent':<30} {'Supp':<6} {'Conf':<6} {'Lift':<6} {'Conv':<6}")
        print("=" * 120)

        for _, row in rules_sorted.head(20).iterrows():
            antecedent = _format_items(row['antecedents'])
            consequent = _format_items(row['consequents'])
            # Truncate long item lists so they fit the 30-character columns
            antecedent = antecedent[:28] + '..' if len(antecedent) > 30 else antecedent
            consequent = consequent[:28] + '..' if len(consequent) > 30 else consequent

            print(f"{antecedent:<30} {consequent:<30} {row['support']:<6.3f} {row['confidence']:<6.3f} "
                  f"{row['lift']:<6.2f} {row['conviction']:<6.2f}")

        # Analyze rules by lift
        high_lift_rules = rules[rules['lift'] > 1.5]
        print(f"\n🚀 High-Lift Rules (Lift > 1.5): {len(high_lift_rules)} rules")

        if len(high_lift_rules) > 0:
            print("\n🎯 Top High-Lift Rules:")
            for _, row in high_lift_rules.nlargest(10, 'lift').iterrows():
                antecedent = _format_items(row['antecedents'])
                consequent = _format_items(row['consequents'])
                print(f"   {antecedent} → {consequent}")
                print(f"      Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.2f}")

        # Visualize association rules (2x3 grid)
        plt.figure(figsize=(16, 12))

        # Panel 1: support vs confidence, coloured by lift
        plt.subplot(2, 3, 1)
        scatter = plt.scatter(rules['support'], rules['confidence'],
                              c=rules['lift'], cmap='viridis', alpha=0.6, s=50)
        plt.colorbar(scatter, label='Lift')
        plt.xlabel('Support')
        plt.ylabel('Confidence')
        plt.title('Support vs Confidence (colored by Lift)', fontsize=12, fontweight='bold')
        plt.grid(True, alpha=0.3)

        # Panel 2: lift distribution with the independence line at lift = 1
        plt.subplot(2, 3, 2)
        plt.hist(rules['lift'], bins=20, alpha=0.7, edgecolor='black')
        plt.axvline(x=1, color='red', linestyle='--', label='Lift = 1')
        plt.xlabel('Lift')
        plt.ylabel('Number of Rules')
        plt.title('Distribution of Lift Values', fontsize=12, fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 3: confidence distribution
        plt.subplot(2, 3, 3)
        plt.hist(rules['confidence'], bins=20, alpha=0.7, edgecolor='black')
        plt.xlabel('Confidence')
        plt.ylabel('Number of Rules')
        plt.title('Distribution of Confidence Values', fontsize=12, fontweight='bold')
        plt.grid(True, alpha=0.3)

        # Panel 4: support vs lift
        plt.subplot(2, 3, 4)
        plt.scatter(rules['support'], rules['lift'], alpha=0.6, s=50)
        plt.axhline(y=1, color='red', linestyle='--', label='Lift = 1')
        plt.xlabel('Support')
        plt.ylabel('Lift')
        plt.title('Support vs Lift', fontsize=12, fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 5: confidence vs lift
        plt.subplot(2, 3, 5)
        plt.scatter(rules['confidence'], rules['lift'], alpha=0.6, s=50)
        plt.axhline(y=1, color='red', linestyle='--', label='Lift = 1')
        plt.xlabel('Confidence')
        plt.ylabel('Lift')
        plt.title('Confidence vs Lift', fontsize=12, fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 6: ten rules with the highest lift; labels show only the first
        # (alphabetically) item of each side and are cut to 20 characters
        plt.subplot(2, 3, 6)
        top_by_lift = rules.nlargest(10, 'lift')
        rule_labels = [
            f"{', '.join(sorted(row['antecedents'])[:1])} → {', '.join(sorted(row['consequents'])[:1])}"
            for _, row in top_by_lift.iterrows()
        ]
        rule_labels = [label[:20] + '...' if len(label) > 20 else label for label in rule_labels]
        plt.barh(range(len(top_by_lift)), top_by_lift['lift'], alpha=0.8)
        plt.yticks(range(len(top_by_lift)), rule_labels)
        plt.xlabel('Lift')
        plt.title('Top 10 Rules by Lift', fontsize=12, fontweight='bold')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Save association rules; frozensets are flattened to sorted strings
        # so the CSV content is identical from run to run
        rules_export = rules.copy()
        rules_export['antecedents_str'] = rules_export['antecedents'].apply(_format_items)
        rules_export['consequents_str'] = rules_export['consequents'].apply(_format_items)

        export_columns = ['antecedents_str', 'consequents_str', 'support', 'confidence',
                          'lift', 'leverage', 'conviction']
        rules_export[export_columns].to_csv('association_rules.csv', index=False)
        print(f"\n💾 Association rules saved to 'association_rules.csv'")

    else:
        print("❌ No association rules found. Consider reducing the confidence threshold.")
        rules = pd.DataFrame()  # Empty dataframe for later use

else:
    print("❌ Cannot generate rules without frequent itemsets.")
    rules = pd.DataFrame()  # Empty dataframe for later use

print("\n✅ Association rules generation completed")

In [None]:
# =============================================================================
# NETWORK VISUALIZATION OF ASSOCIATION RULES
# =============================================================================
#
# Draws the 20 highest-lift rules as a directed graph: products are nodes and
# each rule contributes one edge from an antecedent item to a consequent item.

print("\n" + "="*60)
print("STEP 5: NETWORK VISUALIZATION OF ASSOCIATION RULES")
print("="*60)

if len(rules) > 0:
    print("\n🕸️ Creating network visualization...")

    # Select top rules for visualization (to avoid cluttered graph);
    # nlargest returns them in descending lift order
    top_rules_for_network = rules.nlargest(20, 'lift')

    # Create directed graph
    G = nx.DiGraph()

    for _, row in top_rules_for_network.iterrows():
        # Sorted so that "first item" is well defined; frozenset iteration
        # order can differ between interpreter sessions
        antecedents = sorted(row['antecedents'])
        consequents = sorted(row['consequents'])

        # Every item of the rule becomes a node (adding twice is harmless)
        G.add_nodes_from(antecedents + consequents)

        # For simplicity, connect first antecedent to first consequent.  When
        # several rules map to the same edge, the first one seen (highest
        # lift) is kept instead of being overwritten by a weaker rule.
        if antecedents and consequents and not G.has_edge(antecedents[0], consequents[0]):
            G.add_edge(antecedents[0], consequents[0],
                       weight=row['lift'],
                       confidence=row['confidence'],
                       support=row['support'])

    # Create network visualization
    plt.figure(figsize=(15, 10))

    # Fixed seed so the layout is identical on every run
    pos = nx.spring_layout(G, k=3, iterations=50, seed=42)

    # Node size grows with the number of incident edges
    node_sizes = [300 + G.degree(node) * 100 for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                           node_color='lightblue', alpha=0.7)

    # Edge width is twice the lift, clamped to the range [1, 5]
    edge_weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
    edge_widths = [min(max(weight * 2, 1), 5) for weight in edge_weights]

    nx.draw_networkx_edges(G, pos, width=edge_widths,
                           alpha=0.6, edge_color='gray', arrows=True,
                           arrowsize=20, arrowstyle='->')

    # Shortened labels: text after the last underscore, at most 8 characters
    labels = {node: node.split('_')[-1][:8] for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels, font_size=8, font_weight='bold')

    plt.title('Association Rules Network\n(Top 20 Rules by Lift)',
              fontsize=16, fontweight='bold')
    plt.text(0.02, 0.98, 'Node size: degree centrality\nEdge width: lift value',
             transform=plt.gca().transAxes, fontsize=10,
             verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Network statistics (each directed edge adds one to the degree of both
    # of its endpoints, hence the factor 2 in the average degree)
    print(f"\n📊 Network Statistics:")
    print(f"   Nodes (products): {G.number_of_nodes()}")
    print(f"   Edges (rules): {G.number_of_edges()}")
    print(f"   Average degree: {2 * G.number_of_edges() / G.number_of_nodes():.2f}")

    # Find most connected products
    degree_centrality = nx.degree_centrality(G)
    top_central_nodes = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]

    print(f"\n🔗 Most Connected Products:")
    for node, centrality in top_central_nodes:
        print(f"   {node}: {centrality:.3f} (degree: {G.degree(node)})")

else:
    print("❌ Cannot create network visualization without association rules.")

print("\n✅ Network visualization completed")

In [None]:
# =============================================================================
# BUSINESS INSIGHTS AND RECOMMENDATIONS
# =============================================================================
#
# Splits the rules into cross-category and within-category associations and
# derives a small "customers who buy X also buy Y" table from the 20 most
# confident rules.

print("\n" + "="*70)
print("BUSINESS INSIGHTS AND MARKET BASKET ANALYSIS")
print("="*70)

print(f"\n🛒 MARKET BASKET ANALYSIS INSIGHTS:")
print("=" * 50)

if len(rules) > 0:
    print(f"\n📊 Rule Analysis by Product Categories:")

    def extract_category(product_name):
        """Return the text before the first underscore, or 'Unknown' if there is none."""
        return product_name.split('_')[0] if '_' in product_name else 'Unknown'

    def _is_cross_category(rule):
        """Return True when antecedent and consequent items share no category."""
        antecedent_categories = {extract_category(item) for item in rule['antecedents']}
        consequent_categories = {extract_category(item) for item in rule['consequents']}
        return antecedent_categories.isdisjoint(consequent_categories)

    # A boolean mask keeps the original column dtypes, unlike rebuilding a
    # DataFrame from a list of row Series
    cross_mask = pd.Series(
        [_is_cross_category(row) for _, row in rules.iterrows()],
        index=rules.index, dtype=bool,
    )
    cross_category_rules = rules[cross_mask]
    within_category_rules = rules[~cross_mask]

    print(f"\n🔄 Cross-Category vs Within-Category Rules:")
    print(f"   Cross-category rules: {len(cross_category_rules)} ({len(cross_category_rules)/len(rules)*100:.1f}%)")
    print(f"   Within-category rules: {len(within_category_rules)} ({len(within_category_rules)/len(rules)*100:.1f}%)")

    # Top cross-category associations
    if len(cross_category_rules) > 0:
        top_cross = cross_category_rules.nlargest(10, 'lift')

        print(f"\n🔗 Top Cross-Category Associations:")
        for _, row in top_cross.iterrows():
            # Sorted so the printed text does not depend on frozenset order
            antecedent = ', '.join(sorted(row['antecedents']))
            consequent = ', '.join(sorted(row['consequents']))
            print(f"   {antecedent} → {consequent}")
            print(f"      Lift: {row['lift']:.2f}, Confidence: {row['confidence']:.3f}")

    # Product recommendation analysis
    print(f"\n🎯 PRODUCT RECOMMENDATION INSIGHTS:")
    print("=" * 50)

    # Map each antecedent item to the products its rules point to
    recommendation_dict = {}

    # nlargest yields rules in descending confidence, so the first entry
    # recorded for an (antecedent, product) pair is the strongest one
    for _, row in rules.nlargest(20, 'confidence').iterrows():
        for antecedent in sorted(row['antecedents']):
            recommendations = recommendation_dict.setdefault(antecedent, [])

            for consequent in sorted(row['consequents']):
                # Skip products already recommended for this antecedent so
                # the same product is not listed twice
                if any(rec['product'] == consequent for rec in recommendations):
                    continue
                recommendations.append({
                    'product': consequent,
                    'confidence': row['confidence'],
                    'lift': row['lift']
                })

    print(f"\n🛍️ Product Recommendation Engine:")
    print("   If customer buys... they are likely to also buy:")

    # Show the first five antecedents with up to three recommendations each
    for product, recommendations in list(recommendation_dict.items())[:5]:
        print(f"\n   📦 {product}:")
        sorted_recs = sorted(recommendations, key=lambda x: x['confidence'], reverse=True)[:3]
        for i, rec in enumerate(sorted_recs, 1):
            print(f"      {i}. {rec['product']} (confidence: {rec['confidence']:.3f}, lift: {rec['lift']:.2f})")

# Generic strategic recommendations, grouped by business area.  These are
# static text and do not depend on the rules mined above.
print(f"\n📅 STRATEGIC BUSINESS RECOMMENDATIONS:")
print("=" * 50)

recommendation_sections = (
    ("🛒 Store Layout Optimization:", (
        "   • Place frequently associated products near each other",
        "   • Create product bundles based on high-lift associations",
        "   • Position complementary items at end caps",
    )),
    ("💰 Pricing and Promotion Strategies:", (
        "   • Offer discounts on antecedent products to drive consequent sales",
        "   • Create bundle pricing for strongly associated items",
        "   • Use cross-selling promotions during peak shopping periods",
    )),
    ("📧 Personalized Marketing:", (
        "   • Send targeted recommendations based on purchase history",
        "   • Create personalized email campaigns for product combinations",
        "   • Develop customer-specific promotional offers",
    )),
    ("📦 Inventory Management:", (
        "   • Coordinate inventory levels for associated products",
        "   • Predict demand for consequent items based on antecedent sales",
        "   • Optimize supply chain for product bundles",
    )),
    ("🎯 Customer Segmentation:", (
        "   • Identify customer groups based on purchasing patterns",
        "   • Develop targeted campaigns for different basket types",
        "   • Create loyalty programs based on frequent associations",
    )),
)

# Flatten into one list of lines; every heading after the first carries a
# leading newline so the sections are separated by a blank line when printed
strategic_recommendations = []
for section_number, (heading, bullets) in enumerate(recommendation_sections):
    strategic_recommendations.append(heading if section_number == 0 else "\n" + heading)
    strategic_recommendations.extend(bullets)

print(*strategic_recommendations, sep="\n")

# Final run summary: printed to the notebook and exported as JSON.
import json

has_rules = len(rules) > 0

# Performance metrics summary
print(f"\n📊 ANALYSIS PERFORMANCE SUMMARY:")
print("=" * 50)
print(f"✅ Total transactions analyzed: {len(transactions):,}")
print(f"✅ Unique products identified: {len(te.columns_):,}")
print(f"✅ Frequent itemsets found: {len(frequent_itemsets):,}")
print(f"✅ Association rules generated: {len(rules):,}")
print(f"✅ Minimum support threshold: {min_support} ({min_support*100:.1f}%)")
if has_rules:
    print(f"✅ Average rule confidence: {rules['confidence'].mean():.3f}")
    print(f"✅ Average rule lift: {rules['lift'].mean():.2f}")
    print(f"✅ Strong rules (lift > 1): {(rules['lift'] > 1).sum():,} ({(rules['lift'] > 1).mean()*100:.1f}%)")

# Export final summary.  pandas/NumPy aggregates are converted to built-in
# int/float first: json cannot serialize numpy.int64 (the result of .sum()),
# which previously made json.dump raise TypeError whenever rules existed.
analysis_summary = {
    'transactions_count': len(transactions),
    'unique_products': len(te.columns_),
    'frequent_itemsets_count': len(frequent_itemsets),
    'association_rules_count': len(rules),
    'min_support_used': float(min_support),
    'avg_confidence': float(rules['confidence'].mean()) if has_rules else 0,
    'avg_lift': float(rules['lift'].mean()) if has_rules else 0,
    'strong_rules_count': int((rules['lift'] > 1).sum()) if has_rules else 0
}

with open('market_basket_analysis_summary.json', 'w') as f:
    json.dump(analysis_summary, f, indent=2)

print(f"\n💾 Analysis summary saved to 'market_basket_analysis_summary.json'")

print("\n" + "="*70)
print("✅ MARKET BASKET ANALYSIS COMPLETED SUCCESSFULLY")
print("Association rules identified and business insights generated")
print("="*70)