In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

print("--- STARTING APRIORI MODEL (RAW ITEMS - NO MAPPING) ---\n")

# 1. LOAD DATASET
try:
    df = pd.read_csv('Groceries_dataset.csv')
    df['Member_number'] = df['Member_number'].astype(str)
    print("✓ Dataset Loaded Successfully")
except FileNotFoundError:
    print("❌ Error: 'Groceries_dataset.csv' not found.")
    exit()
#item_mapping = {
    # Meat
    #'sausage': 'Meat', 'frankfurter': 'Meat', 'ham': 'Meat', 'beef': 'Meat', 'pork': 'Meat', 'chicken': 'Meat', 'hamburger meat': 'Meat',
    # Dairy
    #'whole milk': 'Dairy', 'yogurt': 'Dairy', 'curd': 'Dairy', 'butter': 'Dairy', 'cheese': 'Dairy', 'frozen meals': 'Dairy',
    # Fruit & Veg
    #'citrus fruit': 'Fruit', 'tropical fruit': 'Fruit', 'berries': 'Fruit', 'pip fruit': 'Fruit', 'bananas': 'Fruit', 'grapes': 'Fruit',
    #'root vegetables': 'Vegetables', 'other vegetables': 'Vegetables', 'onions': 'Vegetables', 'herbs': 'Vegetables',
    # Bakery
    #'rolls/buns': 'Bakery', 'brown bread': 'Bakery', 'pastry': 'Bakery', 'white bread': 'Bakery',
    # Drinks
    #'soda': 'Drinks', 'bottled water': 'Drinks', 'bottled beer': 'Alcohol', 'canned beer': 'Alcohol'
#}
# ===============================
# 2. PREPROCESSING
# ===============================
# We skip the "Item Mapping" step and go straight to transaction format.
print("✓ Transforming to Transaction Format...")

basket = df.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription'] \
           .count().unstack().fillna(0)

# Convert to Boolean (0 or 1)
def encode_units(x):
    return 1 if x >= 1 else 0

basket_sets = basket.applymap(encode_units).astype(bool)

print(f"✓ Matrix Shape: {basket_sets.shape}")

# ===============================
# 3. TRAIN APRIORI MODEL
# ===============================
print("\n- Training Apriori Model...")

# CRITICAL CHANGE: We must lower min_support because raw items are sparse.
# 0.001 means the item must appear in at least 0.1% of transactions.
frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True)

print(f"✓ Frequent Itemsets Found: {len(frequent_itemsets)}")

# ===============================
# 4. GENERATE RULES
# ===============================
# We use 'lift' as the metric because confidence will naturally be lower for specific items.
# Lift > 1.0 means the relationship is better than random chance.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.1)
rules = rules.sort_values(by='lift', ascending=False)

print(f"✓ Association Rules Generated: {len(rules)}")

# ===============================
# 5. EVALUATION (Top 5 Rules)
# ===============================
print("\n--- TOP 5 STRONGEST RULES ---")
# Helper for clean display
display_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
display_rules['antecedents'] = display_rules['antecedents'].apply(lambda x: list(x)[0])
display_rules['consequents'] = display_rules['consequents'].apply(lambda x: list(x)[0])

print(display_rules.head(5).to_string(index=False))

print("\n--- MODEL STATS ---")
print(rules[['support', 'confidence', 'lift']].describe())

# ===============================
# 6. CONCLUSION
# ===============================
if not rules.empty:
    best_rule = rules.iloc[0]
    ant = list(best_rule['antecedents'])[0]
    con = list(best_rule['consequents'])[0]
    lift = best_rule['lift']
    
    print("\n✅ CONCLUSION:")
    print(f"The strongest relationship found is between '{ant}' and '{con}'.")
    print(f"Lift: {lift:.2f} (Customers are {lift:.2f}x more likely to buy these together).")
else:
    print("\nNo rules found. Try lowering min_support.")

# ===============================
# 7. TA DEMO
# ===============================
print("\n========================================")
print("      TA PRESENTATION DEMO SYSTEM      ")
print("========================================")
print("Type an Item (e.g., 'whole milk', 'sausage', 'yogurt') to see recommendations.")

all_items = list(basket_sets.columns)

while True:
    try:
        user_input = input("\nEnter Item Name (or 'exit'): ").strip()
        if user_input.lower() == 'exit': break
        
        # Match input to dataset columns (case-insensitive)
        found_item = None
        for item in all_items:
            if user_input.lower() == item.lower():
                found_item = item
                break
        
        if not found_item:
            print(f"❌ Item '{user_input}' not found. Try: whole milk, rolls/buns, soda")
            continue
            
        # Find rules where the item is the antecedent
        # We check if the found_item is inside the frozenset
        recs = rules[rules['antecedents'].apply(lambda x: found_item in x)]
        
        if recs.empty:
            print(f"No strong recommendation for {found_item}.")
        else:
            print(f"✅ If they buy '{found_item}', suggest:")
            seen = set()
            count = 0
            for idx, row in recs.head(5).iterrows():
                cons = list(row['consequents'])[0]
                if cons not in seen and cons != found_item:
                    print(f"   -> {cons} (Lift: {row['lift']:.2f})")
                    seen.add(cons)
                    count += 1
            if count == 0:
                 print("   (No specific recommendations found above threshold)")

    except Exception as e:
        print(f"Error: {e}")

--- STARTING APRIORI MODEL (RAW ITEMS - NO MAPPING) ---

✓ Dataset Loaded Successfully
✓ Transforming to Transaction Format...


  basket_sets = basket.applymap(encode_units).astype(bool)


✓ Matrix Shape: (14963, 167)

- Training Apriori Model...
✓ Frequent Itemsets Found: 750
✓ Association Rules Generated: 142

--- TOP 5 STRONGEST RULES ---
 antecedents         consequents  support  confidence     lift
     sausage          whole milk 0.001470    0.024363 2.182917
  whole milk             sausage 0.001470    0.131737 2.182917
  whole milk              yogurt 0.001470    0.164179 1.911760
      yogurt          whole milk 0.001470    0.017121 1.911760
citrus fruit specialty chocolate 0.001403    0.026415 1.653762

--- MODEL STATS ---
          support  confidence        lift
count  142.000000  142.000000  142.000000
mean     0.001483    0.056680    1.286441
std      0.000793    0.045302    0.191656
min      0.001002    0.006771    1.106100
25%      0.001069    0.022406    1.151307
50%      0.001337    0.042259    1.232030
75%      0.001470    0.079161    1.347267
max      0.005748    0.255814    2.182917

✅ CONCLUSION:
The strongest relationship found is between 'sausage'

In [1]:
# ===============================
# IMPORT LIBRARIES
# ===============================
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# LOAD DATASET
df = pd.read_csv('Groceries_dataset.csv')

# ===============================
# SIMPLE PREPROCESSING
# ===============================
# إنشاء جدول Market Basket بسيط: كل صف = Member + Date، كل عمود = منتج
basket = df.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription'] \
           .count().unstack().fillna(0)

# تحويل القيم لـ Boolean
basket_sets = basket.astype(bool)

# إزالة المنتجات النادرة
basket_sets = basket_sets.loc[:, basket_sets.sum() >= 20]

In [3]:
# ===============================
# TRAIN APRIORI MODEL
# ===============================
frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True)

# ===============================
# GENERATE ASSOCIATION RULES
# ===============================
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.1)
rules = rules.sort_values(by='lift', ascending=False)

# ===============================
# Top 5 Strongest Rules
# ===============================
print("\nTop 5 Strongest Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(5))


Top 5 Strongest Rules:
               antecedents            consequents   support  confidence  \
238              (sausage)   (whole milk, yogurt)  0.001470    0.024363   
235   (whole milk, yogurt)              (sausage)  0.001470    0.131737   
234  (whole milk, sausage)               (yogurt)  0.001470    0.164179   
239               (yogurt)  (whole milk, sausage)  0.001470    0.017121   
86   (specialty chocolate)         (citrus fruit)  0.001403    0.087866   

         lift  
238  2.182917  
235  2.182917  
234  1.911760  
239  1.911760  
86   1.653762  


In [4]:
# ===============================
# EVALUATION
# ===============================
print("\nModel Evaluation:")
print(rules[['support', 'confidence', 'lift']].describe())

# ===============================
# CONCLUSION
# ===============================
if not rules.empty:
    best_rule = rules.iloc[0]
    ant = list(best_rule['antecedents'])[0]
    con = list(best_rule['consequents'])[0]
    lift = best_rule['lift']
    
    print("\nThe strongest relationship is between '{}' and '{}' with Lift {:.4f}".format(ant, con, lift))
else:
    print("\nNo significant rules found.")


Model Evaluation:
          support  confidence        lift
count  240.000000  240.000000  240.000000
mean     0.001612    0.055186    1.186579
std      0.000881    0.042429    0.191060
min      0.001002    0.006771    1.000136
25%      0.001136    0.024330    1.051529
50%      0.001370    0.041638    1.122790
75%      0.001671    0.074008    1.253237
max      0.005948    0.255814    2.182917

The strongest relationship is between 'sausage' and 'whole milk' with Lift 2.1829
