In [22]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [24]:
# -------------------------
# Step 1 - Load Dataset
# -------------------------
df = pd.read_csv("Megastore_Dataset_Task_3 3.csv")

# Preview data
print(df.head())

   OrderID                         ProductName  Quantity     InvoiceDate  \
0   536370         INFLATABLE POLITICAL GLOBE         48  12/1/2010 8:45   
1   536370      SET2 RED RETROSPOT TEA TOWELS         18  12/1/2010 8:45   
2   536370     PANDA AND BUNNIES STICKER SHEET        12  12/1/2010 8:45   
3   536370       RED TOADSTOOL LED NIGHT LIGHT        24  12/1/2010 8:45   
4   536370  VINTAGE HEADS AND TAILS CARD GAME         24  12/1/2010 8:45   

  UnitPrice  TotalCost         Country DiscountApplied OrderPriority  \
0     $0.85     $40.80   United States             Yes          High   
1     $2.95     $53.10   United States             Yes          High   
2     $0.85     $10.20   United States             Yes          High   
3     $1.65     $39.60   United States             Yes          High   
4     $1.25     $30.00   United States             Yes          High   

      Region    Segment ExpeditedShipping PaymentMethod  \
0  Northeast  Corporate               Yes   Credit 

In [26]:
ordinal_vars = ['OrderPriority', 'ExpeditedShipping']
nominal_vars = ['Segment', 'PaymentMethod']

for col in ordinal_vars + nominal_vars:
    print(f"{col} unique values:", df[col].unique())



OrderPriority unique values: ['High' 'Medium']
ExpeditedShipping unique values: ['Yes' 'No']
Segment unique values: ['Corporate' 'Consumer']
PaymentMethod unique values: ['Credit Card' 'PayPal']


In [28]:
# -------------------------
# Step 3 - Encode Variables
# -------------------------
# Encoding ordinal variables
df['ExpeditedShipping'] = df['ExpeditedShipping'].map({'Yes': 1, 'No': 0})

order_priority_mapping = {'Low': 1, 'Medium': 2, 'High': 3, 'Critical': 4}
df['OrderPriority'] = df['OrderPriority'].map(order_priority_mapping)



In [8]:
# One-hot encode nominal variables
df_nominal_encoded = pd.get_dummies(df[nominal_vars])

# Combine ordinal + nominal
df_encoded = pd.concat([df[['OrderID']], df_nominal_encoded, df['ExpeditedShipping'], df['OrderPriority']], axis=1)

print("\nEncoded Data Preview:")
print(df_encoded.head())


Encoded Data Preview:
   OrderID  Segment_Consumer  Segment_Corporate  PaymentMethod_Credit Card  \
0   536370             False               True                       True   
1   536370             False               True                       True   
2   536370             False               True                       True   
3   536370             False               True                       True   
4   536370             False               True                       True   

   PaymentMethod_PayPal ExpeditedShipping OrderPriority  
0                 False               Yes          High  
1                 False               Yes          High  
2                 False               Yes          High  
3                 False               Yes          High  
4                 False               Yes          High  


In [30]:
# -------------------------
# Step 4 - Transactionalize Products
# -------------------------
# Grouping product names by OrderID
basket = df.groupby(['OrderID'])['ProductName'].apply(list)

# Convert to True/False format
te = TransactionEncoder()
te_data = te.fit(basket).transform(basket)

df_products = pd.DataFrame(te_data, columns=te.columns_)

print("\nTransactional Product Data:")
print(df_products.head())



Transactional Product Data:
    50S CHRISTMAS GIFT BAG LARGE   DOLLY GIRL BEAKER  \
0                          False               False   
1                          False               False   
2                          False               False   
3                          False               False   
4                          False               False   

    I LOVE LONDON MINI BACKPACK   NINE DRAWER OFFICE TIDY  \
0                         False                     False   
1                         False                     False   
2                         False                     False   
3                         False                     False   
4                         False                     False   

    SET 2 TEA TOWELS I LOVE LONDON    SPACEBOY BABY GIFT SET  \
0                             False                    False   
1                             False                    False   
2                             False                    False   
3          

In [32]:
# Save Product-Only dataset
df_products.to_csv("Product_Transactional_Dataset.csv", index=False)

print("Both datasets saved!")

Both datasets saved!


In [34]:
# -------------------------
# Run Apriori on Product-Only dataset
# -------------------------
frequent_itemsets = apriori(df_products, min_support=0.01, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [36]:
# Show top rules
print("\nTop Association Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False).head(10))



Top Association Rules:
                                             antecedents  \
75211  (CARD DOLLY GIRL , ALARM CLOCK BAKELIKE RED , ...   
82060  (ALARM CLOCK BAKELIKE GREEN, SET6 RED SPOTTY P...   
82055  (ALARM CLOCK BAKELIKE GREEN, SET6 RED SPOTTY P...   
4761                (SMALL DOLLY MIX DESIGN ORANGE BOWL)   
75210  (CARD DOLLY GIRL , ALARM CLOCK BAKELIKE PINK, ...   
75207  (CHILDRENS CUTLERY SPACEBOY , SPACEBOY BIRTHDA...   
75206  (ALARM CLOCK BAKELIKE PINK, CHILDRENS CUTLERY ...   
67669  (SET6 RED SPOTTY PAPER CUPS, SET10 BLUE POLKAD...   
75188  (ALARM CLOCK BAKELIKE RED , CHILDRENS CUTLERY ...   
71975  (ALARM CLOCK BAKELIKE GREEN, PLASTERS IN TIN C...   

                                             consequents   support  \
75211  (ALARM CLOCK BAKELIKE PINK, CHILDRENS CUTLERY ...  0.011338   
82060  (SET6 RED SPOTTY PAPER PLATES, ALARM CLOCK BAK...  0.011338   
82055  (SET6 RED SPOTTY PAPER CUPS, PLASTERS IN TIN C...  0.011338   
4761                      (SMALL MA

In [38]:
# Sort rules by lift
rules_sorted_by_lift = rules.sort_values(by='lift', ascending=False)

# Show rules table 
print("\nRules Table (Support, Confidence, Lift):")
rules_sorted_by_lift[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)




Rules Table (Support, Confidence, Lift):


Unnamed: 0,antecedents,consequents,support,confidence,lift
75211,"(CARD DOLLY GIRL , ALARM CLOCK BAKELIKE RED , ...","(ALARM CLOCK BAKELIKE PINK, CHILDRENS CUTLERY ...",0.011338,1.0,88.2
82060,"(ALARM CLOCK BAKELIKE GREEN, SET6 RED SPOTTY P...","(SET6 RED SPOTTY PAPER PLATES, ALARM CLOCK BAK...",0.011338,1.0,88.2
82055,"(ALARM CLOCK BAKELIKE GREEN, SET6 RED SPOTTY P...","(SET6 RED SPOTTY PAPER CUPS, PLASTERS IN TIN C...",0.011338,1.0,88.2
4761,(SMALL DOLLY MIX DESIGN ORANGE BOWL),(SMALL MARSHMALLOWS PINK BOWL),0.011338,1.0,88.2
75210,"(CARD DOLLY GIRL , ALARM CLOCK BAKELIKE PINK, ...","(CHILDRENS CUTLERY SPACEBOY , SPACEBOY BIRTHDA...",0.011338,1.0,88.2
75207,"(CHILDRENS CUTLERY SPACEBOY , SPACEBOY BIRTHDA...","(CARD DOLLY GIRL , ALARM CLOCK BAKELIKE PINK, ...",0.011338,1.0,88.2
75206,"(ALARM CLOCK BAKELIKE PINK, CHILDRENS CUTLERY ...","(CARD DOLLY GIRL , ALARM CLOCK BAKELIKE RED , ...",0.011338,1.0,88.2
67669,"(SET6 RED SPOTTY PAPER CUPS, SET10 BLUE POLKAD...","(SET10 PINK POLKADOT PARTY CANDLES, SET6 RED S...",0.011338,1.0,88.2
75188,"(ALARM CLOCK BAKELIKE RED , CHILDRENS CUTLERY ...","(CARD DOLLY GIRL , ALARM CLOCK BAKELIKE PINK)",0.011338,1.0,88.2
71975,"(ALARM CLOCK BAKELIKE GREEN, PLASTERS IN TIN C...","(PLASTERS IN TIN SPACEBOY, SKULL LUNCH BOX WIT...",0.011338,1.0,88.2


In [40]:
# -------------------------
#  Top 3 Relevant Rules
# -------------------------
top_rules = rules.sort_values(by='lift', ascending=False).head(3)

print("\nTop 3 Rules:")
for idx, rule in top_rules.iterrows():
    print(f"\nRule #{idx+1}")
    print("If items:", list(rule['antecedents']), "-> Then items:", list(rule['consequents']))
    print("Support:", rule['support'])
    print("Confidence:", rule['confidence'])
    print("Lift:", rule['lift'])


Top 3 Rules:

Rule #75212
If items: ['CARD DOLLY GIRL ', 'ALARM CLOCK BAKELIKE RED ', 'ROUND SNACK BOXES SET OF4 WOODLAND '] -> Then items: ['ALARM CLOCK BAKELIKE PINK', 'CHILDRENS CUTLERY SPACEBOY ', 'SPACEBOY BIRTHDAY CARD']
Support: 0.011337868480725623
Confidence: 1.0
Lift: 88.2

Rule #82061
If items: ['ALARM CLOCK BAKELIKE GREEN', 'SET6 RED SPOTTY PAPER CUPS', 'PLASTERS IN TIN CIRCUS PARADE ', 'ALARM CLOCK BAKELIKE RED '] -> Then items: ['SET6 RED SPOTTY PAPER PLATES', 'ALARM CLOCK BAKELIKE PINK', 'ROUND SNACK BOXES SET OF4 WOODLAND ']
Support: 0.011337868480725623
Confidence: 1.0
Lift: 88.2

Rule #82056
If items: ['ALARM CLOCK BAKELIKE GREEN', 'SET6 RED SPOTTY PAPER PLATES', 'ALARM CLOCK BAKELIKE PINK', 'ROUND SNACK BOXES SET OF4 WOODLAND '] -> Then items: ['SET6 RED SPOTTY PAPER CUPS', 'PLASTERS IN TIN CIRCUS PARADE ', 'ALARM CLOCK BAKELIKE RED ']
Support: 0.011337868480725623
Confidence: 1.0
Lift: 88.2
