# **Lab 6, Information Retrieval** #

### **Authors:** ###
**1. Kuba Czech, 156035**

**2. Wojciech Nagórka, 156045**

In [2]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

You shall already be familiar with the concept of association rules and the apriori algorithm. Association rule mining is a method for discovering patterns within large data sets. It focuses on identifying relationships between variables and leveraging those connections to make predictions or informed decisions. The primary objective is to uncover rules that reveal the associations between various items in the data.



## **Task 1 - loading data** ##
Load data from data.txt file - it contains lists of grocery shopping done by nearly 2000 customers.
Store it in a boolean one hot encoded dataframe - True for items bought in a given transaction, False otherwise.

In [3]:
f = open("data.txt", "r")
data = f.read()
data_read = []
for rec in data.split('\n'):
    l = []
    rec = rec[1:-1] # getting rid of [ and ]
    rec = "".join(rec.split(',')).split()
    for str in rec:
        str = str[1:-1]
        l.append(str)
    data_read.append(list(set(l)))
data_read

[['orange', 'chicken', 'apple', 'milk', 'banana', 'chocolate'],
 ['yogurt', 'milk', 'cheese'],
 ['chicken'],
 ['pork', 'milk'],
 ['chicken'],
 ['sausage', 'cheese', 'chicken', 'beef'],
 ['milk', 'mustard'],
 ['bread'],
 ['orange',
  'ketchup',
  'sausage',
  'apple',
  'cheese',
  'beef',
  'banana',
  'pork',
  'grill'],
 ['orange', 'cheese', 'chicken'],
 ['bread', 'banana', 'apple', 'pork'],
 ['apple', 'chicken'],
 ['yogurt', 'milk'],
 ['chicken'],
 ['banana', 'cheese', 'beef'],
 ['bread', 'sausage', 'pork', 'cheese'],
 ['chocolate', 'yogurt', 'milk'],
 ['orange', 'apple', 'cheese', 'beef', 'banana', 'mustard', 'yogurt'],
 ['chicken'],
 ['orange', 'banana', 'milk', 'sausage'],
 ['chicken'],
 ['bread', 'pork'],
 ['bread', 'yogurt', 'chicken'],
 ['orange', 'chicken', 'sausage', 'milk', 'chocolate'],
 ['sausage', 'mustard', 'grill'],
 ['bread', 'apple'],
 ['cheese', 'chicken'],
 ['orange', 'banana', 'apple', 'chicken'],
 ['chicken', 'sausage', 'cheese', 'mustard', 'grill'],
 ['yogurt', 

In [4]:
df = pd.DataFrame({'Customer_ID': range(1, len(data_read) + 1), 'Products': data_read})
df_exploded = df.explode('Products')
dummy_df = pd.crosstab(df_exploded['Customer_ID'], df_exploded['Products'])
dummy_df = dummy_df
dummy_df

Products,apple,banana,beef,bread,butter,cheese,chicken,chocolate,eggs,grill,ketchup,milk,mustard,orange,pork,sausage,wagyu,yogurt
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0
1913,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1914,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1915,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


To extract rules you can use e.g. apriori algorithm implemented in mlxtend. There are other algorithms performing the same task but using different approaches e.g. fpgrowth internally uses a tree-based structure which makes it faster in most real-life examples.

## **Task 2 - Finding assocaition rules** ##
Find association rules using selected algorithm

### **Apriori algorithm approach** ###

In [56]:
frequent_itemsets = apriori(dummy_df, min_support=0.03, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.151879,(apple)
1,0.165449,(banana)
2,0.127871,(beef)
3,0.253653,(bread)
4,0.212422,(cheese)
...,...,...
86,0.049061,"(sausage, ketchup, grill)"
87,0.040710,"(sausage, milk, grill)"
88,0.057411,"(mustard, sausage, grill)"
89,0.031315,"(milk, pork, yogurt)"


In [57]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

           antecedents consequents   support  confidence      lift
0              (grill)   (sausage)  0.105428    0.930876  3.746970
1            (ketchup)   (sausage)  0.052192    0.826446  3.326620
2      (apple, banana)    (orange)  0.051148    0.816667  5.031297
3      (apple, orange)    (banana)  0.051148    0.809917  4.895273
4     (banana, orange)     (apple)  0.051148    0.816667  5.377090
5     (grill, chicken)   (sausage)  0.044363    0.934066  3.759812
6   (sausage, ketchup)     (grill)  0.049061    0.940000  8.299724
7     (ketchup, grill)   (sausage)  0.049061    0.989474  3.982839
8        (milk, grill)   (sausage)  0.040710    0.951220  3.828858
9   (mustard, sausage)     (grill)  0.057411    0.948276  8.372795
10    (mustard, grill)   (sausage)  0.057411    1.000000  4.025210


### **Fpgrowth algorithm approach** ###

In [58]:
frequent_itemsets = fpgrowth(dummy_df, min_support=0.03, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.420668,(chicken)
1,0.400313,(milk)
2,0.179541,(chocolate)
3,0.165449,(banana)
4,0.162317,(orange)
...,...,...
86,0.052192,"(sausage, ketchup)"
87,0.049582,"(ketchup, grill)"
88,0.049061,"(sausage, ketchup, grill)"
89,0.033925,"(milk, eggs)"


In [59]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

           antecedents consequents   support  confidence      lift
0      (apple, banana)    (orange)  0.051148    0.816667  5.031297
1      (apple, orange)    (banana)  0.051148    0.809917  4.895273
2     (banana, orange)     (apple)  0.051148    0.816667  5.377090
3   (mustard, sausage)     (grill)  0.057411    0.948276  8.372795
4     (mustard, grill)   (sausage)  0.057411    1.000000  4.025210
5              (grill)   (sausage)  0.105428    0.930876  3.746970
6     (grill, chicken)   (sausage)  0.044363    0.934066  3.759812
7        (milk, grill)   (sausage)  0.040710    0.951220  3.828858
8            (ketchup)   (sausage)  0.052192    0.826446  3.326620
9   (sausage, ketchup)     (grill)  0.049061    0.940000  8.299724
10    (ketchup, grill)   (sausage)  0.049061    0.989474  3.982839


Both (fpgrowth and apriori algorithm) give the same results

## **Task 3 - Report** ##
The association rules are characterized by high support - frequency in the dataset. Can you use this algorithm as a base and try to extract different types of rules:
 - dissociation rules e.g. buying Porshe and Rolex is not frequent in the dataset, but usually people who bought Porshe also bought Rolex
 - negative rules e.g. if someone bought low-fat milk it's unlikely there will be whole milk in the basket
 - disjunction e.g. eggs and (kielecki xor winiary ;) )
 - imagine 50% of baskets have milk and 50% of baskets have tea. If there is no relation between them then in ~25% of baskets we will have both. If milk appears together with tea in e.g. 40% of baskets it means there is a pattern. Can you find such rules and use statistical tests to check if the relation is strong?

 Send the report within 144 hours starting from the end of this class to gmiebs@cs.put.poznan.pl; start this email's subject with [IR]


### **Association rules** ###

In [60]:
frequent_itemsets = apriori(dummy_df, min_support=0.03, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

           antecedents consequents   support  confidence      lift
0              (grill)   (sausage)  0.105428    0.930876  3.746970
1            (ketchup)   (sausage)  0.052192    0.826446  3.326620
2      (apple, banana)    (orange)  0.051148    0.816667  5.031297
3      (apple, orange)    (banana)  0.051148    0.809917  4.895273
4     (banana, orange)     (apple)  0.051148    0.816667  5.377090
5     (grill, chicken)   (sausage)  0.044363    0.934066  3.759812
6   (sausage, ketchup)     (grill)  0.049061    0.940000  8.299724
7     (ketchup, grill)   (sausage)  0.049061    0.989474  3.982839
8        (milk, grill)   (sausage)  0.040710    0.951220  3.828858
9   (mustard, sausage)     (grill)  0.057411    0.948276  8.372795
10    (mustard, grill)   (sausage)  0.057411    1.000000  4.025210




First we have association rules and derived rules can be divided into two main categories: grill and fruits.

First category consists of rules that are related to grill - we have here products like grill, sausage, chicken, ketchup, mustard. In this category we have some very good rules with good support and very high confidence (exceeding 90%):

1. (mustard, grill) -> sausage
2. (ketchup, grill) -> sausage
3. (mustard, sausage) -> grill
4. (sausage, ketchup) -> grill
5. (grill, chicken) -> sausage
6. (grill) -> sausage

Above rules make much sense - if somebody buys mustard (or ketchup) and grill, then it is logical to buy something to put on the grill and something that you can use mustard (or ketchup) for. The same goes when somebody buys just the grill and additionally buys chicken - it makes no sense to store grill as a furniture at home but it makes sense to buy some sausages.


Fruit category consists of 3 rules (no 2, 3 and 4) - if there are two fruits (apple, orange, banana), it is very likely that third fruit was also purchased. Those rules have support close to 0.05 and its confidence is little above set threshold - rules are solid but work in 80% of situations when 2 fruits were bought:

1. (apple, banana) -> orange
2. (banana, orange) -> apple
3. (apple, orange) -> banana

There is also one rule that does not belong to any of those categories (rule 8) - (milk, grill) -> sausage. In this case it makes much sense to buy sausages after buying grill but it is hard to explain how did milk find here. The other strange fact is very high confidence (95%) - when milk and grill were bought, sausages were very likely to be bought.

### **Dissociation rules** ###

In [67]:
frequent_itemsets = apriori(dummy_df, min_support = 0.015, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.95)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

                    antecedents consequents   support  confidence      lift
0               (cheese, grill)   (sausage)  0.022443    0.955556  3.846312
1            (chocolate, grill)   (sausage)  0.020355    1.000000  4.025210
2              (ketchup, grill)   (sausage)  0.049061    0.989474  3.982839
3                 (milk, grill)   (sausage)  0.040710    0.951220  3.828858
4              (mustard, grill)   (sausage)  0.057411    1.000000  4.025210
5      (apple, sausage, orange)    (banana)  0.015658    1.000000  6.044164
6   (sausage, ketchup, chicken)     (grill)  0.018267    0.972222  8.584229
7     (ketchup, grill, chicken)   (sausage)  0.018267    1.000000  4.025210
8   (mustard, sausage, chicken)     (grill)  0.026096    0.961538  8.489897
9     (mustard, grill, chicken)   (sausage)  0.026096    1.000000  4.025210
10       (milk, ketchup, grill)   (sausage)  0.017745    1.000000  4.025210
11     (mustard, sausage, milk)     (grill)  0.022965    1.000000  8.829493
12       (mu



Those will be the rules with small support (let's say 0.015) but very high confidence (let's say 0.95). Here most rules relate to some sort of grill activities:

1. If somebody bought a grill and some second, unrelated product (cheese, chocoloate, milk) he also bought sausage (rules 0, 1, 3)
2. If seombody bought some grill products (grill, chicken any of the sauces), he also bought sausage or grill (rules 2, 4, 6, 7, 8, 9)
3. Again some questions arise where does milk come from - it is not really grill product but buying it together with grill, sausage or sauces resulted in buying sausage or grill (rules 10, 11, 12)
4. We also have a rule governing fruits and suasage: (apple, sausage, orange) -> banana. To be honest, this is strange mix and I wouldn't suspect such an outcome. This rule probably occured due to lowering the support from 0.03 to 0.015 (in our experiment appeared some rules that are very rare but have very high confidence - once they appeared we can be sure that they work)

Probably lowering support more would result in appearing of many strange rules that make little sense (we can encounter some strange rule e. g. (cheese, wagyu) -> (apple) only because it indeed appeared in the dataset twice and each time cheese and wagyu appeared, then apple was bought; however if something appeared twice in the dataset that contains almost 2 000 records, quality of such a rule can be questionable. This probably is the reason of presence of rule 5 here)

### **Negative rules** ###

We need to update our dummy dataframe so it also contains probabilities that a product was NOT bought

Our approach was to create additional columns which are opposition to what was written in the column with the product (additional 18 features), generate frequent itemsets, find association rules and at the end filter them to extract those that can be presented in such a way: $ A -> \neg B$ where A and B are products. However, the problem is that time needed to calculate everythng was almost 3 hours so we advice not to restart the notebook.

In [5]:
new_df = pd.DataFrame()

for col in dummy_df.columns:
    new_name = 'not_' + col
    new_col = 1 - dummy_df[col]
    new_df[new_name] = new_col

new_df

Unnamed: 0_level_0,not_apple,not_banana,not_beef,not_bread,not_butter,not_cheese,not_chicken,not_chocolate,not_eggs,not_grill,not_ketchup,not_milk,not_mustard,not_orange,not_pork,not_sausage,not_wagyu,not_yogurt
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0,1,1,1,1,0,0,1,1,1,0,1,0,1,1,1,1
2,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0
3,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1
5,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,1,1,1,0,1,1,1,0,0,1,1,0,1,1,1,1,1,1
1913,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
1914,0,1,1,0,1,1,1,1,1,1,1,0,1,1,0,1,1,1
1915,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [6]:
new_df = dummy_df.copy()

for col in dummy_df.columns:
    new_name = 'not_' + col
    new_col = 1 - dummy_df[col]
    new_df[new_name] = new_col

new_df

Products,apple,banana,beef,bread,butter,cheese,chicken,chocolate,eggs,grill,...,not_eggs,not_grill,not_ketchup,not_milk,not_mustard,not_orange,not_pork,not_sausage,not_wagyu,not_yogurt
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,0,0,0,0,1,1,0,0,...,1,1,1,0,1,0,1,1,1,1
2,0,0,0,0,0,1,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
3,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,0,1,1,1
5,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,0,0,0,1,0,0,0,1,1,0,...,0,1,1,0,1,1,1,1,1,1
1913,0,0,1,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,1
1914,1,0,0,1,0,0,0,0,0,0,...,1,1,1,0,1,1,0,1,1,1
1915,0,0,1,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [27]:
frequent_itemsets = fpgrowth(new_df, min_support = 0.03, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.993215,(not_wagyu)
1,0.992693,(not_butter)
2,0.936848,(not_ketchup)
3,0.919102,(not_mustard)
4,0.904489,(not_eggs)
...,...,...
1820107,0.030271,"(not_bread, not_ketchup, chicken, eggs, not_pork)"
1820108,0.030271,"(not_bread, not_ketchup, chicken, not_butter, ..."
1820109,0.030271,"(not_bread, not_ketchup, chicken, eggs, not_wa..."
1820110,0.030271,"(not_bread, not_mustard, chicken, eggs, not_wa..."


In [28]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

                                      antecedents               consequents  \
0                                    (not_butter)               (not_wagyu)   
1                                     (not_wagyu)              (not_butter)   
2                                    (not_butter)             (not_ketchup)   
3                                   (not_ketchup)              (not_butter)   
4                                   (not_ketchup)               (not_wagyu)   
...                                           ...                       ...   
57229393    (not_wagyu, not_bread, chicken, eggs)             (not_mustard)   
57229394               (not_bread, chicken, eggs)  (not_mustard, not_wagyu)   
57229395  (not_bread, not_mustard, chicken, eggs)                (not_pork)   
57229396     (not_bread, not_pork, chicken, eggs)             (not_mustard)   
57229397               (not_bread, chicken, eggs)   (not_mustard, not_pork)   

           support  confidence      lift  
0       

In [29]:
negative_rules = rules.copy()
negative_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(not_butter),(not_wagyu),0.992693,0.993215,0.991127,0.998423,1.005243,0.005170,4.301670,0.713834
1,(not_wagyu),(not_butter),0.993215,0.992693,0.991127,0.997898,1.005243,0.005170,3.476253,0.768745
2,(not_butter),(not_ketchup),0.992693,0.936848,0.931628,0.938486,1.001749,0.001626,1.026631,0.238896
3,(not_ketchup),(not_butter),0.936848,0.992693,0.931628,0.994429,1.001749,0.001626,1.311587,0.027641
4,(not_ketchup),(not_wagyu),0.936848,0.993215,0.931628,0.994429,1.001222,0.001137,1.217902,0.019330
...,...,...,...,...,...,...,...,...,...,...
57229393,"(not_wagyu, not_bread, chicken, eggs)",(not_mustard),0.031837,0.919102,0.030271,0.950820,1.034509,0.001010,1.644920,0.034455
57229394,"(not_bread, chicken, eggs)","(not_mustard, not_wagyu)",0.032359,0.912317,0.030271,0.935484,1.025393,0.000750,1.359081,0.025592
57229395,"(not_bread, not_mustard, chicken, eggs)",(not_pork),0.030793,0.794885,0.030271,0.983051,1.236721,0.005794,12.101775,0.197491
57229396,"(not_bread, not_pork, chicken, eggs)",(not_mustard),0.031837,0.919102,0.030271,0.950820,1.034509,0.001010,1.644920,0.034455


In [34]:
negative_rules_filtered = negative_rules[
    negative_rules['antecedents'].apply(lambda x: all(not elem.startswith('not_') for elem in x))
]
negative_rules_filtered = negative_rules_filtered[
    negative_rules_filtered['consequents'].apply(lambda x: all(elem.startswith('not_') for elem in x))
]
negative_rules_filtered[['antecedents', 'consequents', 'support', 'confidence']]

Unnamed: 0,antecedents,consequents,support,confidence
1515961,(chicken),(not_butter),0.416493,0.990074
1515962,(chicken),(not_wagyu),0.416493,0.990074
1515963,(chicken),(not_beef),0.411795,0.978908
1515964,(chicken),(not_pork),0.407620,0.968983
1515965,(chicken),(not_ketchup),0.397704,0.945409
...,...,...,...,...
57222467,"(chicken, eggs)","(not_butter, not_chocolate, not_pork)",0.034447,0.804878
57222494,"(chicken, eggs)","(not_butter, not_chocolate, not_wagyu, not_pork)",0.034447,0.804878
57222709,"(chicken, eggs)","(not_chocolate, not_wagyu, not_grill)",0.034447,0.804878
57223475,"(chicken, eggs)","(not_chocolate, not_ketchup, not_wagyu)",0.034447,0.804878


### **Discjunction** ###

To achieve our goal we decided to:
1. Select all strongly associated pairs where both the antecedent and the consequent have a single item and the confidence is greater than 0.5
2. Select all triplets (representing potential disjunctions) where: 

    a. the antecedent contains two items 

    b. the consequent contains one item

    c. the confidence of the triplet is low (less than or equal to 0.2; if the confidence would be high then it means that confidence of xor operation would be very low)
3. Filter triplets in following way:

    a. decompose the two antecedents and the consequent into two pairs: (antecedent1, consequent), (antecedent2, consequent)

    b. retain only those triplets where both pairs exist in correlated_pairs

In [39]:
frequent_itemsets = fpgrowth(dummy_df, min_support=1/len(dummy_df), use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.420668,(chicken)
1,0.400313,(milk)
2,0.179541,(chocolate)
3,0.165449,(banana)
4,0.162317,(orange)
...,...,...
6977,0.000522,"(orange, bread, butter)"
6978,0.000522,"(orange, cheese, butter)"
6979,0.000522,"(orange, apple, butter)"
6980,0.000522,"(orange, bread, cheese, butter)"


In [42]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold = 0.5)

corr_pairs = rules[(rules['antecedents'].str.len() == 1) & (rules['consequents'].str.len() == 1)]
corr_pairs.reset_index(drop=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(chocolate),(milk),0.179541,0.400313,0.122651,0.68314,1.706513,0.050779,1.89259,0.504607
1,(yogurt),(milk),0.237474,0.400313,0.129436,0.545055,1.361571,0.034372,1.318152,0.348256
2,(mustard),(sausage),0.080898,0.248434,0.060543,0.748387,3.012415,0.040445,2.986992,0.72684
3,(mustard),(grill),0.080898,0.113257,0.057411,0.709677,6.266092,0.048249,3.054338,0.914382
4,(grill),(mustard),0.113257,0.080898,0.057411,0.506912,6.266092,0.048249,1.863974,0.94775
5,(grill),(sausage),0.113257,0.248434,0.105428,0.930876,3.74697,0.077291,10.872651,0.826753
6,(ketchup),(sausage),0.063152,0.248434,0.052192,0.826446,3.32662,0.036503,4.33045,0.74654
7,(ketchup),(grill),0.063152,0.113257,0.049582,0.785124,6.932247,0.04243,4.126767,0.913432
8,(wagyu),(chicken),0.006785,0.420668,0.004175,0.615385,1.462875,0.001321,1.506263,0.318576
9,(butter),(wagyu),0.007307,0.006785,0.005219,0.714286,105.274725,0.00517,3.476253,0.997792


In [43]:
triplets = rules[(rules['antecedents'].str.len() == 2) & (rules['consequents'].str.len() == 1)]
triplets.reset_index(drop=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(chocolate, chicken)",(milk),0.074113,0.400313,0.049061,0.661972,1.653635,0.019392,1.774074,0.426911
1,"(chocolate, yogurt)",(milk),0.043841,0.400313,0.033403,0.761905,1.903272,0.015853,2.518685,0.496350
2,"(chocolate, sausage)",(milk),0.043319,0.400313,0.028706,0.662651,1.655331,0.011364,1.777643,0.413817
3,"(chocolate, cheese)",(milk),0.039144,0.400313,0.028706,0.733333,1.831899,0.013036,2.248826,0.472619
4,"(chocolate, bread)",(milk),0.047495,0.400313,0.033403,0.703297,1.756866,0.014390,2.021167,0.452286
...,...,...,...,...,...,...,...,...,...,...
272,"(orange, butter)",(bread),0.001044,0.253653,0.000522,0.500000,1.971193,0.000257,1.492693,0.493208
273,"(orange, butter)",(cheese),0.001044,0.212422,0.000522,0.500000,2.353808,0.000300,1.575157,0.575758
274,"(orange, butter)",(apple),0.001044,0.151879,0.000522,0.500000,3.292096,0.000363,1.696242,0.696970
275,"(apple, butter)",(orange),0.001044,0.162317,0.000522,0.500000,3.080386,0.000352,1.675365,0.676071


In [44]:
def isValid(row):
    ant = list(row['antecedents'])
    cons = list(row['consequents'])[0] 
    
    # Create the two pairs: (antecedent1, consequent) and (antecedent2, consequent)
    pair1 = (frozenset([ant[0]]), frozenset([cons]))
    pair2 = (frozenset([ant[1]]), frozenset([cons]))

    # Check if both pairs exist in common_pairs as antecedent-consequent pairs
    return (
        any((pair1[0] == a and pair1[1] == c) for a, c in zip(corr_pairs['antecedents'], corr_pairs['consequents']))
        and 
        any((pair2[0] == a and pair2[1] == c) for a, c in zip(corr_pairs['antecedents'], corr_pairs['consequents']))
    )   

filtered_triplets = triplets[triplets.apply(isValid, axis=1)]
filtered_triplets.reset_index(drop=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(chocolate, yogurt)",(milk),0.043841,0.400313,0.033403,0.761905,1.903272,0.015853,2.518685,0.49635
1,"(mustard, grill)",(sausage),0.057411,0.248434,0.057411,1.0,4.02521,0.043148,inf,0.797342
2,"(ketchup, grill)",(sausage),0.049582,0.248434,0.049061,0.989474,3.982839,0.036743,71.398747,0.787994
3,"(mustard, ketchup)",(grill),0.002088,0.113257,0.001566,0.75,6.62212,0.001329,3.546973,0.850767
4,"(mustard, ketchup)",(sausage),0.002088,0.248434,0.001566,0.75,3.018908,0.001047,3.006263,0.670153
5,"(butter, wagyu)",(chicken),0.005219,0.420668,0.003653,0.7,1.66402,0.001458,1.931106,0.401139


We see that all of the rules (triplets) have pretty high confidence, thus it is unlikely that there are some valid disjunctions with high confidence

### **Patterns** ###

To find patterns we decided to:
1. First extract all itemsets that contain exactly two items from the all_itemsets DataFrame - representing pairs of items to evaluate for statistical association
2. For each pair of items, calculate:

    a. Observed Co-Occurrence - taken directly from the pair's support value in the dataset

    b. Support of Individual Items - retrieved from the dataset for each item in the pair

    c. Expected Co-Occurrence -  computed as the product of the individual supports of the two items (definition of probability of two independent events)

3. Create contingency table for the pair, representing: Observed Co-Occurrence, Only One Item Present and Neither Item Present.
The values in the table are scaled by the total number of transactions to convert proportions into counts
4. Use the contingency table to perform a chi-squared test of independence
5. Filter Strong Associations:

    a. p-value < 1e-3 - indicates a statistically significant deviation from independence

    b. Observed Co-Occurrence > Expected Co-Occurrence - suggests a positive association between the items.

In [46]:
from scipy.stats import chi2_contingency

all_itemsets = fpgrowth(dummy_df, min_support=1/len(dummy_df), use_colnames=True)
all_pairs = all_itemsets[(all_itemsets['itemsets'].apply(len) == 2)]

results = []
for _, row in all_pairs.iterrows():
    pair = list(row['itemsets'])
    observed = row['support']  # Observed
    support_A = all_itemsets.loc[all_itemsets['itemsets'] == frozenset([pair[0]]), 'support'].values[0]
    support_B = all_itemsets.loc[all_itemsets['itemsets'] == frozenset([pair[1]]), 'support'].values[0]
    expected = support_A * support_B  # Expected

    # Chi-Squared Test
    table = [
        [observed * len(data), (support_A - observed) * len(data)],
        [(support_B - observed) * len(data), (1 - (support_A + support_B - observed)) * len(data)]
    ]
    chi2, p_value, _, _ = chi2_contingency(table)

    results.append({
        'pair': pair,
        'observed': observed,
        'expected': expected,
        'chi2': chi2,
        'p_value': p_value
    })

results_df = pd.DataFrame(results)
strong_associations = results_df[(results_df['p_value'] < 1e-3) & (results_df['observed'] > results_df['expected'])] # if observed probability is higher than expected 
                                                                                                                    # one we can predict that there is a pattern

print("Detected patterns in data:")
strong_associations.reset_index(drop=True)



Detected patterns in data:


Unnamed: 0,pair,observed,expected,chi2,p_value
0,"[chocolate, milk]",0.122651,0.071873,4530.278111,0.0
1,"[banana, sausage]",0.043319,0.041103,11.755121,0.0006067606
2,"[bread, banana]",0.047495,0.041967,72.44884,1.7141870000000002e-17
3,"[orange, banana]",0.06263,0.026855,4234.97024,0.0
4,"[orange, cheese]",0.037578,0.03448,26.095568,3.249291e-07
5,"[orange, pork]",0.035491,0.033294,13.431837,0.0002473898
6,"[orange, yogurt]",0.042276,0.038546,34.957367,3.370039e-09
7,"[orange, apple]",0.063152,0.024653,5257.475737,0.0
8,"[banana, apple]",0.06263,0.025128,4912.451253,0.0
9,"[bread, apple]",0.041754,0.038525,26.441468,2.716457e-07
