In [1]:
from random import choice as random_choice
from pandas import DataFrame as Table
from functools import reduce
from collections import Counter as CounterDictionary
from pandas import read_csv

In [2]:
w = 'ABCDEF'
ITEMS = set([ w[i] for i in range(len(w)) ])

In [3]:
ITEMS

{'A', 'B', 'C', 'D', 'E', 'F'}

In [4]:
N = 20
MIN_SAMPLES = 3
MAX_SAMPLES = 10

In [5]:
TD_AS_LIST = [ set([ random_choice(tuple(ITEMS)) for _ in range(random_choice(range(MIN_SAMPLES, MAX_SAMPLES)))]) for _ in range(N)  ]
TD_AS_DICTIONARY = { frozenset(s): len(list(filter(lambda x: x == s,  TD_AS_LIST)))  for s in TD_AS_LIST  }

In [6]:
TD_AS_LIST

[{'C', 'D', 'F'},
 {'A', 'B', 'C', 'E', 'F'},
 {'A', 'F'},
 {'B', 'C', 'D', 'E', 'F'},
 {'B', 'F'},
 {'A', 'C', 'D', 'E', 'F'},
 {'B', 'C', 'D', 'E', 'F'},
 {'B', 'C', 'E', 'F'},
 {'A', 'B', 'C', 'E'},
 {'A', 'B', 'C', 'D', 'E'},
 {'A', 'C', 'E', 'F'},
 {'D', 'E', 'F'},
 {'A', 'D', 'E', 'F'},
 {'B', 'C', 'D', 'E', 'F'},
 {'A', 'B', 'D', 'E'},
 {'B', 'C', 'D', 'E'},
 {'B', 'C'},
 {'A', 'D'},
 {'A', 'D', 'E', 'F'},
 {'A', 'C', 'D', 'F'}]

In [7]:
TD_AS_DICTIONARY

{frozenset({'C', 'D', 'F'}): 1,
 frozenset({'A', 'B', 'C', 'E', 'F'}): 1,
 frozenset({'A', 'F'}): 1,
 frozenset({'B', 'C', 'D', 'E', 'F'}): 3,
 frozenset({'B', 'F'}): 1,
 frozenset({'A', 'C', 'D', 'E', 'F'}): 1,
 frozenset({'B', 'C', 'E', 'F'}): 1,
 frozenset({'A', 'B', 'C', 'E'}): 1,
 frozenset({'A', 'B', 'C', 'D', 'E'}): 1,
 frozenset({'A', 'C', 'E', 'F'}): 1,
 frozenset({'D', 'E', 'F'}): 1,
 frozenset({'A', 'D', 'E', 'F'}): 2,
 frozenset({'A', 'B', 'D', 'E'}): 1,
 frozenset({'B', 'C', 'D', 'E'}): 1,
 frozenset({'B', 'C'}): 1,
 frozenset({'A', 'D'}): 1,
 frozenset({'A', 'C', 'D', 'F'}): 1}

In [8]:
def transaction_dataset(s):
    return TD_AS_DICTIONARY[frozenset(s)] if frozenset(s) in TD_AS_DICTIONARY else 0

In [9]:
ITEM_LIST = list(ITEMS)
ITEM_LIST.sort()

In [10]:
TD_AS_TABLE = Table([{ item: item in transaction  for item in list(ITEM_LIST)    }  for transaction in TD_AS_LIST])

In [11]:
TD_AS_TABLE

Unnamed: 0,A,B,C,D,E,F
0,False,False,True,True,False,True
1,True,True,True,False,True,True
2,True,False,False,False,False,True
3,False,True,True,True,True,True
4,False,True,False,False,False,True
5,True,False,True,True,True,True
6,False,True,True,True,True,True
7,False,True,True,False,True,True
8,True,True,True,False,True,False
9,True,True,True,True,True,False


In [12]:
TD_AS_TABLE['A'].mean()

0.55

In [13]:
(TD_AS_TABLE['A'] & TD_AS_TABLE['B']).mean()

0.2

In [14]:
def support(td, tr):
    return reduce(lambda x, y: x & y, [td[it] for it in tr]).mean()

In [15]:
support(TD_AS_TABLE, ['A','B', 'C'])

0.15

In [16]:
EPSILON= 0.25

In [17]:
def next_candidates(itemsets):
    if len(itemsets) == 0: return set()
    r = set()
    k = len(list(itemsets)[0])
    for its in itemsets:
        nns = set(filter(lambda x: len(its.difference(x)) == 1, itemsets)) # nns: nearest neighbours
        nn_diff = [list(nn.difference(its))[0] for nn in nns]
        nn_diff_count = CounterDictionary(nn_diff).items()
        extensions = frozenset([frozenset([pair[0]]) for pair in list(filter(lambda pair: pair[1] == k, nn_diff_count))])
        for singleton in extensions: r.add( frozenset(its.union(singleton)))
    return r    

In [18]:
def apriori(td, epsilon):
    fi = set()
    items = set(td.columns)
    candidates = set([frozenset([it]) for it in items])
    condition = True
    while(condition):
        fi_k = set( filter(lambda x: support(td, x) >= epsilon , candidates) )
        fi = fi.union(fi_k)
        candidates = next_candidates(fi_k)
        condition = len(candidates) > 0
    return fi    

In [19]:
apriori(TD_AS_TABLE, EPSILON)

{frozenset({'C', 'E'}),
 frozenset({'C', 'D', 'E'}),
 frozenset({'B', 'C', 'F'}),
 frozenset({'A', 'D'}),
 frozenset({'E'}),
 frozenset({'B'}),
 frozenset({'C', 'F'}),
 frozenset({'A'}),
 frozenset({'B', 'E'}),
 frozenset({'B', 'C', 'D', 'E'}),
 frozenset({'B', 'E', 'F'}),
 frozenset({'A', 'F'}),
 frozenset({'C', 'D', 'F'}),
 frozenset({'B', 'C', 'E', 'F'}),
 frozenset({'D'}),
 frozenset({'A', 'D', 'E'}),
 frozenset({'D', 'F'}),
 frozenset({'B', 'D', 'E'}),
 frozenset({'C'}),
 frozenset({'B', 'C', 'E'}),
 frozenset({'D', 'E', 'F'}),
 frozenset({'C', 'D'}),
 frozenset({'A', 'E'}),
 frozenset({'B', 'D'}),
 frozenset({'B', 'C', 'D'}),
 frozenset({'A', 'C', 'E'}),
 frozenset({'D', 'E'}),
 frozenset({'A', 'C'}),
 frozenset({'F'}),
 frozenset({'E', 'F'}),
 frozenset({'A', 'E', 'F'}),
 frozenset({'C', 'E', 'F'}),
 frozenset({'B', 'C'}),
 frozenset({'B', 'F'})}

In [20]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data

--2023-03-27 10:12:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51867 (51K) [application/x-httpd-php]
Saving to: ‘car.data.1’


2023-03-27 10:12:29 (145 KB/s) - ‘car.data.1’ saved [51867/51867]



In [21]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.names

--2023-03-27 10:12:29--  https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.names
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3097 (3,0K) [application/x-httpd-php]
Saving to: ‘car.names.1’


2023-03-27 10:12:30 (29,2 MB/s) - ‘car.names.1’ saved [3097/3097]



In [22]:
! cat car.names

1. Title: Car Evaluation Database

2. Sources:
   (a) Creator: Marko Bohanec
   (b) Donors: Marko Bohanec   (marko.bohanec@ijs.si)
               Blaz Zupan      (blaz.zupan@ijs.si)
   (c) Date: June, 1997

3. Past Usage:

   The hierarchical decision model, from which this dataset is
   derived, was first presented in 

   M. Bohanec and V. Rajkovic: Knowledge acquisition and explanation for
   multi-attribute decision making. In 8th Intl Workshop on Expert
   Systems and their Applications, Avignon, France. pages 59-78, 1988.

   Within machine-learning, this dataset was used for the evaluation
   of HINT (Hierarchy INduction Tool), which was proved to be able to
   completely reconstruct the original hierarchical model. This,
   together with a comparison with C4.5, is presented in

   B. Zupan, M. Bohanec, I. Bratko, J. Demsar: Machine learning by
   function decomposition. ICML-97, Nashville, TN. 1997 (to appear)

4. Relevant Information Paragraph:

   Car Evaluation Database was 

In [23]:

t = read_csv("car.data", sep=',',names=['bprice','mprice','doors','persons', 'lug_boot', 'safety', 'score' ])

In [24]:
import pandas as pd

In [25]:
pd.get_dummies(t)

Unnamed: 0,bprice_high,bprice_low,bprice_med,bprice_vhigh,mprice_high,mprice_low,mprice_med,mprice_vhigh,doors_2,doors_3,...,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,score_acc,score_good,score_unacc,score_vgood
0,0,0,0,1,0,0,0,1,1,0,...,0,0,1,0,1,0,0,0,1,0
1,0,0,0,1,0,0,0,1,1,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,0,1,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,1,0
3,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
1724,0,1,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
1725,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1726,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [26]:
ts = pd.get_dummies(t)
apriori(ts,0.05)

{frozenset({'doors_4', 'persons_more'}),
 frozenset({'lug_boot_big', 'persons_2'}),
 frozenset({'bprice_med', 'safety_low', 'score_unacc'}),
 frozenset({'bprice_med', 'doors_5more'}),
 frozenset({'doors_3', 'lug_boot_small', 'score_unacc'}),
 frozenset({'bprice_vhigh', 'mprice_vhigh', 'score_unacc'}),
 frozenset({'mprice_low', 'score_acc'}),
 frozenset({'bprice_med', 'safety_med'}),
 frozenset({'lug_boot_big', 'persons_4', 'score_unacc'}),
 frozenset({'mprice_low'}),
 frozenset({'lug_boot_big', 'mprice_high'}),
 frozenset({'doors_2', 'persons_2', 'score_unacc'}),
 frozenset({'bprice_high', 'safety_low'}),
 frozenset({'bprice_low', 'persons_2', 'score_unacc'}),
 frozenset({'doors_5more', 'lug_boot_med'}),
 frozenset({'lug_boot_med', 'score_unacc'}),
 frozenset({'bprice_vhigh', 'lug_boot_med', 'score_unacc'}),
 frozenset({'bprice_high', 'mprice_vhigh', 'score_unacc'}),
 frozenset({'doors_5more', 'score_unacc'}),
 frozenset({'persons_2'}),
 frozenset({'persons_2', 'score_unacc'}),
 frozen

In [27]:
def confidence(td,x,y):
    return support(td, x.union(y)) / support(td, x)
def lift(td,x,y):
    return support(td, x.union(y)) / (support(td, x)*support(td, y))

In [28]:
x, y = frozenset({'bprice_high', 'safety_low'}), frozenset({ 'score_unacc'})

In [29]:
confidence(ts, x, y), lift(ts, x, y)

(1.0, 1.428099173553719)