In [1]:
import numpy as np
dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 100 samples and 5 features


In [16]:
print(X[:5])

[[ 1.  0.  1.  0.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 1.  1.  0.  0.  1.]
 [ 0.  1.  0.  1.  0.]
 [ 0.  1.  0.  0.  1.]]


In [3]:
# The names of the features, for your reference.
features = ["bread", "milk", "cheese", "apples", "bananas"]

In our first example, we will compute the Support and Confidence of the rule "If a person buys Apples, they also buy Bananas".

In [2]:
# First, how many rows contain our premise: that a person is buying apples
num_apple_purchases = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        num_apple_purchases += 1
print("{0} people bought Apples".format(num_apple_purchases))

43 people bought Apples


In [4]:
# How many of the cases that a person bought Apples involved the people purchasing Bananas too?
# Record both cases where the rule is valid and is invalid.
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        if sample[4] == 1:
            # This person bought both Apples and Bananas
            rule_valid += 1
        else:
            # This person bought Apples, but not Bananas
            rule_invalid += 1
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

27 cases of the rule being valid were discovered
16 cases of the rule being invalid were discovered


In [6]:
# Now we have all the information needed to compute Support and Confidence
support = rule_valid  # The Support is the number of times the rule is discovered.
confidence = rule_valid / num_apple_purchases
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))
# Confidence can be thought of as a percentage using the following:
print("As a percentage, that is {0:.2f}%.".format(100 * confidence))

The support is 27 and the confidence is 0.628.
As a percentage, that is 62.79%.


In [11]:
from collections import defaultdict
# Now compute for all possible rules
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0: continue
        # Record that the premise was bought in another transaction
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:  # It makes little sense to measure if X -> X.
                continue
            if sample[conclusion] == 1:
                # This person also bought the conclusion item
                valid_rules[(premise, conclusion)] += 1
            else:
                # This person bought the premise, but not the conclusion
                invalid_rules[(premise, conclusion)] += 1
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
valid_rules
confidence

defaultdict(float,
            {(0, 1): 0.4642857142857143,
             (0, 2): 0.17857142857142858,
             (0, 3): 0.32142857142857145,
             (0, 4): 0.5714285714285714,
             (1, 0): 0.25,
             (1, 2): 0.21153846153846154,
             (1, 3): 0.34615384615384615,
             (1, 4): 0.5192307692307693,
             (2, 0): 0.1282051282051282,
             (2, 1): 0.28205128205128205,
             (2, 3): 0.5641025641025641,
             (2, 4): 0.5128205128205128,
             (3, 0): 0.20930232558139536,
             (3, 1): 0.4186046511627907,
             (3, 2): 0.5116279069767442,
             (3, 4): 0.627906976744186,
             (4, 0): 0.2807017543859649,
             (4, 1): 0.47368421052631576,
             (4, 2): 0.3508771929824561,
             (4, 3): 0.47368421052631576})

In [18]:
for premise, conclusion in confidence.items():
    print(premise)
    print(conclusion)

(0, 1)
0.4642857142857143
(1, 2)
0.21153846153846154
(3, 2)
0.5116279069767442
(1, 3)
0.34615384615384615
(3, 0)
0.20930232558139536
(3, 4)
0.627906976744186
(3, 1)
0.4186046511627907
(1, 4)
0.5192307692307693
(2, 4)
0.5128205128205128
(2, 0)
0.1282051282051282
(2, 3)
0.5641025641025641
(2, 1)
0.28205128205128205
(4, 3)
0.47368421052631576
(0, 4)
0.5714285714285714
(1, 0)
0.25
(4, 2)
0.3508771929824561
(0, 3)
0.32142857142857145
(4, 1)
0.47368421052631576
(0, 2)
0.17857142857142858
(4, 0)
0.2807017543859649


In [13]:
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

Rule: If a person buys bread they will also buy milk
 - Confidence: 0.464
 - Support: 13

Rule: If a person buys milk they will also buy cheese
 - Confidence: 0.212
 - Support: 11

Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.512
 - Support: 22

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.346
 - Support: 18

Rule: If a person buys apples they will also buy bread
 - Confidence: 0.209
 - Support: 9

Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.628
 - Support: 27

Rule: If a person buys apples they will also buy milk
 - Confidence: 0.419
 - Support: 18

Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.519
 - Support: 27

Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.513
 - Support: 20

Rule: If a person buys cheese they will also buy bread
 - Confidence: 0.128
 - Support: 5

Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.564
 - Su

In [26]:
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

In [24]:
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.226
 - Support: 12



In [19]:
support.items()

dict_items([((0, 1), 13), ((1, 2), 11), ((3, 2), 22), ((1, 3), 18), ((0, 3), 9), ((3, 0), 9), ((4, 1), 27), ((3, 1), 18), ((1, 4), 27), ((0, 2), 5), ((2, 0), 5), ((2, 3), 22), ((2, 1), 11), ((4, 3), 27), ((0, 4), 16), ((4, 2), 20), ((1, 0), 13), ((3, 4), 27), ((2, 4), 20), ((4, 0), 16)])

In [25]:
# Sort by support
from pprint import pprint
pprint(list(support.items()))

[((0, 1), 12),
 ((1, 2), 14),
 ((3, 2), 26),
 ((1, 3), 12),
 ((0, 3), 4),
 ((3, 0), 4),
 ((4, 1), 30),
 ((3, 1), 12),
 ((1, 4), 30),
 ((2, 4), 30),
 ((2, 0), 5),
 ((2, 3), 26),
 ((2, 1), 14),
 ((4, 3), 27),
 ((0, 4), 15),
 ((4, 2), 30),
 ((1, 0), 12),
 ((3, 4), 27),
 ((0, 2), 5),
 ((4, 0), 15)]


In [24]:
a=[1,5,2,3]
b = sorted(a)
b

[1, 2, 3, 5]

In [21]:
from operator import itemgetter
sorted_support = sorted(support.items(), key=itemgetter(0), reverse=True)
sorted_support

[((4, 3), 27),
 ((4, 2), 20),
 ((4, 1), 27),
 ((4, 0), 16),
 ((3, 4), 27),
 ((3, 2), 22),
 ((3, 1), 18),
 ((3, 0), 9),
 ((2, 4), 20),
 ((2, 3), 22),
 ((2, 1), 11),
 ((2, 0), 5),
 ((1, 4), 27),
 ((1, 3), 18),
 ((1, 2), 11),
 ((1, 0), 13),
 ((0, 4), 16),
 ((0, 3), 9),
 ((0, 2), 5),
 ((0, 1), 13)]

In [27]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.474
 - Support: 27

Rule #2
Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.351
 - Support: 20

Rule #3
Rule: If a person buys bananas they will also buy milk
 - Confidence: 0.474
 - Support: 27

Rule #4
Rule: If a person buys bananas they will also buy bread
 - Confidence: 0.281
 - Support: 16

Rule #5
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.628
 - Support: 27



In [28]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

In [29]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.730
 - Support: 27

Rule #2
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.703
 - Support: 26

Rule #3
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.652
 - Support: 30

Rule #4
Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.566
 - Support: 30

Rule #5
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.565
 - Support: 26

