In [11]:
import numpy as np
# Load the dataset from a text file using numpy's default text parser.
dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
# The array is two-dimensional: rows are samples, columns are features.
n_samples, n_features = X.shape
dataset_summary = "This dataset has {0} samples and {1} features".format(n_samples, n_features)
print(dataset_summary)

This dataset has 100 samples and 5 features


In [12]:
# Peek at the first five rows rather than dumping the whole array
first_rows = X[:5]
print(first_rows)

[[ 0.  0.  1.  1.  1.]
 [ 1.  1.  0.  1.  0.]
 [ 1.  0.  1.  1.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  0.  0.  1.]]


In [13]:
# Human-readable feature names, listed in column order
features = [
    "bread",
    "milk",
    "cheese",
    "apples",
    "bananas",
]

Support and Confidence

Support is the number of times that a rule occurs in a dataset, which is computed by simply counting the number of samples that the rule is valid for.

While support measures how often a rule occurs, confidence measures how accurate the rule is when it can be applied. It is computed as the fraction of samples in which the rule holds among those samples where the premise applies.

In [14]:
# Rule under test: if a person buys apples, they also buy bananas.
# First, count the premise: how many samples include apples
# (column 3, the position of "apples" in the feature names).
num_apple_purchases = 0
for sample in X:
    if sample[3]:
        num_apple_purchases += 1
# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3, and matches the print(...) calls in the other cells.
print("{0} people bought apples.".format(num_apple_purchases))

36 people bought apples.


In [15]:
# Among the samples that contain apples (column 3), count how often bananas
# (column 4) were bought too (rule valid) and how often they were not
# (rule invalid).
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3]:
        if sample[4]:
            rule_valid += 1
        else:
            rule_invalid += 1

# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3, and matches the print(...) calls in the other cells.
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

21 cases of the rule being valid were discovered
15 cases of the rule being invalid were discovered


In [21]:
# Now we have everything needed for the rule's two statistics.
# Support: the number of samples for which the rule held.
support = rule_valid
# Confidence: the share of apple purchases for which the rule held.
# The float() conversions keep this a true division on Python 2 as well.
confidence = float(rule_valid) / float(num_apple_purchases)

# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3, and matches the other print(...) calls in this cell.
print("confidence {0}".format(confidence))
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))

print("As a percentage, that is {0:.1f}%".format(100*confidence))

confidence 0.583333333333
The support is 21 and the confidence is 0.583.
As a percentage, that is 58.3%


In [26]:
from collections import defaultdict
# Tally every ordered (premise, conclusion) pair of distinct features.
valid_rules = defaultdict(int)     # premise present and conclusion value equal to 1
invalid_rules = defaultdict(int)   # premise present and conclusion value not equal to 1
num_occurences = defaultdict(int)  # number of samples in which each premise is present

for sample in X:
    for premise in range(n_features):
        if sample[premise] != 0:
            # The premise was bought in this transaction
            num_occurences[premise] += 1
            for conclusion in range(n_features):
                if conclusion != premise:
                    rule = (premise, conclusion)
                    if sample[conclusion] == 1:
                        valid_rules[rule] += 1
                    else:
                        invalid_rules[rule] += 1

# Support is the raw count of samples in which each rule held
support = valid_rules

# Confidence = times the rule held / times its premise occurred
confidence = defaultdict(float)
for rule, times_valid in valid_rules.items():
    confidence[rule] = float(times_valid) / num_occurences[rule[0]]

In [27]:
# Report every rule for which a confidence value was computed
for rule in confidence:
    premise, conclusion = rule
    premise_name, conclusion_name = features[premise], features[conclusion]

    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print("- Confidence: {0:.3f}".format(confidence[rule]))
    print("- Support: {0}".format(valid_rules[rule]))
    print("")

Rule: If a person buys bread they will also buy milk
- Confidence: 0.519
- Support: 14

Rule: If a person buys milk they will also buy cheese
- Confidence: 0.152
- Support: 7

Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support: 25

Rule: If a person buys milk they will also buy apples
- Confidence: 0.196
- Support: 9

Rule: If a person buys bread they will also buy apples
- Confidence: 0.185
- Support: 5

Rule: If a person buys apples they will also buy bread
- Confidence: 0.139
- Support: 5

Rule: If a person buys apples they will also buy bananas
- Confidence: 0.583
- Support: 21

Rule: If a person buys apples they will also buy milk
- Confidence: 0.250
- Support: 9

Rule: If a person buys milk they will also buy bananas
- Confidence: 0.413
- Support: 19

Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support: 27

Rule: If a person buys cheese they will also buy bread
- Confidence: 0.098
- Support: 4

Rule: If a perso

In [28]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print a readable summary of one (premise, conclusion) rule.

    Parameters:
        premise: index into ``features`` of the item forming the premise.
        conclusion: index into ``features`` of the item forming the conclusion.
        support: mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: mapping from ``(premise, conclusion)`` to the rule's
            confidence, printed with three decimal places.
        features: sequence of feature names, indexed by feature number.

    Returns:
        None. The summary is written to standard output, followed by a
        blank line.
    """
    premise_name = features[premise]
    conclusion_name = features[conclusion]

    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print("- Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    # Read from the ``support`` argument rather than a notebook-level global,
    # so the function reports whatever mapping the caller passes in.
    print("- Support: {0}".format(support[(premise, conclusion)]))
    print("")

In [29]:
# Spot-check a single rule: feature index 1 as premise, index 3 as conclusion
premise, conclusion = 1, 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
- Confidence: 0.196
- Support: 9



In [30]:
# Inspect the support counts as (rule, count) pairs; nothing is sorted yet
from pprint import pprint
support_items = list(support.items())
pprint(support_items)

[((0, 1), 14),
 ((1, 2), 7),
 ((3, 2), 25),
 ((1, 3), 9),
 ((3, 0), 5),
 ((4, 1), 19),
 ((3, 1), 9),
 ((1, 4), 19),
 ((0, 2), 4),
 ((2, 0), 4),
 ((2, 3), 25),
 ((2, 1), 7),
 ((4, 3), 21),
 ((0, 4), 17),
 ((1, 0), 14),
 ((4, 2), 27),
 ((0, 3), 5),
 ((3, 4), 21),
 ((2, 4), 27),
 ((4, 0), 17)]


In [37]:
from operator import itemgetter
# Rank the rules from highest to lowest support. itemgetter(1) selects the
# count from each ((premise, conclusion), count) pair.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3.
print(sorted_support)

[((4, 2), 27), ((2, 4), 27), ((3, 2), 25), ((2, 3), 25), ((4, 3), 21), ((3, 4), 21), ((4, 1), 19), ((1, 4), 19), ((0, 4), 17), ((4, 0), 17), ((0, 1), 14), ((1, 0), 14), ((1, 3), 9), ((3, 1), 9), ((1, 2), 7), ((2, 1), 7), ((3, 0), 5), ((0, 3), 5), ((0, 2), 4), ((2, 0), 4)]


In [33]:
# Show the five best-supported rules. Slicing, rather than indexing with
# range(5), avoids an IndexError when fewer than five rules exist.
for index, entry in enumerate(sorted_support[:5]):
    print("Rule #{0}".format(index + 1))
    # Each entry is ((premise, conclusion), value); only the rule key is needed
    (premise, conclusion) = entry[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys bananas they will also buy cheese
- Confidence: 0.458
- Support: 27

Rule #2
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support: 27

Rule #3
Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support: 25

Rule #4
Rule: If a person buys cheese they will also buy apples
- Confidence: 0.610
- Support: 25

Rule #5
Rule: If a person buys bananas they will also buy apples
- Confidence: 0.356
- Support: 21



In [38]:
# Order the rules by confidence, highest first
sorted_confidence = sorted(
    confidence.items(),
    key=lambda rule_and_confidence: rule_and_confidence[1],
    reverse=True,
)

In [39]:
# Show the five most confident rules. Slicing, rather than indexing with
# range(5), avoids an IndexError when fewer than five rules exist.
for index, entry in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    # Each entry is ((premise, conclusion), value); only the rule key is needed
    premise, conclusion = entry[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy cheese
- Confidence: 0.694
- Support: 25

Rule #2
Rule: If a person buys cheese they will also buy bananas
- Confidence: 0.659
- Support: 27

Rule #3
Rule: If a person buys bread they will also buy bananas
- Confidence: 0.630
- Support: 17

Rule #4
Rule: If a person buys cheese they will also buy apples
- Confidence: 0.610
- Support: 25

Rule #5
Rule: If a person buys apples they will also buy bananas
- Confidence: 0.583
- Support: 21



In [40]:
# Example of using itemgetter() to retrieve specific fields from a tuple record
inventory = [('apple', 3), ('banana', 2), ('pear', 5), ('orange', 1)]
getcount = itemgetter(1)
# On Python 3 a bare map() displays as a lazy map object, so build the list
# explicitly; the result is the same list on Python 2.
counts = [getcount(record) for record in inventory]
counts

[3, 2, 5, 1]

In [41]:
# Rank the inventory records by their count field, largest first
sorted(
    inventory,
    key=getcount,
    reverse=True,
)

[('pear', 5), ('apple', 3), ('banana', 2), ('orange', 1)]