In [1]:
import numpy as np

In [2]:
dataset_filename = 'affinity_dataset.txt'

In [3]:
# Parse the text file named by dataset_filename into a NumPy array
# (np.loadtxt defaults: whitespace-separated columns, float values).
X = np.loadtxt(dataset_filename)

In [4]:
X

array([[ 0.,  0.,  1.,  1.,  1.],
       [ 1.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  1.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 1.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 1.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  

In [5]:
num_apple_purchases = 0

In [6]:
# Count the rows whose column 3 equals 1 (the "apple" column, going by the
# variable name). The total is recomputed from scratch rather than
# incremented, so re-running this cell cannot inflate the count.
num_apple_purchases = int(np.count_nonzero(X[:, 3] == 1))

In [7]:
num_apple_purchases

36

In [8]:
num_banana_purchases = 0

In [9]:
# Count the rows whose column 4 equals 1 (the "banana" column, going by the
# variable name). The total is recomputed from scratch rather than
# incremented, so re-running this cell cannot inflate the count.
num_banana_purchases = int(np.count_nonzero(X[:, 4] == 1))

In [10]:
num_banana_purchases

59

In [11]:
from collections import defaultdict

In [12]:
valid_rules = defaultdict(int)

In [13]:
invalid_rules = defaultdict(int)
num_occurances = defaultdict(int)

In [14]:
# Start from empty counters so that re-running this cell recounts from
# scratch instead of adding to totals left behind by an earlier execution.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurances = defaultdict(int)

# NOTE(review): only feature indices 0-3 take part in rule counting, while the
# array displayed earlier has five columns and index 4 is counted separately
# above -- confirm that leaving the last column out is intended.
rule_features = range(4)

for sample in X:
    for premise in rule_features:
        if sample[premise] == 0:
            continue  # premise absent: this sample says nothing about its rules
        num_occurances[premise] += 1
        for conclusion in rule_features:
            if premise == conclusion:
                continue  # a feature implying itself is not a rule
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

In [15]:
valid_rules

defaultdict(int,
            {(0, 1): 14,
             (0, 2): 4,
             (0, 3): 5,
             (1, 0): 14,
             (1, 2): 7,
             (1, 3): 9,
             (2, 0): 4,
             (2, 1): 7,
             (2, 3): 25,
             (3, 0): 5,
             (3, 1): 9,
             (3, 2): 25})

In [16]:
confidence = defaultdict(float)

In [17]:
# Confidence of a rule = (times premise and conclusion both held)
#                        / (times the premise held at all).
for rule, times_valid in valid_rules.items():
    premise, conclusion = rule
    confidence[rule] = times_valid / num_occurances[premise]

In [18]:
confidence

defaultdict(float,
            {(0, 1): 0.5185185185185185,
             (0, 2): 0.14814814814814814,
             (0, 3): 0.18518518518518517,
             (1, 0): 0.30434782608695654,
             (1, 2): 0.15217391304347827,
             (1, 3): 0.1956521739130435,
             (2, 0): 0.0975609756097561,
             (2, 1): 0.17073170731707318,
             (2, 3): 0.6097560975609756,
             (3, 0): 0.1388888888888889,
             (3, 1): 0.25,
             (3, 2): 0.6944444444444444})

In [19]:
# The support of a rule is taken to be its count in valid_rules.
# This is an alias, not a copy: both names refer to the same dict object.
support = valid_rules

In [20]:
support

defaultdict(int,
            {(0, 1): 14,
             (0, 2): 4,
             (0, 3): 5,
             (1, 0): 14,
             (1, 2): 7,
             (1, 3): 9,
             (2, 0): 4,
             (2, 1): 7,
             (2, 3): 25,
             (3, 0): 5,
             (3, 1): 9,
             (3, 2): 25})

In [21]:
from operator import itemgetter

In [22]:
# List the (rule, support) pairs from highest to lowest support.
sorted(support.items(), key=itemgetter(1), reverse=True)

[((2, 3), 25),
 ((3, 2), 25),
 ((0, 1), 14),
 ((1, 0), 14),
 ((1, 3), 9),
 ((3, 1), 9),
 ((1, 2), 7),
 ((2, 1), 7),
 ((0, 3), 5),
 ((3, 0), 5),
 ((0, 2), 4),
 ((2, 0), 4)]

In [23]:
from sklearn.datasets import load_iris

In [24]:
dataset = load_iris()

In [25]:
# NOTE: this rebinds X, which until now held the array loaded from
# dataset_filename; from here on X refers to the iris data.
X = dataset.data
y = dataset.target

In [26]:
attribute_means = X.mean(axis=0)

In [27]:
attribute_means

array([ 5.84333333,  3.054     ,  3.75866667,  1.19866667])

In [28]:
# Discretise every feature: 1 where the value is at or above that feature's
# mean, 0 otherwise.
X_d = (X >= attribute_means).astype('int')

In [29]:
def train_feature_value(X, y_true, feature_index, value):
    """Find the majority class among samples whose feature equals ``value``.

    Parameters
    ----------
    X : iterable of samples
        Each sample is indexed with ``feature_index``.
    y_true : iterable
        Class label of each sample, in the same order as ``X``.
    feature_index : int
        Position of the feature to inspect in every sample.
    value
        Feature value selecting which samples are counted.

    Returns
    -------
    tuple
        ``(most_frequent_class, error)``: the class seen most often among the
        selected samples (on a tie, the class encountered first wins) and the
        number of selected samples that belong to any other class.

    Raises
    ------
    IndexError
        If no sample has ``value`` at ``feature_index``.
    """
    class_counts = defaultdict(int)
    for row, label in zip(X, y_true):
        if row[feature_index] != value:
            continue
        class_counts[label] += 1

    # A stable sort keeps first-seen order among classes with equal counts.
    ranked_classes = sorted(class_counts, key=class_counts.get, reverse=True)
    most_frequent_class = ranked_classes[0]

    # Every selected sample outside the winning class is a misprediction.
    error = sum(class_counts.values()) - class_counts[most_frequent_class]
    return most_frequent_class, error

In [30]:
def train_on_feature(X, y_true, feature_index):
    """Build a value -> class lookup for one feature and total its error.

    Parameters
    ----------
    X : 2-D array
        Indexed as ``X[:, feature_index]`` to collect the feature's values.
    y_true : iterable
        Class label of each row of ``X``.
    feature_index : int
        Column of ``X`` to train on.

    Returns
    -------
    tuple
        ``(predictors, total_error)``: a dict mapping each distinct value of
        the feature to the class returned for it by ``train_feature_value``,
        and the sum of the errors reported for those values.
    """
    predictors = {}
    total_error = 0
    for current_value in set(X[:, feature_index]):
        best_class, n_wrong = train_feature_value(X, y_true, feature_index, current_value)
        predictors[current_value] = best_class
        total_error += n_wrong
    return predictors, total_error

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
Xd_train, Xd_test, y_train, y_test = train_test_split(X_d, y, random_state=14)

In [33]:
all_predictors = {}
errors = {}

In [34]:
# Train one value -> class lookup per column of the training data and record
# each column's total error, keyed by column index.
for feature_index in range(Xd_train.shape[1]):
    predictors, total_error = train_on_feature(Xd_train, y_train, feature_index)
    all_predictors[feature_index] = predictors
    errors[feature_index] = total_error

In [35]:
# Pick the feature with the lowest total error. min() returns the first
# minimal entry, the same one a stable ascending sort would put first,
# without sorting the whole dict.
best_feature, best_error = min(errors.items(), key=itemgetter(1))

In [36]:
model = {'feature': best_feature, 'predictor': all_predictors[best_feature]}

In [37]:
def predict(X_test, model):
    """Predict a class for every sample using a single-feature lookup model.

    Parameters
    ----------
    X_test : iterable of samples
        Each sample is indexed with the model's feature index.
    model : dict
        ``model['feature']`` is the index of the feature to read and
        ``model['predictor']`` maps that feature's (integer) value to a class.

    Returns
    -------
    numpy.ndarray
        One predicted class per sample, in input order.

    Raises
    ------
    KeyError
        If a sample's feature value has no entry in the predictor mapping.
    """
    feature_index = model['feature']
    class_for_value = model['predictor']

    predictions = []
    for row in X_test:
        # The feature value is cast to int before it is used as the lookup key.
        predictions.append(class_for_value[int(row[feature_index])])
    return np.array(predictions)

In [38]:
y_predicted = predict(Xd_test, model)

In [39]:
np.mean(y_predicted == y_test) * 100

65.789473684210535