套路：
- 准备数据
- 实现算法
- 测试算法

# 任务1：亲和性分析
- 如果一个顾客买了商品X，那么他们可能愿意买商品Y

衡量方法：
- 支持度support := 所有买X的人数

- 置信度confidence := $\frac{所有买X和Y的人数}{所有买X的人数}$

In [1]:
# 引入库
import numpy as np
from operator import itemgetter

In [2]:
# 准备数据

# 创造随机生成的数据　(可跳过)
X = np.zeros((100, 5), dtype='bool')
for i in range(X.shape[0]):
    if np.random.random() < 0.3:
        # A bread winner
        X[i][0] = 1
        if np.random.random() < 0.5:
            # Who likes milk
            X[i][1] = 1
        if np.random.random() < 0.2:
            # Who likes cheese
            X[i][2] = 1
        if np.random.random() < 0.25:
            # Who likes apples
            X[i][3] = 1
        if np.random.random() < 0.5:
            # Who likes bananas
            X[i][4] = 1
    else:
        # Not a bread winner
        if np.random.random() < 0.5:
            # Who likes milk
            X[i][1] = 1
            if np.random.random() < 0.2:
                # Who likes cheese
                X[i][2] = 1
            if np.random.random() < 0.25:
                # Who likes apples
                X[i][3] = 1
            if np.random.random() < 0.5:
                # Who likes bananas
                X[i][4] = 1
        else:
            if np.random.random() < 0.8:
                # Who likes cheese
                X[i][2] = 1
            if np.random.random() < 0.6:
                # Who likes apples
                X[i][3] = 1
            if np.random.random() < 0.7:
                # Who likes bananas
                X[i][4] = 1
    if X[i].sum() == 0:
        X[i][4] = 1  # Must buy something, so gets bananas
np.savetxt("./data/affinity_dataset.txt", X, fmt='%d') # 保存


# 读取数据
dataset_filename = "./data/affinity_dataset.txt"
X = np.loadtxt(dataset_filename) # 加载数据
n_samples, n_features = X.shape
print(X.shape)
print(X[:5])

(100, 5)
[[ 0.  1.  1.  0.  1.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  0.  1.  0.]]


In [3]:
# 文件affinity_dataset.txt是生成的数据，得我们来指定列
features = ["bread", "milk", "cheese", "apples", "bananas"]

num_apple_purchases = 0 # 计数
for sample in X:
    if sample[3] == 1:  # 记录买 Apples　的有多少人
        num_apple_purchases += 1
print("买苹果的有{0}人".format(num_apple_purchases))

rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # 买了苹果
        if sample[4] == 1:# 又买香蕉的
            rule_valid += 1
        else:# 不买香蕉的
            rule_invalid += 1
print("买了苹果又买香蕉的有{0}人".format(rule_valid))
print("买了苹果不买香蕉的有{0}人".format(rule_invalid))

# 计算支持度support和置信度confidence
support = rule_valid  # The Support is the number of times the rule is discovered.
confidence = rule_valid / num_apple_purchases
print("支持度support = {0} 置信度confidence = {1:.3f}.".format(support, confidence))
# Confidence can be thought of as a percentage using the following:
print("置信度confidence的百分比形式为 {0:.1f}%.".format(100 * confidence))

买苹果的有39人
买了苹果又买香蕉的有24人
买了苹果不买香蕉的有15人
支持度support = 24 置信度confidence = 0.615.
置信度confidence的百分比形式为 61.5%.


In [4]:
from collections import defaultdict
# 上面"买了苹果又买香蕉"是一种情况，现在把所有可能的情况都做一遍
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0: continue
        # 先买premise，premise代表一种食物，记做X
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:  
                continue # 跳过买X又买X的情况
            if sample[conclusion] == 1: # 又买了conclusion，conclusion代表一种食物，记做Y
                valid_rules[(premise, conclusion)] += 1 # 买X买Y
            else: 
                invalid_rules[(premise, conclusion)] += 1 # 买X没买Y
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]

In [5]:
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: 买了{0}，又买{1}".format(premise_name, conclusion_name))
    print(" - 置信度Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - 支持度Support: {0}".format(support[(premise, conclusion)]))
    print("")

Rule: 买了milk，又买cheese
 - 置信度Confidence: 0.174
 - 支持度Support: 8

Rule: 买了bread，又买milk
 - 置信度Confidence: 0.535
 - 支持度Support: 23

Rule: 买了apples，又买cheese
 - 置信度Confidence: 0.487
 - 支持度Support: 19

Rule: 买了milk，又买apples
 - 置信度Confidence: 0.326
 - 支持度Support: 15

Rule: 买了bread，又买apples
 - 置信度Confidence: 0.256
 - 支持度Support: 11

Rule: 买了apples，又买bread
 - 置信度Confidence: 0.282
 - 支持度Support: 11

Rule: 买了apples，又买bananas
 - 置信度Confidence: 0.615
 - 支持度Support: 24

Rule: 买了apples，又买milk
 - 置信度Confidence: 0.385
 - 支持度Support: 15

Rule: 买了milk，又买bananas
 - 置信度Confidence: 0.435
 - 支持度Support: 20

Rule: 买了cheese，又买bananas
 - 置信度Confidence: 0.806
 - 支持度Support: 29

Rule: 买了cheese，又买bread
 - 置信度Confidence: 0.111
 - 支持度Support: 4

Rule: 买了cheese，又买apples
 - 置信度Confidence: 0.528
 - 支持度Support: 19

Rule: 买了cheese，又买milk
 - 置信度Confidence: 0.222
 - 支持度Support: 8

Rule: 买了bananas，又买apples
 - 置信度Confidence: 0.393
 - 支持度Support: 24

Rule: 买了bread，又买bananas
 - 置信度Confidence: 0.488
 - 支持度Support: 21

Rule: 买了ba

In [6]:
# 封装一下方便调用
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: 买了{0}，又买{1}".format(premise_name, conclusion_name))
    print(" - 置信度Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - 支持度Support: {0}".format(support[(premise, conclusion)]))
    print("")
    
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: 买了milk，又买apples
 - 置信度Confidence: 0.326
 - 支持度Support: 15



In [7]:
# 按支持度support排序
from pprint import pprint
pprint(list(support.items()))

[((1, 2), 8),
 ((0, 1), 23),
 ((3, 2), 19),
 ((1, 3), 15),
 ((3, 0), 11),
 ((4, 1), 20),
 ((3, 1), 15),
 ((1, 4), 20),
 ((0, 2), 4),
 ((2, 0), 4),
 ((2, 3), 19),
 ((2, 1), 8),
 ((4, 3), 24),
 ((0, 4), 21),
 ((1, 0), 23),
 ((4, 2), 29),
 ((0, 3), 11),
 ((3, 4), 24),
 ((2, 4), 29),
 ((4, 0), 21)]


In [8]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5): # 打印前5个
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: 买了cheese，又买bananas
 - 置信度Confidence: 0.806
 - 支持度Support: 29

Rule #2
Rule: 买了apples，又买bananas
 - 置信度Confidence: 0.615
 - 支持度Support: 24

Rule #3
Rule: 买了bread，又买milk
 - 置信度Confidence: 0.535
 - 支持度Support: 23

Rule #4
Rule: 买了cheese，又买apples
 - 置信度Confidence: 0.528
 - 支持度Support: 19

Rule #5
Rule: 买了milk，又买bread
 - 置信度Confidence: 0.500
 - 支持度Support: 23



# 任务2：Iris植物分类
- 给出某一植物部分特征，预测该植物是什么

特征：
- 萼片长宽sepal width, sepal height
- 花瓣长宽petal width, petal height

算法：
* For each variable
    * For each value of the variable
        * The prediction based on this variable goes the most frequent class
        * Compute the error of this prediction
    * Sum the prediction errors for all values of the variable
* Use the variable with the lowest error

In [9]:
from sklearn.datasets import load_iris
#X, y = np.loadtxt("X_classification.txt"), np.loadtxt("y_classification.txt")
dataset = load_iris()
X = dataset.data
y = dataset.target
print(dataset.DESCR)
n_samples, n_features = X.shape

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [10]:
# Compute the mean for each attribute计算平均值
attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
X_d = np.array(X >= attribute_means, dtype='int')

In [11]:
# 划分训练集和测试集
from sklearn.cross_validation import train_test_split

# Set the random state to the same number to get the same results as in the book
random_state = 14

X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("训练集数据有 {} 条".format(y_train.shape))
print("测试集数据有 {} 条".format(y_test.shape))

训练集数据有 (112,) 条
测试集数据有 (38,) 条




In [12]:
from collections import defaultdict
from operator import itemgetter


def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm
    
    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each column
        is a feature.
    
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such that
        y_true[i] is the class value for sample X[i].
    
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= variable < n_features
        
    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given prediction.
    
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    # 1.一些等下要用的变量（数据的形状如上）
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    values = set(X[:,feature])
    predictors = dict()
    errors = []
    
    # 2.算法（对照上面的算法流程）
    for current_value in values: 
    # For each value of the variable
    
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value) 
        # The prediction based on this variable goes the most frequent class
        
        predictors[current_value] = most_frequent_class
        errors.append(error)
        # Compute the error of this prediction
    
    total_error = sum(errors)
    # Sum the prediction errors for all values of the variable
    
    return predictors, total_error

# Compute what our predictors say each sample is based on its value
#y_predicted = np.array([predictors[sample[feature]] for sample in X])
    

def train_feature_value(X, y_true, feature, value):
    # Create a simple dictionary to count how frequency they give certain predictions
    class_counts = defaultdict(int)
    # Iterate through each sample and count the frequency of each class/value pair
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # Now get the best one by sorting (highest first) and choosing the first item
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of samples that do not classify as the most frequent class
    # *and* have the feature value.
    n_samples = X.shape[1]
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error

In [13]:
# Compute all of the predictors计算所有预测值
all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Now choose the best and save that as "model"
# Sort by error按误差排序
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

# Choose the bset model选最好的模型，也就是误差最小的模型
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

The best model is based on variable 2 and has error 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}


In [14]:
def predict(X_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

In [15]:
y_predicted = predict(X_test, model)
print(y_predicted)
accuracy = np.mean(y_predicted == y_test) * 100
print("在测试集上的准确率 {:.1f}%".format(accuracy))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]
在测试集上的准确率 65.8%
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       0.00      0.00      0.00        13
          2       0.40      1.00      0.57         8

avg / total       0.51      0.66      0.55        38



  'precision', 'predicted', average, warn_for)
