In [None]:
#感知机实现
import numpy as np
# Toy training set for the perceptron: three 2-D points, one per row.
X = np.array([
    [1, 1],
    [3, 3],
    [4, 3],
])
# Class label (-1 or +1) of each row of X, in the same order.
y = [-1, 1, 1]
class Perceptron(object):
    """Binary perceptron classifier trained with the primal-form update rule.

    A sample is predicted as ``+1`` when ``x . w + b >= 0`` and ``-1``
    otherwise.  Training repeatedly picks the first misclassified sample
    ``(x, y)`` and applies ``w += lr * y * x`` and ``b += lr * y``.
    """

    def __init__(self,learninig_rate=1):
        """Create an untrained model.

        learninig_rate : step size applied to every update.  The misspelled
            keyword is kept on purpose so existing callers keep working.
        """
        self.learning_rate=learninig_rate
        # Start with a 2x1 zero column; fit() re-sizes it when the training
        # data has a different number of features.
        self.w=np.array([0,0]).reshape((-1,1))
        self.b=0

    def sign(self,x):
        """Return -1 for a negative score and +1 otherwise (zero maps to +1)."""
        return -1 if x <0 else +1

    def calculate(self,X):
        """Predict a label for every row of X.

        X : array-like [n_samples, n_features]
        returns : 1-D array of -1 / +1 values, one per sample
        """
        yH=np.matmul(X,self.w)+self.b
        # yH has shape [n_samples, 1], so each slice along axis 1 is a
        # one-element array that sign() can evaluate.
        return np.apply_along_axis(self.sign,1,yH)

    def get_wrong(self,X,yH,Y):
        """Return the first misclassified sample, or None when all are correct.

        X  : samples, iterated row by row
        yH : predicted labels
        Y  : true labels
        returns : ``{'x': sample, 'y': true_label}`` or None
        """
        for x,yh,label in zip(X,yH,Y):
            if yh!=label:
                return {'x':x,'y':label}
        return None

    def fit(self,X,y,max_iter=None):
        """Train on (X, y), printing every correction that is applied.

        X : array-like [n_samples, n_features]
        y : sequence of -1 / +1 labels, one per sample
        max_iter : optional cap on the number of updates.  The default
            (None) keeps looping until no sample is misclassified, which
            never terminates when the data is not linearly separable.

        Raises ValueError when X is not two-dimensional.
        """
        # Rows must be ndarrays: with plain lists ``x * -1`` yields an empty
        # list instead of a negated vector.
        X=np.asarray(X)
        if X.ndim!=2:
            raise ValueError("X must be 2-dimensional [n_samples, n_features]")
        n_features=X.shape[1]
        if self.w.shape[0]!=n_features:
            # The constructor assumed 2 features; restart from zeros with
            # the width of the actual training data.
            self.w=np.array([0]*n_features).reshape((-1,1))
        updates=0
        while max_iter is None or updates<max_iter:
            yH=self.calculate(X)
            wrong=self.get_wrong(X,yH,y)
            print(wrong)
            if wrong is None:
                break
            step=self.learning_rate*wrong['y']
            self.w = self.w + (wrong['x'] * step).reshape((-1, 1))
            self.b = self.b + step
            print(f"update w to {self.w} update b to {self.b}")
            updates+=1
# Train a perceptron on the toy data defined above; fit() prints each
# misclassified sample it finds and the weights/bias after every update.
per=Perceptron()
per.fit(X,y)

In [None]:
#KNN源码实现
'''
k近邻（kNN）算法的工作机制比较简单，根据某种距离测度找出距离给定待测样本距离最小的k个训练样本，根据k个训练样本进行预测。
分类问题：k个点中出现频率最高的类别作为待测样本的类别
回归问题：通常以k个训练样本的平均值作为待测样本的预测值
kNN模型三要素：距离测度、k值的选择、分类或回归决策方式
'''
import numpy as np
class KNNClassfier(object):
    """Brute-force k-nearest-neighbours classifier.

    Every query is compared with all stored training samples; the label that
    occurs most often among the k closest samples wins.  When several labels
    tie, the one whose first occurrence is closest to the query is chosen.
    """

    def __init__(self, k=5, distance='euc'):
        """
        k        : number of neighbours that vote
        distance : distance measure; only 'euc' (Euclidean) is implemented
        """
        self.k = k
        self.distance = distance
        self.x = None
        self.y = None

    def fit(self,X, Y):
        '''
        Store the training data (no computation happens here).

        X : array-like [n_samples,shape]
        Y : array-like [n_samples,1] or [n_samples]
        '''
        self.x = np.asarray(X)
        self.y = np.asarray(Y)

    def predict(self,X_test):
        '''
        Predict a label for every row of X_test.

        X_test : array-like [n_samples,shape]
        output : float array [n_samples,1]

        Raises RuntimeError when called before fit(), and ValueError for an
        unsupported distance, a k below 1, or an empty training set.
        '''
        if self.x is None or self.y is None:
            raise RuntimeError("fit() must be called before predict()")
        if self.distance != 'euc':
            # Previously an unknown metric left the distance list empty and
            # surfaced later as an unrelated IndexError.
            raise ValueError("unsupported distance %r; only 'euc' is implemented"
                             % (self.distance,))

        train_x = np.asarray(self.x)
        train_y = np.asarray(self.y).ravel()
        X_test = np.asarray(X_test)

        # Never ask for more neighbours than there are training samples.
        k = min(self.k, train_x.shape[0])
        if k < 1:
            raise ValueError("need k >= 1 and at least one training sample")

        output = np.zeros((X_test.shape[0],1))
        for i in range(X_test.shape[0]):
            # Distances from this query to all training samples at once;
            # each sample is flattened so the norm runs over all its features.
            diffs = (train_x - X_test[i]).reshape(train_x.shape[0], -1)
            dis = np.linalg.norm(diffs, axis=1)
            # A stable sort keeps equally distant samples in training order.
            nearest = np.argsort(dis, kind='stable')[:k]
            labels = train_y[nearest].tolist()
            counts = [labels.count(label) for label in labels]
            # argmax returns the first maximum, i.e. the nearest of the
            # most frequent labels.
            output[i] = labels[int(np.argmax(counts))]
        return output

    def score(self,x,y):
        """Return the fraction of samples in x whose prediction equals y."""
        pred = self.predict(x).ravel()
        truth = np.asarray(y).ravel()
        err = int(np.count_nonzero(pred != truth))
        return 1-float(err/pred.shape[0])


# Demo: run the hand-written KNN classifier and scikit-learn's
# KNeighborsClassifier on the iris data set and print both scores.
if __name__ == '__main__':
    from sklearn import datasets
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    # Alternative tiny 2-D data set, left here disabled:
    # x = np.array([[0.5,0.4],[0.1,0.2],[0.7,0.8],[0.2,0.1],[0.4,0.6],[0.9,0.9],[1,1]]).reshape(-1,2)
    # y = np.array([0,1,0,1,0,1,1]).reshape(-1,1)
    clf = KNNClassfier(k=3)
    clf.fit(x,y)
    # NOTE: both scores below are computed on the same samples that were
    # used for fitting, not on held-out data.
    print('myknn score:',clf.score(x,y))
    from sklearn.neighbors import KNeighborsClassifier
    clf_sklearn = KNeighborsClassifier(n_neighbors=3)
    clf_sklearn.fit(x,y)
    print('sklearn score:',clf_sklearn.score(x,y))

In [None]:
#KNN sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection  import cross_val_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Load the iris data set
iris = load_iris()
x = iris.data
y = iris.target
k_range = range(1, 31)
k_error = []
# Try every k from 1 to 30 (range(1, 31) excludes 31) and record the error
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # cv=6 requests 6-fold cross-validation, i.e. a 5:1 train/test split per fold
    scores = cross_val_score(knn, x, y, cv=6, scoring='accuracy')
    # Error is one minus the mean accuracy over the folds
    k_error.append(1 - scores.mean())

# Plot: x axis is the value of k, y axis is the error
plt.plot(k_range, k_error)
plt.xlabel('Value of K for KNN')
plt.ylabel('Error')
plt.show()

In [None]:
#KNN画图
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 11

# Load some data to play with
iris = datasets.load_iris()
x = iris.data[:, :2]  # keep only the first two features so the result can be drawn in a 2-D plane
y = iris.target


h = .02  # step size of the mesh grid

# Colour maps: light colours for the background regions, bold colours for the training points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])


# `weights` is a parameter of the KNN model; draw one figure for each of the two weighting options
for weights in ['uniform', 'distance']:
    # Create a KNN classifier instance and fit it to the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(x, y)

    # Plot the decision boundary: predict a class for every point of the
    # mesh [x_min, x_max] x [y_min, y_max] so that each can be given a colour.
    x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the predictions into a colour plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Draw the training points on top
    plt.scatter(x[:, 0], x[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

# A single show() after the loop displays every figure created above
plt.show()

In [None]:
#KDTree
#!/usr/bin/python
# -*-coding:utf-8-*-

from collections import namedtuple
from operator import itemgetter
from pprint import pformat

# Tree node: a namedtuple holding the sample point stored at this node and
# its left and right subtrees (None where a subtree is empty).
class Node(namedtuple('Node', 'location left_child right_child')):
    def __repr__(self):
        # Pretty-print the node as a plain nested tuple.
        return pformat(tuple(self))

# Build a kd-tree
def kdtree(point_list, depth=0):
    """Recursively build a kd-tree from a sequence of points.

    point_list : sequence of equal-length points (for example tuples); it is
        not modified
    depth      : depth of the node being built; selects the splitting axis
    returns    : the root Node, or None when point_list is empty
    """
    if len(point_list) == 0:
        return None
    # Assume all points have the same number of dimensions
    k = len(point_list[0])
    # Choose the axis from the depth so that it cycles through all valid axes
    axis = depth % k

    # Sort a copy (the caller's sequence is left untouched) and use the
    # median element as the pivot
    ordered = sorted(point_list, key=itemgetter(axis))
    # Floor division
    median = len(ordered) // 2

    # Create the node and build both subtrees
    return Node(
        location=ordered[median],
        left_child=kdtree(ordered[:median], depth + 1),
        right_child=kdtree(ordered[median + 1:], depth + 1))

def main():
    """Build a kd-tree from six sample 2-D points and print it."""
    sample_points = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]
    print(kdtree(sample_points))

# Run the demo only when this code is executed as the main module.
if __name__ == '__main__':
    main()

In [None]:
#KD树sklearn
#!/usr/bin/python
# -*-coding:utf-8-*-

import numpy as np
from sklearn.neighbors import KDTree

# NOTE(review): no random numbers are drawn in the visible part of this cell,
# so this seed does not appear to affect the result shown here.
np.random.seed(0)
X = np.array([(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)])
tree = KDTree(X, leaf_size=2)
# ind: indices of the 3 nearest neighbours
# dist: distances to the 3 nearest neighbours
# [X[2]]: the query point
dist, ind = tree.query([X[2]], k=3)

print('ind:',ind)
print('dist:',dist)

In [None]:
#NB
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math

def loadCsv(filename):
    """Read a CSV file of numbers into a list of rows of floats.

    filename : path of the CSV file
    returns  : list of rows, each a list of float; blank lines are skipped

    Raises ValueError when a field cannot be converted to float.
    """
    # Python 3's csv.reader needs a text-mode file (binary mode makes it
    # fail); newline='' is the mode the csv module asks for.  The with-block
    # closes the file even when parsing fails.
    with open(filename, newline='') as handle:
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def splitDataset(dataset, splitRatio):
    """Randomly split dataset into a training part and the remainder.

    dataset    : list of rows; it is copied, not modified
    splitRatio : fraction of the rows that goes into the training part
    returns    : [training rows, remaining rows]
    """
    wanted = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    chosen = []
    # Move randomly picked rows over until the training part is full.
    while len(chosen) < wanted:
        chosen.append(remaining.pop(random.randrange(len(remaining))))
    return [chosen, remaining]

def separateByClass(dataset):
    """Group rows by their last element (the class value).

    returns : dict mapping each class value to the list of rows carrying it,
              with rows kept in their original order
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped

def mean(numbers):
    """Return the arithmetic mean of numbers as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of numbers (divisor N - 1)."""
    center = mean(numbers)
    squared_deviations = [pow(value - center, 2) for value in numbers]
    return math.sqrt(sum(squared_deviations) / float(len(numbers) - 1))

def summarize(dataset):
    """Return (mean, stdev) for every column of dataset except the last.

    dataset : list of rows; the last column is treated as the class value
    returns : list of (mean, stdev) tuples, one per attribute column
    """
    per_column = []
    for attribute in zip(*dataset):
        per_column.append((mean(attribute), stdev(attribute)))
    # The last column holds the class value, so its summary is discarded.
    per_column.pop()
    return per_column

def summarizeByClass(dataset):
    """Compute per-attribute (mean, stdev) summaries separately for each class.

    dataset : list of rows whose last element is the class value
    returns : dict mapping class value -> list of (mean, stdev) tuples
    """
    separated = separateByClass(dataset)
    # dict.iteritems() only exists on Python 2; items() is the Python 3 form.
    return {classValue: summarize(instances)
            for classValue, instances in separated.items()}

def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density at x for the given mean and stdev."""
    squared_distance = math.pow(x - mean, 2)
    spread = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_distance / spread))
    normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normalizer * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Return the product of per-attribute Gaussian densities for each class.

    summaries   : dict mapping class value -> list of (mean, stdev) tuples
    inputVector : sequence of attribute values; element i is matched with
                  the i-th summary of every class
    returns     : dict mapping class value -> product of the densities
    """
    probabilities = {}
    # dict.iteritems() only exists on Python 2; items() is the Python 3 form.
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        # mu / sigma avoid shadowing the module-level mean() and stdev().
        for i, (mu, sigma) in enumerate(classSummaries):
            probabilities[classValue] *= calculateProbability(inputVector[i], mu, sigma)
    return probabilities
            
def predict(summaries, inputVector):
    """Return the class value with the largest probability for inputVector.

    summaries   : dict mapping class value -> list of (mean, stdev) tuples
    inputVector : sequence of attribute values
    returns     : the winning class value (the first one on ties), or None
                  when summaries is empty
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    # dict.iteritems() only exists on Python 2; items() is the Python 3 form.
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    """Return the predicted class value for every row of testSet, in order."""
    return [predict(summaries, row) for row in testSet]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    testSet     : list of rows; the last element of each row is the true class
    predictions : predicted class per row, in the same order
    returns     : accuracy as a float between 0.0 and 100.0
    """
    hits = sum(1 for position, row in enumerate(testSet)
               if row[-1] == predictions[position])
    return (hits / float(len(testSet))) * 100.0

def main():
    """Train and evaluate the Gaussian naive Bayes model on the CSV data set.

    Loads 'pima-indians-diabetes.data.csv', randomly puts 67% of the rows
    into the training set, summarizes the training rows per class, predicts
    the remaining rows and prints the split sizes and the accuracy.
    """
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    # format() must be applied to the string: on Python 3 print() returns
    # None, so print(...).format(...) raises AttributeError.
    print('Split {0} rows into train={1} and test={2} rows'.format(
        len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

# Run the naive Bayes demo defined above.
main()

In [None]:
#sklearnNB
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the data: each non-blank line is tab-separated; the first three fields
# are the features and the last field is the class label.
X = []
Y = []
index = 0  # not used anywhere in this cell; kept in case another cell reads it
# The with-block closes the file even if a line fails to parse.
with open("datingTestSet.txt") as fr:
    for line in fr:
        line = line.strip()
        if not line:
            # Skip blank lines (for example a trailing newline at end of file).
            continue
        fields = line.split('\t')
        # Convert to float here so a malformed row fails at load time.
        X.append([float(value) for value in fields[:3]])
        Y.append(fields[-1])

# Scale every feature to the [0, 1] range
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split into training and test sets.  No random_state is passed, so the split
# (and therefore the report below) differs from run to run.
train_X,test_X, train_y, test_y = train_test_split(X,
                                                   Y,
                                                   test_size=0.2) # test_size: 20% of the rows go to the test set

# Gaussian naive Bayes model
model = GaussianNB()

model.fit(train_X, train_y)
print(model)

expected = test_y
predicted = model.predict(test_X)
print(metrics.classification_report(expected, predicted))       # print the per-class report
# Distinct labels, sorted so that the confusion-matrix rows and columns have
# the same order on every run (iteration order of a set of strings is not fixed).
label = sorted(set(Y))
print(metrics.confusion_matrix(expected, predicted, labels=label))  # print the confusion matrix
# There are three naive Bayes variants in total:
# Multinomial: applies smoothing, as described in "Statistical Learning Methods"
# Gaussian: suited to continuous features (conditional probabilities come from a normal distribution)
# Bernoulli: suited to discrete features; the variants differ in how the conditional probabilities are computed
