In [3]:
'''MNIST数据集:
将这个图作为特征，但需要先经过二值化处理,
还有一点，与其他分类器不同的是，最大熵模型中的f(x,y)中的x是单独的一个特征，不是一个n维特征向量，
而经过二值化处理过的特征都是0与1，因此我们需要对每个维度特征加一个区分标签,
如X=(x0,x1,x2,...)变为X=(0_x0,1_x1,2_x2,...)。
Reference:https://github.com/WenDesi/lihang_book_algorithm，主要是把py2转换成py3。
'''



import pandas as pd
import numpy as np

import time
import math
import random

from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# 1，定义模型：
class MaxEnt(object):
    """Maximum-entropy classifier over per-sample collections of feature tokens.

    A sample is an iterable of hashable feature tokens and a label is any
    hashable value.  Every ``(token, label)`` pair that occurs in the training
    data gets one indicator feature function and one weight in ``self.w``.
    Weights are fitted by repeatedly adding
    ``1 / M * log(empirical expectation / model expectation)``.

    Page and equation numbers mentioned in the method docs are the ones cited
    by the original comments for the accompanying textbook.
    """

    def init_params(self, X, Y):
        """Collect counts and build all lookup tables needed for training.

        Args:
            X: sequence of samples, each an iterable of feature tokens.
            Y: labels, one per sample, in the same order as ``X``.

        Raises:
            ValueError: if ``X`` and ``Y`` differ in length.
        """
        self.X_ = X
        self.Y_ = set()                 # distinct labels seen in training

        self.cal_Pxy_Px(X, Y)

        self.N = len(X)                 # number of training samples
        self.n = len(self.Pxy)          # number of distinct (x, y) pairs
        # Plays the role of the constant M from p.91 of the book, but the
        # book's value is not used here; since every step is scaled by 1/M
        # it effectively acts as an inverse learning rate.
        self.M = 10000.0

        self.build_dict()
        self.cal_EPxy()

    def build_dict(self):
        """Assign a dense integer id to every observed ``(x, y)`` pair."""
        self.id2xy = {}
        self.xy2id = {}

        for idx, pair in enumerate(self.Pxy):
            self.id2xy[idx] = pair
            self.xy2id[pair] = idx

    def cal_Pxy_Px(self, X, Y):
        """Count occurrences of every ``(x, y)`` pair and of every ``x``.

        Also records each label in ``self.Y_``.

        Raises:
            ValueError: if ``X`` and ``Y`` differ in length.
        """
        if len(X) != len(Y):
            raise ValueError(
                'X and Y must have the same length, got %d and %d'
                % (len(X), len(Y)))

        self.Pxy = defaultdict(int)
        self.Px = defaultdict(int)

        # Pair samples and labels by position (zip) instead of Y[i], so label
        # containers whose [] operator is not positional are still paired in
        # iteration order.
        for features, y in zip(X, Y):
            self.Y_.add(y)

            for x in features:
                self.Pxy[(x, y)] += 1
                self.Px[x] += 1

    def cal_EPxy(self):
        """Compute the empirical expectation of every feature function.

        This is the expectation at the bottom of p.82 of the book: the count
        of each ``(x, y)`` pair divided by the number of training samples.
        """
        self.EPxy = defaultdict(float)
        for idx in range(self.n):
            self.EPxy[idx] = float(self.Pxy[self.id2xy[idx]]) / float(self.N)

    def _score(self, X, y):
        """Return the sum of the weights of the active features of ``(X, y)``."""
        total = 0.0
        for x in X:
            idx = self.xy2id.get((x, y))
            if idx is not None:
                total += self.w[idx]
        return total

    def cal_pyx(self, X, y):
        """Return ``(exp(score), y)``, the unnormalised probability of ``y``.

        Kept for backward compatibility; ``cal_probality`` no longer relies on
        it because ``math.exp`` raises ``OverflowError`` for large scores.
        """
        return (math.exp(self._score(X, y)), y)

    def cal_probality(self, X):
        """Return ``[(P(y|X), y), ...]`` for every known label (Eq. 6.22, p.85).

        The largest score is subtracted before exponentiating.  This leaves
        the normalised probabilities mathematically unchanged but keeps
        ``math.exp`` from overflowing and guarantees a normaliser >= 1, so
        extreme weights can no longer trigger ``OverflowError`` or a division
        by zero.
        """
        scored = [(self._score(X, y), y) for y in self.Y_]
        if not scored:
            return []

        peak = max(s for s, _ in scored)
        shifted = [(math.exp(s - peak), y) for s, y in scored]
        Z = sum(p for p, _ in shifted)
        return [(p / Z, y) for p, y in shifted]

    def cal_EPx(self):
        """Compute the model expectation of every feature function.

        This is the expectation at the top of p.83 of the book, evaluated
        with the current weights over all training samples.
        """
        self.EPx = [0.0] * self.n

        for X in self.X_:
            weight = 1.0 / self.N
            Pyxs = self.cal_probality(X)

            for x in X:
                for Pyx, y in Pyxs:
                    idx = self.xy2id.get((x, y))
                    if idx is not None:
                        self.EPx[idx] += Pyx * weight

    def fxy(self, x, y):
        """Indicator feature: True iff ``(x, y)`` was seen during training."""
        return (x, y) in self.xy2id

    def train(self, X, Y, max_iteration=1000, verbose=True, tol=None):
        """Fit the weights on the training data.

        Args:
            X: sequence of samples, each an iterable of feature tokens.
            Y: labels, one per sample.
            max_iteration: number of update sweeps (default 1000, the value
                that used to be hard-coded; larger values such as 8000 can
                be passed for a longer fit).
            verbose: when True (default) print the iteration counter on every
                sweep, as before.
            tol: optional early-stopping threshold.  When given, training
                stops after the first sweep in which every weight change is
                smaller than ``tol`` in absolute value.  ``None`` (default)
                always runs ``max_iteration`` sweeps.

        Raises:
            ValueError: if ``X`` and ``Y`` differ in length.
        """
        self.init_params(X, Y)
        self.w = [0.0] * self.n

        for times in range(max_iteration):
            if verbose:
                print('iterater times %d' % times)
            self.cal_EPx()

            sigmas = [
                1 / self.M * math.log(self.EPxy[i] / self.EPx[i])
                for i in range(self.n)
            ]
            self.w = [self.w[i] + sigmas[i] for i in range(self.n)]

            if tol is not None and all(abs(s) < tol for s in sigmas):
                break

    def predict(self, testset):
        """Return the most probable label for every sample in ``testset``."""
        results = []
        for test in testset:
            result = self.cal_probality(test)
            results.append(max(result, key=lambda x: x[0])[1])
        return results


def rebuild_features(features):
    '''
    Tag every value of every row with its column position.

    A row (a0, a1, a2, a3, a4, ...) is turned into the list
    ['0_a0', '1_a1', '2_a2', '3_a3', '4_a4', ...], so the same value in
    different columns becomes a distinct feature token.  Returns a list
    with one such list per input row.
    '''
    return [
        ['_'.join((str(position), str(value)))
         for position, value in enumerate(row)]
        for row in features
    ]

In [4]:
# 2. Train the model: load the CSV, split it, fit MaxEnt, and report accuracy.
print('Start read data')

time_1 = time.time()

# header=0: the first row of the CSV is treated as column names.
raw_data = pd.read_csv('../DataSets/train_binary.csv', header=0)
data = raw_data.values

# Column 0 is used as the label; every remaining column is used as a feature.
# NOTE(review): presumably the features are already-binarised pixels, judging
# by the file name — confirm against the data set.
imgs = data[0::, 1::]
labels = data[::, 0]

# Use roughly 2/3 of the data for training and 1/3 for testing
# (fixed random_state so the split is reproducible).
train_features, test_features, train_labels, test_labels = train_test_split(
    imgs, labels, test_size=0.33, random_state=23323)

# Convert each row into position-tagged tokens ("<column>_<value>"), the
# per-feature form MaxEnt works on.
train_features = rebuild_features(train_features)
test_features = rebuild_features(test_features)

# Note: this interval covers reading, splitting and rebuilding the features.
time_2 = time.time()
print('read data cost ', time_2 - time_1, ' second', '\n')

print('Start training')
met = MaxEnt()
met.train(train_features, train_labels)

time_3 = time.time()
print('training cost ', time_3 - time_2, ' second', '\n')

print('Start predicting')
test_predict = met.predict(test_features)
time_4 = time.time()
print('predicting cost ', time_4 - time_3, ' second', '\n')

# Fraction of test samples whose predicted label matches the true label.
score = accuracy_score(test_labels, test_predict)
print("The accruacy socre is ", score)

Start read data
read data cost  34.22603178024292  second 

Start training
iterater times 0
iterater times 1
iterater times 2
iterater times 3
iterater times 4
iterater times 5
iterater times 6
iterater times 7
iterater times 8
iterater times 9
iterater times 10
iterater times 11
iterater times 12
iterater times 13
iterater times 14
iterater times 15
iterater times 16
iterater times 17
iterater times 18
iterater times 19
iterater times 20
iterater times 21
iterater times 22
iterater times 23
iterater times 24
iterater times 25
iterater times 26
iterater times 27
iterater times 28
iterater times 29
iterater times 30
iterater times 31
iterater times 32
iterater times 33
iterater times 34
iterater times 35
iterater times 36
iterater times 37
iterater times 38
iterater times 39
iterater times 40
iterater times 41
iterater times 42
iterater times 43
iterater times 44
iterater times 45
iterater times 46
iterater times 47
iterater times 48
iterater times 49
iterater times 50
iterater times 51

iterater times 434
iterater times 435
iterater times 436
iterater times 437
iterater times 438
iterater times 439
iterater times 440
iterater times 441
iterater times 442
iterater times 443
iterater times 444
iterater times 445
iterater times 446
iterater times 447
iterater times 448
iterater times 449
iterater times 450
iterater times 451
iterater times 452
iterater times 453
iterater times 454
iterater times 455
iterater times 456
iterater times 457
iterater times 458
iterater times 459
iterater times 460
iterater times 461
iterater times 462
iterater times 463
iterater times 464
iterater times 465
iterater times 466
iterater times 467
iterater times 468
iterater times 469
iterater times 470
iterater times 471
iterater times 472
iterater times 473
iterater times 474
iterater times 475
iterater times 476
iterater times 477
iterater times 478
iterater times 479
iterater times 480
iterater times 481
iterater times 482
iterater times 483
iterater times 484
iterater times 485
iterater tim

iterater times 866
iterater times 867
iterater times 868
iterater times 869
iterater times 870
iterater times 871
iterater times 872
iterater times 873
iterater times 874
iterater times 875
iterater times 876
iterater times 877
iterater times 878
iterater times 879
iterater times 880
iterater times 881
iterater times 882
iterater times 883
iterater times 884
iterater times 885
iterater times 886
iterater times 887
iterater times 888
iterater times 889
iterater times 890
iterater times 891
iterater times 892
iterater times 893
iterater times 894
iterater times 895
iterater times 896
iterater times 897
iterater times 898
iterater times 899
iterater times 900
iterater times 901
iterater times 902
iterater times 903
iterater times 904
iterater times 905
iterater times 906
iterater times 907
iterater times 908
iterater times 909
iterater times 910
iterater times 911
iterater times 912
iterater times 913
iterater times 914
iterater times 915
iterater times 916
iterater times 917
iterater tim