In [1]:
import math
from copy import deepcopy

In [67]:
class MaxEntrop:
    """Maximum-entropy classifier over categorical feature values.

    Each training sample is a sequence whose first element is the class label
    and whose remaining elements are the feature values observed for that
    sample.  Every distinct (feature value, label) pair seen in the training
    data becomes one feature function with one weight.  A feature function
    fires once per occurrence of its value in a sample, so a value repeated
    inside one sample counts as many times as it appears.

    Typical use::

        model = MaxEntrop()
        model.loadData(rows)
        model.train()
        probabilities = model.predict(feature_values)
    """

    # Data pre-processing state
    def __init__(self, EPS=0.005):
        """Create an empty, untrained model.

        Args:
            EPS: convergence threshold; training stops once no weight changes
                by EPS or more during a full pass.
        """
        self._samples = []   # private deep copy of the training rows
        self._Y = set()      # distinct class labels
        self._numXY = {}     # (x, y) -> number of occurrences in the data
        self._N = 0          # number of training samples
        self._Ep_ = []       # empirical expectation of each feature function
        self._xyID = {}      # (x, y) -> feature index
        self._n = 0          # number of distinct (x, y) feature functions
        self._C = 0          # largest number of feature values in one sample
        self._IDxy = {}      # feature index -> (x, y)
        self._w = []         # one weight per feature function
        self._EPS = EPS      # convergence threshold
        self._lastw = []     # weights as they were before the current pass

    @property
    def samples(self):
        """Training rows currently loaded (read-only view of the private copy)."""
        return self._samples

    @property
    def C(self):
        """Largest number of feature values found in a single training row."""
        return self._C

    def loadData(self, dataset):
        """Index a training set and reset the weights to zero.

        Any previously loaded data and learned weights are discarded, so the
        method can be called again with a different data set.

        Args:
            dataset: sequence of rows, each ``[label, value_1, value_2, ...]``.
                The rows are deep-copied; the caller's object is not modified.

        Raises:
            ValueError: if ``dataset`` contains no rows.
        """
        samples = deepcopy(dataset)
        if not samples:
            raise ValueError("dataset must contain at least one sample")

        # Start from a clean slate so repeated calls do not accumulate counts.
        self._samples = samples
        self._Y = set()
        self._numXY = {}
        self._xyID = {}
        self._IDxy = {}

        for items in self._samples:
            y = items[0]
            self._Y.add(y)  # a label that is already present is ignored by the set
            for x in items[1:]:
                self._numXY[(x, y)] = self._numXY.get((x, y), 0) + 1

        self._N = len(self._samples)
        self._n = len(self._numXY)
        self._C = max(len(sample) - 1 for sample in self._samples)
        print("self._c", self._C)
        self._w = [0] * self._n
        self._lastw = self._w[:]

        self._Ep_ = [0] * self._n
        for i, xy in enumerate(self._numXY):
            # Empirical expectation of feature function i.
            self._Ep_[i] = self._numXY[xy] / self._N
            self._xyID[xy] = i
            self._IDxy[i] = xy

    def _score(self, y, X):
        """Return the sum of the weights of all features (x, y) active for X."""
        ss = 0
        for x in X:
            idx = self._xyID.get((x, y))
            if idx is not None:  # pairs never seen in training carry no weight
                ss += self._w[idx]
        return ss

    def _Zx(self, X):
        """Return the normalisation factor: sum over labels of exp(score)."""
        return sum(math.exp(self._score(y, X)) for y in self._Y)

    def _model_pyx(self, y, X):
        """Return the model probability P(y | X) for feature values X."""
        return math.exp(self._score(y, X)) / self._Zx(X)

    def _model_ep(self, index):
        """Return the expectation of feature function ``index`` under the model.

        Only the feature part of each training row is inspected; the label in
        position 0 is never treated as a feature value.
        """
        x, y = self._IDxy[index]
        ep = 0
        for sample in self._samples:
            features = sample[1:]
            occurrences = features.count(x)
            if not occurrences:
                continue
            # Weight by the occurrence count so the model expectation uses the
            # same feature value as the empirical counts gathered in loadData.
            ep += self._model_pyx(y, features) * occurrences / self._N
        return ep

    def _convergence(self):
        """Return True when every weight moved by less than EPS in the last pass."""
        return all(abs(last - now) < self._EPS
                   for last, now in zip(self._lastw, self._w))

    def predict(self, X):
        """Return ``{label: P(label | X)}`` for the given feature values.

        Args:
            X: iterable of feature values, without a label.

        Returns:
            dict mapping every label seen in training to its probability; an
            empty dict if no data has been loaded.
        """
        exp_scores = {y: math.exp(self._score(y, X)) for y in self._Y}
        Z = sum(exp_scores.values())
        return {y: value / Z for y, value in exp_scores.items()}

    def train(self, maxiter=1000):
        """Fit the weights by iterative scaling.

        Args:
            maxiter: upper bound on the number of passes over all features.
                Training stops earlier once the weights have converged.
        """
        for loop in range(maxiter):
            self._lastw = self._w[:]

            # Closed-form scaling step: w_i += log(E_empirical / E_model) / C.
            # This is the solution of the improved iterative scaling (IIS)
            # equation when every sample has the same feature count C.
            # Weights are updated in place, so later features in the same pass
            # already see the earlier updates.
            for i in range(self._n):
                ep = self._model_ep(i)
                self._w[i] += math.log(self._Ep_[i] / ep) / self._C
            if self._convergence():
                break
         

In [68]:
# Toy training records, one per line of the table below: the class label comes
# first, followed by that record's categorical feature values (the columns look
# like outlook, temperature, humidity and windy -- TODO confirm with the data's
# origin).  Each line is split on whitespace into a list of strings.
dataset = [
    record.split()
    for record in """\
no   sunny     hot   high    FALSE
no   sunny     hot   high    TRUE
yes  overcast  hot   high    FALSE
yes  rainy     mild  high    FALSE
yes  rainy     cool  normal  FALSE
no   rainy     cool  normal  TRUE
yes  overcast  cool  normal  TRUE
no   sunny     mild  high    FALSE
yes  sunny     cool  normal  FALSE
yes  rainy     mild  normal  FALSE
yes  sunny     mild  normal  TRUE
yes  overcast  mild  high    TRUE
yes  overcast  hot   normal  FALSE
no   rainy     mild  high    TRUE
""".splitlines()
]

In [69]:
# Create an untrained model with the default convergence threshold (EPS = 0.005).
maxent = MaxEntrop()
# Feature values of the record to classify; no class label is included.
x = ['overcast', 'mild', 'high', 'FALSE']

In [70]:
# Index the training records, then run at most 1000 scaling passes; train()
# stops earlier once no weight changes by EPS or more during a pass.
# loadData() also prints the largest per-record feature count ("self._c").
maxent.loadData(dataset)
maxent.train(1000)


self._c 4


In [61]:
# predict() returns {label: P(label | x)}.  The value shown is the model's
# probability that the single query x belongs to class "yes", as a percentage.
# It is not an accuracy figure, so the message is labelled as a probability.
print("预测为 yes 的概率：%f%%" % (maxent.predict(x)["yes"] * 100))
# Learned weights, one per (feature value, label) pair in first-seen order.
print("w", maxent._w)

准确率：99.999718
w [3.8083642640626567, 0.03486819339596017, 1.6400224976589863, -4.463151671894514, 1.7883062251202593, 5.3085267683086395, -0.13398764643967703, -2.2539799445450392, 1.484078418970969, -1.8909065913678864, 1.9332493167387288, -1.262945447606903, 1.725751941905932, 2.967849703391228, 3.9061632698216293, -9.520241584621717, -1.8736788731126408, -3.4838446608661995, -5.637874599559358]
