In [3]:
'''
要算λ，解析解肯定是行不通的;
对于最大熵模型对应的最优化问题，GIS，lbfgs，sgd等最优化算法都能解;
相比之下，GIS大概是最好实现的。
'''

'''数据为：
Outdoor Sunny Happy
Outdoor Sunny Happy Dry
Outdoor Sunny Happy Humid
Outdoor Sunny Sad Dry
Outdoor Sunny Sad Humid
Outdoor Cloudy Happy Humid
Outdoor Cloudy Happy Humid
Outdoor Cloudy Sad Humid
Outdoor Cloudy Sad Humid
Indoor Rainy Happy Humid
Indoor Rainy Happy Dry
Indoor Rainy Sad Dry
Indoor Rainy Sad Humid
Indoor Cloudy Sad Humid
Indoor Cloudy Sad Humid
'''


from collections import defaultdict
import math


# 1,定义模型：GIS优化算法
class MaxEnt(object):
    def __init__(self):
        self.feats = defaultdict(int)
        self.trainset = []
        self.labels = set()  
      
    def load_data(self,file):
        for line in open(file):
            fields = line.strip().split()
            # at least two columns
            if len(fields) < 2: continue
            # the first column is label
            label = fields[0]
            self.labels.add(label)
            for f in set(fields[1:]):
                # (label,f) tuple is feature 
                self.feats[(label,f)] += 1
            self.trainset.append(fields)
            
    def _initparams(self):
        self.size = len(self.trainset)
        # M param for GIS training algorithm
        self.M = max([len(record)-1 for record in self.trainset])
        self.ep_ = [0.0]*len(self.feats)
        for i,f in enumerate(self.feats):
            # calculate feature expectation on empirical distribution
            self.ep_[i] = float(self.feats[f])/float(self.size)
            # each feature function correspond to id
            self.feats[f] = i
        # init weight for each feature
        self.w = [0.0]*len(self.feats)
        self.lastw = self.w
        
    def probwgt(self,features,label):
        wgt = 0.0
        for f in features:
            if (label,f) in self.feats:
                wgt += self.w[self.feats[(label,f)]]
        return math.exp(wgt)
            
    """
    calculate feature expectation on model distribution
    """        
    def Ep(self):
        ep = [0.0]*len(self.feats)
        for record in self.trainset:
            features = record[1:]
            # calculate p(y|x)
            prob = self.calprob(features)
            for f in features:
                for w,l in prob:
                    # only focus on features from training data.
                    if (l,f) in self.feats:
                        # get feature id
                        idx = self.feats[(l,f)]
                        # sum(1/N * f(y,x)*p(y|x)), p(x) = 1/N
                        ep[idx] += w * (1.0/self.size)
        return ep
    
    def _convergence(self,lastw,w):
        for w1,w2 in zip(lastw,w):
            if abs(w1-w2) >= 0.01:
                return False
        return True
                
    def train(self, max_iter =1000):
        self._initparams()
        for i in range(max_iter):
            print('iter %d ...'%(i+1))
            # calculate feature expectation on model distribution
            self.ep = self.Ep()           
            self.lastw = self.w[:]  
            for i,w in enumerate(self.w):
                delta = 1.0/self.M * math.log(self.ep_[i]/self.ep[i])
                # update w
                self.w[i] += delta
            print(self.w)
            # test if the algorithm is convergence
            if self._convergence(self.lastw,self.w):
                break
    
    def calprob(self,features):
        wgts = [(self.probwgt(features, l),l) for l in self.labels]
        Z = sum([ w for w,l in wgts])
        prob = [ (w/Z,l) for w,l in wgts]
        return prob 
            
    def predict(self,input):
        features = input.strip().split()
        prob = self.calprob(features)
        prob.sort(reverse=True)
        return prob   

In [4]:
# 2，训练模型：
model = MaxEnt()

model.load_data('../DataSets/gameLocation.dat')

model.train()

iter 1 ...
[0.11889164797957746, 0.23104906018664842, 0.0, 0.060773852264651596, 0.0, 0.09589402415059367, 0.23104906018664842, -0.07438118377140324, -0.18653859597847416, 0.0, 0.0, -0.13515503603605475]
iter 2 ...
[0.16774470869242056, 0.39342926385722005, -0.024041727666437477, 0.06263725548769755, -0.033941158312750054, 0.12315355920031779, 0.4603225478049182, -0.07715690147389087, -0.2842974253623657, 0.025911470552945935, 0.037793085198623795, -0.18377904505135018]
iter 3 ...
[0.19870920683265628, 0.5321697579570994, -0.04037675421991352, 0.05879421172232712, -0.05906896095953148, 0.1409492415240725, 0.6565728946531962, -0.07130796120852223, -0.3510652882167372, 0.043088432656839426, 0.06497067687314947, -0.21675116773525774]
iter 4 ...
[0.22051022594726818, 0.6548790089328552, -0.05027585410260586, 0.05397747303411313, -0.07620648567492985, 0.15485457212869508, 0.8265378848816127, -0.06394977866543088, -0.40004267434730967, 0.053290530237544514, 0.08303726721459531, -0.2429339673

In [5]:
model.predict('Sunny')

[(0.9763203118841158, 'Outdoor'), (0.02367968811588421, 'Indoor')]

In [6]:
model.predict('Cloudy')

[(0.7136730549489295, 'Outdoor'), (0.28632694505107054, 'Indoor')]