In [1]:
from __future__ import division
from math import exp
from numpy import *
import numpy as np
from random import normalvariate
import pandas as pd
# reference
# https://segmentfault.com/a/1190000020254554
# https://blog.csdn.net/john_xyz/article/details/78933253

In [2]:
#与FM相比，修改读入数据，xgb输出后的one-hot；数据预处理去掉归一化
train_data = 'fm_data/xgb_train.txt'
test_data = 'fm_data/xgb_test.txt'

def preprocess(data):
    feature=np.array(data.iloc[:,:-1])
    label=data.iloc[:,-1].map(lambda x: 1 if x==1 else -1)
    label=np.array(label)
    return feature,label

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

In [3]:
def SGD_FM(data, label, k, iter):
    m, num_feature = shape(data)
    alpha = 0.01
    w = zeros((num_feature, 1))#一阶特征的系数
    w_0 = 0.0
    v = normalvariate(0, 0.2) * ones((num_feature, k))   #即生成辅助向量，用来训练二阶交叉特征的系数

    for it in range(iter):
        true_loss = 0.0
        false_loss = 0.0
        for x in range(m):
            fm_1 = data[x] * v
            fm_2 = multiply(data[x], data[x]) * multiply(v, v)
            fm = sum(multiply(fm_1, fm_1) - fm_2) / 2.
            y = w_0 + data[x] * w + fm  # FM预测输出
            loss = sigmoid(label[x] * y[0, 0])-1    #真实损失过大，与真实损失相像。loss针对当前样本，不做为最后loss衡量
            true_loss += -np.log(sigmoid(label[x] * y[0, 0]))
            false_loss += -loss
            w_0 -= alpha * loss * label[x]
            for i in range(num_feature):
                if data[x, i] != 0:
                    w[i, 0] -= alpha * loss * label[x] * data[x, i]
                    for j in range(k):
                        v[i, j] -= alpha * loss * label[x] * (data[x, i] * fm_1[0, j] - v[i, j] * data[x, i] * data[x, i])
        print("第{}次迭代后真损失为{}，假损失为{}".format(it, true_loss, false_loss))
    return w_0, w, v
def predict(data, label, w_0, w, v):
    m, _ = shape(data)
    err = 0
    num = 0
    result = []
    for x in range(m):   #计算每一个样本的误差
        num += 1
        fm_1 = data[x] * v
        fm_2 = multiply(data[x], data[x]) * multiply(v, v)
        fm = sum(multiply(fm_1, fm_1) - fm_2) / 2.
        y = w_0 + data[x] * w + fm  # 计算预测的输出
        prop = sigmoid(y[0, 0])
        result.append(prop)
        if prop < 0.5 and label[x] == 1.0:
            err += 1
        elif prop >= 0.5 and label[x] == -1.0:
            err += 1
        else:
            continue
    return float(err) / num

In [4]:
train = pd.read_csv(train_data)
test = pd.read_csv(test_data)
x_train, y_train = preprocess(train)
x_test, y_test = preprocess(test)
w_0, w, v = SGD_FM(mat(x_train), y_train, 20, 200)
print("训练集acc：%f" % (1 - predict(mat(x_train), y_train, w_0, w, v)))
print("测试集acc：%f" % (1 - predict(mat(x_test), y_test, w_0, w, v)))

第0次迭代后真损失为5639.592912342595，假损失为207.30214221716773
第1次迭代后真损失为946.2647251136234，假损失为111.00812591365998
第2次迭代后真损失为430.5663029537502，假损失为75.61740640936961
第3次迭代后真损失为235.06152972570186，假损失为57.06018562934807
第4次迭代后真损失为141.06702125553775，假损失为46.20783923391546
第5次迭代后真损失为90.38387129754874，假损失为38.02062275232458
第6次迭代后真损失为63.734571376230754，假损失为31.39637066264236
第7次迭代后真损失为49.91504996862862，假损失为26.604638399710193
第8次迭代后真损失为41.26084465352702，假损失为23.3704222809332
第9次迭代后真损失为34.9539089968962，假损失为20.95802601828652
第10次迭代后真损失为30.039643630515943，假损失为19.0114003281696
第11次迭代后真损失为26.064739605832443，假损失为17.3779979273766
第12次迭代后真损失为22.75140849542275，假损失为15.97938045003641
第13次迭代后真损失为19.925441332863837，假损失为14.749613954530298
第14次迭代后真损失为17.49104360234007，假损失为13.624052033193838
第15次迭代后真损失为15.408643279569597，假损失为12.559958676799196
第16次迭代后真损失为13.664844378040945，假损失为11.55602020799293
第17次迭代后真损失为12.234669818487351，假损失为10.638342121933174
第18次迭代后真损失为11.067840596731354，假损失为9.824745617558564
第19次迭代后真损失为10.10755510015412