In [1]:
from __future__ import division
from math import exp
from numpy import *
import numpy as np
from random import normalvariate
import pandas as pd
# reference
# https://segmentfault.com/a/1190000020254554
# https://blog.csdn.net/john_xyz/article/details/78933253

In [2]:
#与FM相比，修改读入数据，xgb输出后的one-hot；数据预处理去掉归一化
train_data = 'fm_data/xgb_train.txt'
test_data = 'fm_data/xgb_test.txt'

def preprocess(data):
    feature=np.array(data.iloc[:,:-1])
    label=data.iloc[:,-1].map(lambda x: 1 if x==1 else -1)
    label=np.array(label)
    return feature,label

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

In [3]:
def SGD_FM(data, label, k, iter):
    m, num_feature = shape(data)
    alpha = 0.01
    w = zeros((num_feature, 1))#一阶特征的系数
    w_0 = 0.0
    v = normalvariate(0, 0.2) * ones((num_feature, k))   #即生成辅助向量，用来训练二阶交叉特征的系数

    for it in range(iter):
        true_loss = 0.0
        false_loss = 0.0
        for x in range(m):
            fm_1 = data[x] * v
            fm_2 = multiply(data[x], data[x]) * multiply(v, v)
            fm = sum(multiply(fm_1, fm_1) - fm_2) / 2.
            y = w_0 + data[x] * w + fm  # FM预测输出
            loss = sigmoid(label[x] * y[0, 0])-1    #真实损失过大，与真实损失相像。loss针对当前样本，不做为最后loss衡量
            true_loss += -np.log(sigmoid(label[x] * y[0, 0]))
            false_loss += -loss
            w_0 -= alpha * loss * label[x]
            for i in range(num_feature):
                if data[x, i] != 0:
                    w[i, 0] -= alpha * loss * label[x] * data[x, i]
                    for j in range(k):
                        v[i, j] -= alpha * loss * label[x] * (data[x, i] * fm_1[0, j] - v[i, j] * data[x, i] * data[x, i])
        print("第{}次迭代后真损失为{}，假损失为{}".format(it, true_loss, false_loss))
    return w_0, w, v
def predict(data, label, w_0, w, v):
    m, _ = shape(data)
    err = 0
    num = 0
    result = []
    for x in range(m):   #计算每一个样本的误差
        num += 1
        fm_1 = data[x] * v
        fm_2 = multiply(data[x], data[x]) * multiply(v, v)
        fm = sum(multiply(fm_1, fm_1) - fm_2) / 2.
        y = w_0 + data[x] * w + fm  # 计算预测的输出
        prop = sigmoid(y[0, 0])
        result.append(prop)
        if prop < 0.5 and label[x] == 1.0:
            err += 1
        elif prop >= 0.5 and label[x] == -1.0:
            err += 1
        else:
            continue
    return float(err) / num

In [4]:
train = pd.read_csv(train_data,header=None)
test = pd.read_csv(test_data,header=None)
x_train, y_train = preprocess(train)
x_test, y_test = preprocess(test)
w_0, w, v = SGD_FM(mat(x_train), y_train, 20, 200)
print("xgb+fm 训练集acc：%f" % (1 - predict(mat(x_train), y_train, w_0, w, v)))
print("xgb+fm 测试集acc：%f" % (1 - predict(mat(x_test), y_test, w_0, w, v)))

第0次迭代后真损失为8406.334882724761，假损失为210.6303125906377
第1次迭代后真损失为1428.766441221198，假损失为116.56587913917106
第2次迭代后真损失为654.8476676542087，假损失为79.01315163723822
第3次迭代后真损失为355.8367402548686，假损失为60.40939294130539
第4次迭代后真损失为212.99210928587206，假损失为49.059937795199744
第5次迭代后真损失为132.92017519837785，假损失为40.339982087439715
第6次迭代后真损失为88.71008355202841，假损失为33.6946359800224
第7次迭代后真损失为65.19222372609495，假损失为28.037475174930975
第8次迭代后真损失为51.98021889581952，假损失为24.027077212748292
第9次迭代后真损失为43.049743771928554，假损失为21.2287275380625
第10次迭代后真损失为36.31249563714528，假损失为19.048047736726524
第11次迭代后真损失为30.986794824655934，假损失为17.233395158833943
第12次迭代后真损失为26.63209179522747，假损失为15.678409074341577
第13次迭代后真损失为22.94277552292733，假损失为14.350907654958323
第14次迭代后真损失为19.725672200248404，假损失为13.194093489290044
第15次迭代后真损失为16.90847419819993，假损失为12.120707034853313
第16次迭代后真损失为14.519017666524908，假损失为11.091191876331441
第17次迭代后真损失为12.57743648835995，假损失为10.126373083683001
第18次迭代后真损失为11.029745980921003，假损失为9.247202892623074
第19次迭代后真损失为9.7924067678