In [1]:
import xgboost as xgb
from collections import namedtuple
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Locations of the comma-separated train / test files read later in the notebook.
train_data, test_data = 'fm_data/train.txt', 'fm_data/test.txt'

In [2]:
def show_accuracy(y_hat, y_t, tip):
    """Print the per-sample hit mask and the accuracy of thresholded predictions.

    Scores are converted to class labels with a 0.5 cut-off (``>= 0.5`` -> 1,
    anything else -> 0) and compared element-wise with the true labels.

    Args:
        y_hat: array-like of predicted scores.
        y_t: array-like of ground-truth labels with the same number of elements.
        tip: text printed in front of the accuracy figure.

    Returns:
        None. The boolean hit mask and the accuracy are written to stdout.

    The thresholding works on a fresh array, so the caller's ``y_hat`` keeps
    its original scores after the call.
    """
    scores = np.asarray(y_hat)
    # Vectorised threshold; np.where allocates a new array instead of
    # overwriting the scores that were passed in.
    predicted = np.where(scores >= 0.5, 1, 0)
    acc = predicted.ravel() == np.asarray(y_t).ravel()
    print (acc)
    print (tip + '正确率：\t', float(acc.sum()) / predicted.size)

def preprocess(data):
    """Split a table into a feature matrix and a label vector.

    Args:
        data: object supporting ``.iloc`` positional indexing; the last
            column is taken as the label, every other column as a feature.

    Returns:
        ``(feature, label)`` as NumPy arrays. The first row of each is
        printed as a quick sanity check.
    """
    label_column = data.iloc[:, -1]
    feature_columns = data.iloc[:, :-1]
    feature, label = np.array(feature_columns), np.array(label_column)
    # Show the first (unnormalised) sample together with its label.
    print("未归一化处理数据：", feature[0], ',', label[0])
    return feature, label

In [3]:
# Read both splits; the files carry no header row.
train, test = (pd.read_csv(path, header=None) for path in (train_data, test_data))

# Separate features from the label column (train first, then test, so the
# sanity-check prints appear in that order).
(x_train, y_train), (x_test, y_test) = preprocess(train), preprocess(test)

# Wrap each split in the DMatrix container that xgb.train / Booster.predict take.
data_train, data_test = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

未归一化处理数据： [ 1.    85.    66.    29.     0.    26.6    0.351 31.   ] , 0
未归一化处理数据： [ 3.    84.    72.    32.     0.    37.2    0.267 28.   ] , 0


In [4]:
print("start")

# Booster settings for a binary classifier with logistic output.
# NOTE(review): `silent` appears to be superseded by `verbosity` in newer
# XGBoost releases — confirm against the installed version.
param = dict(max_depth=7, eta=0.3, silent=1, objective='binary:logistic')

# Splits whose metric is reported after every boosting round.
watchlist = [(data_test, 'eval'), (data_train, 'train')]

bst = xgb.train(
    params=param,
    dtrain=data_train,
    num_boost_round=10,
    evals=watchlist,
)

# Score the held-out split and report accuracy at the 0.5 threshold.
y_hat = bst.predict(data_test)
show_accuracy(y_hat, y_test, 'XGBoost ')

start
[0]	eval-error:0.254682	train-error:0.164329
[1]	eval-error:0.213483	train-error:0.164329
[2]	eval-error:0.2397	train-error:0.114228
[3]	eval-error:0.243446	train-error:0.088176
[4]	eval-error:0.2397	train-error:0.076152
[5]	eval-error:0.228464	train-error:0.064128
[6]	eval-error:0.23221	train-error:0.052104
[7]	eval-error:0.220974	train-error:0.048096
[8]	eval-error:0.228464	train-error:0.04008
[9]	eval-error:0.209738	train-error:0.036072
[ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True False
  True  True  True  True False False  True  True  True  True  True False
 False  True  True False  True  True  True  True False False  True False
  True  True  True  True  True  True  True False False  True  True  True
  True  True  True  True False False  True False  True  True  True  True
  True False  True  True  True  True  T

将每个样本在各棵树中落入的叶子节点编号转换为 one-hot 特征并保存

In [5]:
# Width of the one-hot slot reserved for each tree in the encoded matrix.
num_leaf = 64

print('Writing transformed training data')
leaf = bst.predict(data_train,pred_leaf=True)
print(np.array(leaf).shape)
print(leaf[0])

# Every leaf index must fit inside its tree's slot. An index >= num_leaf would
# silently land in the next tree's slot (or fall outside the matrix for the
# last tree), so fail loudly instead of writing corrupted features.
train_leaf_ids = np.asarray(leaf, dtype=np.int64)
if train_leaf_ids.min() < 0 or train_leaf_ids.max() >= num_leaf:
    raise ValueError(
        'leaf index range [%d, %d] does not fit in num_leaf=%d'
        % (train_leaf_ids.min(), train_leaf_ids.max(), num_leaf)
    )

# One row per sample, num_leaf columns per tree: N x (num_trees * num_leaf).
n_train, n_trees = train_leaf_ids.shape
transformed_training_matrix = np.zeros([n_train, n_trees * num_leaf], dtype=np.int64)
# Column of tree t / leaf l is t * num_leaf + l; set all hits in one
# vectorised assignment instead of a per-row Python loop.
train_cols = np.arange(n_trees) * num_leaf + train_leaf_ids
transformed_training_matrix[np.arange(n_train)[:, None], train_cols] = 1
transformed_training_matrix[0],len(transformed_training_matrix[0])

Writing transformed training data
(499, 10)
[45 19 54 41 28 39 40 46 31 19]


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [6]:
print('Writing transformed training data and label')
# Recompute the training leaves here rather than reading the shared `leaf`
# variable: a later cell rebinds `leaf` to the test split, and re-running this
# cell afterwards would pair test leaves with training labels.
train_leaf = np.asarray(bst.predict(data_train, pred_leaf=True), dtype=np.int64)

# Every leaf index must fit inside its tree's num_leaf-wide slot; otherwise it
# would spill into a neighbouring slot or into the label column.
if train_leaf.min() < 0 or train_leaf.max() >= num_leaf:
    raise ValueError(
        'leaf index range [%d, %d] does not fit in num_leaf=%d'
        % (train_leaf.min(), train_leaf.max(), num_leaf)
    )

# N x (num_trees * num_leaf + 1); the last column holds the label.
n_train_rows, n_train_trees = train_leaf.shape
transformed_training_data = np.zeros([n_train_rows, n_train_trees * num_leaf + 1], dtype=np.int64)
# Column of tree t / leaf l is t * num_leaf + l.
train_hot_cols = np.arange(n_train_trees) * num_leaf + train_leaf
transformed_training_data[np.arange(n_train_rows)[:, None], train_hot_cols] = 1
transformed_training_data[:, -1] = y_train
transformed_training_data[0],len(transformed_training_data[0])

Writing transformed training data and label


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
print('Writing transformed testing data')
leaf = bst.predict(data_test, pred_leaf=True)
print(np.array(leaf).shape)
print(leaf[0])

# Every leaf index must fit inside its tree's num_leaf-wide slot. An index
# >= num_leaf would silently land in the next tree's slot (or fall outside the
# matrix for the last tree), so fail loudly instead.
test_leaf_ids = np.asarray(leaf, dtype=np.int64)
if test_leaf_ids.min() < 0 or test_leaf_ids.max() >= num_leaf:
    raise ValueError(
        'leaf index range [%d, %d] does not fit in num_leaf=%d'
        % (test_leaf_ids.min(), test_leaf_ids.max(), num_leaf)
    )

# One row per sample, num_leaf columns per tree: N x (num_trees * num_leaf).
n_test, n_test_trees = test_leaf_ids.shape
transformed_testing_matrix = np.zeros([n_test, n_test_trees * num_leaf], dtype=np.int64)
# Column of tree t / leaf l is t * num_leaf + l; one vectorised assignment
# replaces the per-row Python loop.
test_cols = np.arange(n_test_trees) * num_leaf + test_leaf_ids
transformed_testing_matrix[np.arange(n_test)[:, None], test_cols] = 1
transformed_testing_matrix[0],len(transformed_testing_matrix[0])

Writing transformed testing data
(267, 10)
[45 19 25 56 28 61 40 44 39 19]


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
print('Writing transformed testing data and label')
leaf = bst.predict(data_test, pred_leaf=True)

# Every leaf index must fit inside its tree's num_leaf-wide slot; otherwise it
# would spill into a neighbouring slot or into the label column.
test_leaf = np.asarray(leaf, dtype=np.int64)
if test_leaf.min() < 0 or test_leaf.max() >= num_leaf:
    raise ValueError(
        'leaf index range [%d, %d] does not fit in num_leaf=%d'
        % (test_leaf.min(), test_leaf.max(), num_leaf)
    )

# N x (num_trees * num_leaf + 1); the last column holds the label.
n_test_rows, n_test_row_trees = test_leaf.shape
transformed_testing_data = np.zeros([n_test_rows, n_test_row_trees * num_leaf + 1], dtype=np.int64)
# Column of tree t / leaf l is t * num_leaf + l.
test_hot_cols = np.arange(n_test_row_trees) * num_leaf + test_leaf
transformed_testing_data[np.arange(n_test_rows)[:, None], test_hot_cols] = 1
transformed_testing_data[:, -1] = y_test
transformed_testing_data[0],len(transformed_testing_data[0])

Writing transformed testing data and label


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
# Export the label-augmented one-hot matrices as comma-separated text,
# each value written in "%.1e" scientific notation.
np.savetxt(
    fname="fm_data/xgb_train.txt",
    X=transformed_training_data,
    delimiter=',',
    fmt="%.1e",
)
np.savetxt(
    fname="fm_data/xgb_test.txt",
    X=transformed_testing_data,
    delimiter=',',
    fmt="%.1e",
)