In [3]:
import lightgbm as lgb

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split

In [6]:
# print('Load data...')
# df_train = pd.read_csv('data/train.csv')
# df_test = pd.read_csv('data/test.csv')

# NUMERIC_COLS = [
#     "ps_reg_01", "ps_reg_02", "ps_reg_03",
#     "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# ]

# print(df_test.head(10))

data = load_svmlight_file('data/train.txt')

X,y=data[0],data[1]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3,random_state=42)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation
num_leaf = 64

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)
print("y_pred",y_pred)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
# transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
#                                        dtype=np.int64)  # N * num_tress * num_leafs

transformed_training_matrix = np.zeros([len(y_pred), num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1


y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1


lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label

print(y_pred_test)

NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

Start training...




[1]	training's binary_logloss: 34.5388
[2]	training's binary_logloss: 34.5388
[3]	training's binary_logloss: 34.5388
[4]	training's binary_logloss: 34.5388
[5]	training's binary_logloss: 34.5388
[6]	training's binary_logloss: 34.5388
[7]	training's binary_logloss: 34.5388
[8]	training's binary_logloss: 34.5388
[9]	training's binary_logloss: 34.5388
[10]	training's binary_logloss: 34.5388
[11]	training's binary_logloss: 34.5388
[12]	training's binary_logloss: 34.5388
[13]	training's binary_logloss: 34.5388
[14]	training's binary_logloss: 34.5388
[15]	training's binary_logloss: 34.5388
[16]	training's binary_logloss: 34.5388
[17]	training's binary_logloss: 34.5388
[18]	training's binary_logloss: 34.5388
[19]	training's binary_logloss: 34.5388
[20]	training's binary_logloss: 34.5388
[21]	training's binary_logloss: 34.5388
[22]	training's binary_logloss: 34.5388
[23]	training's binary_logloss: 34.5388
[24]	training's binary_logloss: 34.5388
[25]	training's binary_logloss: 34.5388
[26]	trai

TypeError: object of type 'numpy.int32' has no len()

In [5]:
y_train.shape

(314581,)