In [54]:
import lightgbm as lgb

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error, precision_score,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.preprocessing import PolynomialFeatures


In [34]:
data = load_svmlight_file('data/demo-g1.txt')

X,y=data[0],data[1]

X_s,y_s=resample(X,y,n_samples=50000,replace=False)
X_train,X_test,y_train,y_test=train_test_split(X_s,y_s,test_size=.3,random_state=42)

In [51]:
lr = LogisticRegression(C=10,max_iter=1000)
lr.fit(X_train,y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [53]:
y_pred_test=lr.predict(X_test)
print(classification_report(y_test,y_pred_test))
print(confusion_matrix(y_test,y_pred_test).T)

              precision    recall  f1-score   support

         1.0       0.66      0.85      0.74      8587
         2.0       0.68      0.41      0.51      6413

   micro avg       0.66      0.66      0.66     15000
   macro avg       0.67      0.63      0.63     15000
weighted avg       0.67      0.66      0.64     15000

[[7318 3777]
 [1269 2636]]


In [56]:
rf_en=RandomTreesEmbedding(300,n_jobs=-1)
X_train_em=rf_en.fit_transform(X_train)

In [None]:
poly = PolynomialFeatures()
X_train_poly=poly.fit_transform(X_train_em)
X_test_poly = poly.transform(X_test)
lr.fit(X_train_poly,y_train)
y_pred_test_poly=lr.predict(X_test_poly)
print(classification_report(y_test,y_pred_test_poly))
print(confusion_matrix(y_test,y_pred_test_poly).T)

In [49]:
lr_em = LogisticRegression(C=10,max_iter=1000)
lr_em.fit(X_train_em,y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [52]:
y_pred_train=lr.predict(X_train)
print(classification_report(y_train,y_pred_train))
print(confusion_matrix(y_train,y_pred_train).T)

              precision    recall  f1-score   support

         1.0       0.68      0.88      0.77     19705
         2.0       0.75      0.46      0.57     15295

   micro avg       0.70      0.70      0.70     35000
   macro avg       0.71      0.67      0.67     35000
weighted avg       0.71      0.70      0.68     35000

[[17373  8277]
 [ 2332  7018]]


In [50]:
y_pred_train_em=lr_em.predict(X_train_em)
print(classification_report(y_train,y_pred_train_em))
print(confusion_matrix(y_train,y_pred_train_em).T)

              precision    recall  f1-score   support

         1.0       0.60      0.97      0.75     19705
         2.0       0.84      0.18      0.30     15295

   micro avg       0.63      0.63      0.63     35000
   macro avg       0.72      0.58      0.52     35000
weighted avg       0.71      0.63      0.55     35000

[[19184 12543]
 [  521  2752]]


In [42]:
X_test_em=rf_en.transform(X_test)
y_pred_test_em=lr_em.predict(X_test_em)
print(classification_report(y_test,y_pred_test_em))
print(confusion_matrix(y_test,y_pred_test_em).T)

              precision    recall  f1-score   support

         1.0       0.59      0.95      0.73      8587
         2.0       0.64      0.12      0.20      6413

   micro avg       0.59      0.59      0.59     15000
   macro avg       0.62      0.53      0.46     15000
weighted avg       0.61      0.59      0.50     15000

[[8157 5649]
 [ 430  764]]


In [43]:
X_test.shape, X_test_em.shape

((15000, 18338), (15000, 9531))

In [None]:
X_test_imp_en=rf_en_new.transform(X_test_imp)
y_pred_test_imp_en=lr_en.predict(X_test_imp_en)
print(classification_report(y_test,y_pred_test_imp_en))
print(confusion_matrix(y_test,y_pred_test_imp_en))

In [26]:
length = len(rf_en.feature_importances_)
index = range(length)
important_features=sorted(zip(index,rf_en.feature_importances_),key=lambda x:x[1],reverse=True)[:500]
feature_index = list(map(lambda x:x[0],important_features))
X_imp=X[:,feature_index]

In [28]:
X_train_imp,X_test_imp,y_train,y_test=train_test_split(X_imp,y,test_size=.3,random_state=42)

In [30]:
rf_en_new=RandomTreesEmbedding(100,n_jobs=-1)
X_train_imp_en=rf_en_new.fit_transform(X_train_imp,y_train)

In [31]:
lr_en = LogisticRegression()
lr_en.fit(X_train_imp_en,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
X_test_imp_en=rf_en_new.transform(X_test_imp)
y_pred_test_imp_en=lr_en.predict(X_test_imp_en)
print(classification_report(y_test,y_pred_test_imp_en))
print(confusion_matrix(y_test,y_pred_test_imp_en))

              precision    recall  f1-score   support

         1.0       0.57      0.97      0.72    118527
         2.0       0.56      0.04      0.08     91479

   micro avg       0.57      0.57      0.57    210006
   macro avg       0.56      0.51      0.40    210006
weighted avg       0.56      0.57      0.44    210006

[[115341   3186]
 [ 87470   4009]]


In [19]:
X_test_en.shape,X_test.shape

((15000, 9458), (15000, 18338))

In [5]:
# print('Load data...')
# df_train = pd.read_csv('data/train.csv')
# df_test = pd.read_csv('data/test.csv')

# NUMERIC_COLS = [
#     "ps_reg_01", "ps_reg_02", "ps_reg_03",
#     "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# ]

# print(df_test.head(10))



# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation
num_leaf = 64

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)
print("y_pred",y_pred)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
# transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
#                                        dtype=np.int64)  # N * num_tress * num_leafs

transformed_training_matrix = np.zeros([len(y_pred), num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1


y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1


lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label

print(y_pred_test)

NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

Start training...




[1]	training's binary_logloss: 34.5388
[2]	training's binary_logloss: 34.5388
[3]	training's binary_logloss: 34.5388
[4]	training's binary_logloss: 34.5388
[5]	training's binary_logloss: 34.5388
[6]	training's binary_logloss: 34.5388
[7]	training's binary_logloss: 34.5388
[8]	training's binary_logloss: 34.5388
[9]	training's binary_logloss: 34.5388
[10]	training's binary_logloss: 34.5388
[11]	training's binary_logloss: 34.5388
[12]	training's binary_logloss: 34.5388
[13]	training's binary_logloss: 34.5388
[14]	training's binary_logloss: 34.5388
[15]	training's binary_logloss: 34.5388
[16]	training's binary_logloss: 34.5388
[17]	training's binary_logloss: 34.5388
[18]	training's binary_logloss: 34.5388
[19]	training's binary_logloss: 34.5388
[20]	training's binary_logloss: 34.5388
[21]	training's binary_logloss: 34.5388
[22]	training's binary_logloss: 34.5388
[23]	training's binary_logloss: 34.5388
[24]	training's binary_logloss: 34.5388
[25]	training's binary_logloss: 34.5388
[26]	trai

TypeError: object of type 'numpy.int32' has no len()

In [5]:
y_train.shape

(314581,)