In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings("ignore")


Mounted at /content/drive


In [4]:
import lightgbm as lgb

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [5]:
data = pd.read_csv("/content/drive/My Drive/6893project/data/merged.csv")
NUMERIC_COLS = [        
        'energy',
        'key',
        'loudness',
        'speechiness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo'
]
data_x = data[NUMERIC_COLS]
data_y = data['target']
X_train, X_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=0.2, random_state=42)

In [7]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

400
100
400
100


In [9]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves,will be used in feature transformation
num_leaf = 64

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('/content/drive/My Drive/6893project/code/gbdt.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)
print(y_pred[:10])

print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1

y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1


Start training...
[1]	training's binary_logloss: 0.31865
[2]	training's binary_logloss: 0.3179
[3]	training's binary_logloss: 0.317055
[4]	training's binary_logloss: 0.316223
[5]	training's binary_logloss: 0.315418
[6]	training's binary_logloss: 0.314614
[7]	training's binary_logloss: 0.313803
[8]	training's binary_logloss: 0.31307
[9]	training's binary_logloss: 0.312279
[10]	training's binary_logloss: 0.311563
[11]	training's binary_logloss: 0.310552
[12]	training's binary_logloss: 0.309607
[13]	training's binary_logloss: 0.308771
[14]	training's binary_logloss: 0.307959
[15]	training's binary_logloss: 0.307156
[16]	training's binary_logloss: 0.306312
[17]	training's binary_logloss: 0.305491
[18]	training's binary_logloss: 0.304693
[19]	training's binary_logloss: 0.303694
[20]	training's binary_logloss: 0.303031
[21]	training's binary_logloss: 0.302149
[22]	training's binary_logloss: 0.30131
[23]	training's binary_logloss: 0.300255
[24]	training's binary_logloss: 0.299502
[25]	trainin

In [10]:
lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label

print(y_pred_test)

NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

[[0.94793591 0.05206409]
 [0.98463745 0.01536255]
 [0.9676838  0.0323162 ]
 [0.97153896 0.02846104]
 [0.97559565 0.02440435]
 [0.98644639 0.01355361]
 [0.98037552 0.01962448]
 [0.95856307 0.04143693]
 [0.94335189 0.05664811]
 [0.85328831 0.14671169]
 [0.90693626 0.09306374]
 [0.85993497 0.14006503]
 [0.96007916 0.03992084]
 [0.97065377 0.02934623]
 [0.98554175 0.01445825]
 [0.91187782 0.08812218]
 [0.97341373 0.02658627]
 [0.83126143 0.16873857]
 [0.96338801 0.03661199]
 [0.94920724 0.05079276]
 [0.63031071 0.36968929]
 [0.98644639 0.01355361]
 [0.62751535 0.37248465]
 [0.96794797 0.03205203]
 [0.95369478 0.04630522]
 [0.85993497 0.14006503]
 [0.70232961 0.29767039]
 [0.95339186 0.04660814]
 [0.83918815 0.16081185]
 [0.95333779 0.04666221]
 [0.83806058 0.16193942]
 [0.95500191 0.04499809]
 [0.97059082 0.02940918]
 [0.99300885 0.00699115]
 [0.91868185 0.08131815]
 [0.98198822 0.01801178]
 [0.89362278 0.10637722]
 [0.92392428 0.07607572]
 [0.98051233 0.01948767]
 [0.98589794 0.01410206]
