In [6]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import matplotlib.pyplot as plt

In [7]:
# Source of the white-wine quality table (UCI Machine Learning Repository).
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# The file is semicolon-delimited, so the separator is passed explicitly.
df = pd.read_csv(
    filepath_or_buffer=data_url,
    sep=';',
)

# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [8]:
# Train a LightGBM multiclass classifier on the wine-quality table and report
# its accuracy on a held-out split.
#
# Reads:  df (must contain a 'quality' column; every other column is a feature)
# Writes: X, y, X_train, X_test, y_train, y_test, train_data, test_data,
#         params, verbose_eval, model, y_pred, y_pred_class, accuracy
# After the cell: y and y_train hold shifted labels (quality - QUALITY_MIN);
# y_test and y_pred_class hold original quality grades.

QUALITY_MIN = 3  # offset subtracted from 'quality' so class labels start at 0
NUM_CLASSES = 7  # number of classes given to LightGBM (labels 0..NUM_CLASSES-1)

# Split into features and labels.
X = df.drop('quality', axis=1)

# Shift the labels into a NEW Series. The previous in-place `y -= 3` operated on
# the Series returned by df['quality']; without pandas copy-on-write that may
# write through to `df`, so re-running the cell would shift the labels again.
y = df['quality'] - QUALITY_MIN

# Fail early with a readable message if the labels do not fit the declared
# class count, instead of surfacing an error from inside the training call.
if not y.between(0, NUM_CLASSES - 1).all():
    raise ValueError(
        f"'quality' - {QUALITY_MIN} must lie in [0, {NUM_CLASSES - 1}]; "
        f"got range [{y.min()}, {y.max()}]"
    )

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits as LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# LightGBM hyperparameters.
params = {
    'objective': 'multiclass',  # multiclass classification
    'num_class': NUM_CLASSES,   # number of classes
    'metric': 'multi_logloss',  # evaluate with multiclass log loss
}
verbose_eval = 1  # logging period in boosting rounds; 1 logs every round

# NOTE(review): early stopping below monitors the same split that is later used
# for the reported accuracy, so that accuracy is likely optimistic. Consider
# carving a separate validation split out of the training data.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=[
        # Stop once the validation metric has not improved for 10 rounds.
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        # Print the evaluation metrics every `verbose_eval` rounds.
        lgb.log_evaluation(verbose_eval),
    ],
)

# Predict on the held-out rows; argmax over axis 1 picks one class per row,
# and adding the offset maps the class index back to a quality grade.
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1) + QUALITY_MIN

# Map the held-out labels back to quality grades by rebinding to a new Series.
# An in-place `+=` would also alter the label object already handed to
# `test_data` and would shift again on a partial re-run.
y_test = y_test + QUALITY_MIN

# Evaluate accuracy on the original quality scale.
accuracy = accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1313
[LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11
[LightGBM] [Info] Start training from score -5.565286
[LightGBM] [Info] Start training from score -3.346083
[LightGBM] [Info] Start training from score -1.212002
[LightGBM] [Info] Start training from score -0.796864
[LightGBM] [Info] Start training from score -1.739548
[LightGBM] [Info] Start training from score -3.331694
[LightGBM] [Info] Start training from score -6.663899
[1]	training's multi_logloss: 1.18671	valid_1's multi_logloss: 1.22753
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 1.11397	valid_1's multi_logloss: 1.18303
[3]	training's multi_logloss: 1.05689	valid_1's multi_logloss: 1.15247
[4]	training's multi_logloss: 1.00895	valid_1's multi_logloss: 1.12565
[5]	training's multi_logloss: