In [42]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification
from lightgbm import early_stopping, log_evaluation

## 步骤4 ：多分类任务
- 使用make_classification，创建一个多分类数据集。
- 使用sklearn接口完成训练和预测。
- 使用原生train接口完成训练和预测。

In [78]:
from sklearn.model_selection import train_test_split as TTS

# Synthetic 5-class problem: 10,000 rows, 20 features (4 informative, 2 redundant).
# Only four weights are given for five classes; scikit-learn infers the last
# class weight from the remainder.
# flip_y=0.4 assigns random labels to 40% of the samples, which limits the
# accuracy any classifier can reach on this data.
data = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=4,
    n_redundant=2,
    n_repeated=0,
    n_classes=5,
    n_clusters_per_class=2,
    weights=[0.05, 0.1, 0.1, 0.5],
    flip_y=0.4,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=420,
)

# Put features and target into one frame so the class balance can be inspected.
features, labels = data
df = pd.DataFrame(features).assign(label=labels)
print(df['label'].value_counts())

# Split into training and validation sets (70% / 30%).
X_train, X_val, y_train, y_val = TTS(
    df.drop(columns='label'),
    df['label'],
    test_size=0.3,
    random_state=42,
)

# The native LightGBM API consumes lgb.Dataset objects rather than raw frames.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_validate = lgb.Dataset(X_val, label=y_val)

from sklearn.metrics import accuracy_score

# --- Native LightGBM API: multiclass classification ---
params_naive = {
    # Tree / boosting shape
    "learning_rate": 0.1,
    "max_bin": 150,
    "num_leaves": 32,
    "max_depth": 11,

    # L1 / L2 regularisation
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,

    # The multiclass objective requires the number of classes explicitly.
    "objective": "multiclass",
    "num_class": 5,

    "verbose": -1,  # <0: fatal only, =0: errors/warnings, >0: info
}

# Train for at most 300 rounds; stop early once the validation metric has not
# improved for 10 rounds, and log the metrics every 10 rounds.
model = lgb.train(
    params=params_naive,
    train_set=lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_train, lgb_validate],
    callbacks=[early_stopping(10), log_evaluation(10)],
)

# Save the model. The file name previously contained a comma
# ('model_classifier_lgb,txt'), which did not match the path used for loading.
model.save_model('model_classifier_lgb.txt')

# Load the model (optional)
# model = lgb.Booster(model_file='model_classifier_lgb.txt')

# For a multiclass objective predict() yields one probability per class for
# every row; the predicted label is the class with the highest probability.
y_pred = np.argmax(model.predict(X_val), axis=1)

# Evaluate the model
print("The accuracy score of prediction is ", accuracy_score(y_val, y_pred))
# Accuracy is rather low! Note that the dataset was generated with flip_y=0.4,
# i.e. 40% of the labels are assigned at random.

# --- scikit-learn style API: multiclass classification ---
# Same settings as the native run, spelled with the sklearn-wrapper names
# (reg_alpha / reg_lambda for the L1 / L2 penalties, n_estimators for the
# number of boosting rounds).
params_sklearn = dict(
    learning_rate=0.1,
    max_bin=150,
    num_leaves=32,
    max_depth=11,
    reg_alpha=0.1,
    reg_lambda=0.2,
    objective='multiclass',
    n_estimators=300,
    verbose=-1,
)

clf = lgb.LGBMClassifier(**params_sklearn)

# Evaluate on both the training and the validation split while fitting.
watchlist = [(X_train, y_train), (X_val, y_val)]
clf.fit(
    X_train,
    y_train,
    eval_set=watchlist,
    callbacks=[early_stopping(10), log_evaluation(10)],
)

# Saving / loading is left disabled:
# clf.save_model("model_classifier_sklearn.txt")
# clf = lgb.Booster('model_classifier_sklearn.txt')
# NOTE(review): confirm that the sklearn wrapper itself offers save_model
# before enabling the lines above.

# Predict class labels for the validation split.
y_pred_sklearn = clf.predict(X_val)

# Evaluate the model
print("The accuracy score of prediction is ", accuracy_score(y_val, y_pred_sklearn))

## 步骤3 ：二分类任务
- 使用make_classification，创建一个二分类数据集。
- 使用sklearn接口完成训练和预测。
- 使用原生train接口完成训练和预测。

In [96]:
from sklearn.model_selection import train_test_split as TTS

# Synthetic binary problem: 10,000 rows, 20 features of which 2 are informative.
# No class weights are passed, so the generator's default class balance applies.
# flip_y=0.4 assigns random labels to 40% of the samples.
TwoClassData = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    flip_y=0.4,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=420,
)

# Put features and target into one frame so the class balance can be inspected.
binary_features, binary_labels = TwoClassData
TDF = pd.DataFrame(binary_features).assign(label=binary_labels)
print(TDF['label'].value_counts())

# Split into training and validation sets (70% / 30%).
X_train, X_val, y_train, y_val = TTS(
    TDF.drop(columns='label'),
    TDF['label'],
    test_size=0.3,
    random_state=42,
)

# The native LightGBM API consumes lgb.Dataset objects rather than raw frames.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_validate = lgb.Dataset(X_val, label=y_val)

from sklearn.metrics import accuracy_score

# --- Native LightGBM API: binary classification ---
params_naive = {
    # Tree / boosting shape
    "learning_rate": 0.1,
    "max_bin": 150,
    "num_leaves": 32,
    "max_depth": 11,

    # L1 / L2 regularisation
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,

    # Binary objective: num_class must not be set here.
    "objective": "binary",

    "verbose": -1,  # <0: fatal only, =0: errors/warnings, >0: info
}

# Train for at most 300 rounds; stop early once the validation metric has not
# improved for 10 rounds, and log the metrics every 10 rounds.
model = lgb.train(
    params=params_naive,
    train_set=lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_train, lgb_validate],
    callbacks=[early_stopping(10), log_evaluation(10)],
)

# Save the model. The file name previously contained a comma
# ('model_classifier_lgb,txt'), which did not match the path used for loading.
model.save_model('model_classifier_lgb.txt')

# Load the model (optional)
# model = lgb.Booster(model_file='model_classifier_lgb.txt')

# With the binary objective predict() gives the positive-class probability per
# row, so no flattening is needed; threshold at 0.5 to get hard 0/1 labels.
y_pred = np.where(model.predict(X_val) > 0.5, 1, 0)

# Evaluate the model
print("The accuracy score of prediction is ", accuracy_score(y_val, y_pred))
# Accuracy is rather low! Note that the dataset was generated with flip_y=0.4,
# i.e. 40% of the labels are assigned at random.

# --- scikit-learn style API: binary classification ---
# Same settings as the native run, spelled with the sklearn-wrapper names
# (reg_alpha / reg_lambda for the L1 / L2 penalties, n_estimators for the
# number of boosting rounds).
params_sklearn = dict(
    learning_rate=0.1,
    max_bin=150,
    num_leaves=32,
    max_depth=11,
    reg_alpha=0.1,
    reg_lambda=0.2,
    objective='binary',
    n_estimators=300,
    verbose=-1,
)

clf = lgb.LGBMClassifier(**params_sklearn)

# Evaluate on both the training and the validation split while fitting.
watchlist = [(X_train, y_train), (X_val, y_val)]
clf.fit(
    X_train,
    y_train,
    eval_set=watchlist,
    callbacks=[early_stopping(10), log_evaluation(10)],
)

# Saving / loading is left disabled:
# clf.save_model("model_classifier_sklearn.txt")
# clf = lgb.Booster('model_classifier_sklearn.txt')
# NOTE(review): confirm that the sklearn wrapper itself offers save_model
# before enabling the lines above.

# Predict class labels for the validation split.
y_pred_sklearn = clf.predict(X_val)

# Evaluate the model
print("The accuracy score of prediction is ", accuracy_score(y_val, y_pred_sklearn))

## 步骤5 ：回归任务
- 使用make_regression，创建一个回归数据集。
- 使用sklearn接口完成训练和预测。
- 使用原生train接口完成训练和预测。

In [112]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 10,000 rows, 20 features, a single target
# with Gaussian noise (std 1.5).
X, Y = make_regression(n_samples=10000, n_features=20, n_targets=1, noise=1.5, random_state=420)

X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

# Split into training and validation sets (70% / 30%).
# The training target must be bound to `y_train`: the training code below reads
# that name, and binding it to `y_test` left `y_train` pointing at the labels of
# the previous (binary classification) cell, which have the same length.
X_train, X_val, y_train, y_val = TTS(X, Y, test_size=0.3, random_state=420, shuffle=True)

from sklearn.metrics import mean_absolute_error as mae

# --- Native LightGBM API: regression ---
# The number of boosting rounds is passed only through `num_boost_round` below.
# Previously the params also carried 'n_estimators': 2500 while the call passed
# num_boost_round=300; LightGBM honours the alias found in params, so 2500 was
# the effective value. It is now stated once, explicitly.
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'n_jobs': 8,
    # Row subsampling: half of the rows, re-drawn every iteration.
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.01,
    'num_leaves': 2**11 - 1,
    # NOTE(review): 4095 rows per leaf is more than half of the 7,000 training
    # rows produced by the 70/30 split of 10,000 samples, so it looks like no
    # split can satisfy this constraint — confirm the value is intended.
    'min_data_in_leaf': 2**12 - 1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'boost_from_average': False,
    "random_seed": 420,
    "verbose": -1,  # <0: fatal only, =0: errors/warnings, >0: info
}

lgb_train = lgb.Dataset(X_train, y_train)
lgb_validate = lgb.Dataset(X_val, y_val)

# Stop early once the validation metric has not improved for 10 rounds;
# log the metrics every 10 rounds.
reg_lgb = lgb.train(
    params=params_lgb,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_validate],
    num_boost_round=2500,
    callbacks=[early_stopping(10), log_evaluation(10)],
)

# Save the model
reg_lgb.save_model('reg_lgb.txt')

# Load the model (optional)
# reg_lgb = lgb.Booster('reg_lgb.txt')

# Predict on the validation split
y_pred = reg_lgb.predict(X_val)

# Evaluate the model. The value is the mean absolute error; it was previously
# printed as "mse" after taking a square root, which matches neither metric.
print("The MAE of the model is", mae(y_val, y_pred))

# --- scikit-learn style API: regression ---
params_sklearn = {
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'n_jobs': 8,
    # Row subsampling: half of the rows, re-drawn every iteration.
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.01,
    'num_leaves': 2**11 - 1,
    # NOTE(review): 4095 rows per leaf is more than half of the 7,000 training
    # rows produced by the 70/30 split of 10,000 samples, so it looks like no
    # split can satisfy this constraint — confirm the value is intended.
    'min_data_in_leaf': 2**12 - 1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 2500,
    'boost_from_average': False,
    "random_seed": 420,
    "verbose": -1,  # <0: fatal only, =0: errors/warnings, >0: info
}
reg_sklearn = lgb.LGBMRegressor(**params_sklearn)

# Stop early once the validation metric has not improved for 100 rounds;
# log the metrics every 100 rounds.
reg_sklearn.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(100), log_evaluation(100)],
)
y_pred_sklearn = reg_sklearn.predict(X_val)

# Evaluate the model. The value is the mean absolute error; it was previously
# printed as "mse" after taking a square root, which matches neither metric.
print("The MAE of the model is", mae(y_val, y_pred_sklearn))