In [1]:
import lightgbm
import pkgutil
import os
import importlib, inspect
import warnings
warnings.filterwarnings("ignore")
import pickle

save_dir = "./models/lightgbm/{0}".format(lightgbm.__version__)
# print(save_dir)
os.mkdir(save_dir)

FileExistsError: [Errno 17] File exists: './models/lightgbm/3.3.5'

In [2]:
lightgbm.__all__

['Dataset',
 'Booster',
 'CVBooster',
 'Sequence',
 'register_logger',
 'train',
 'cv',
 'LGBMModel',
 'LGBMRegressor',
 'LGBMClassifier',
 'LGBMRanker',
 'DaskLGBMRegressor',
 'DaskLGBMClassifier',
 'DaskLGBMRanker',
 'log_evaluation',
 'print_evaluation',
 'record_evaluation',
 'reset_parameter',
 'early_stopping',
 'plot_importance',
 'plot_split_value_histogram',
 'plot_metric',
 'plot_tree',
 'create_tree_digraph']

# Generate lightgbm model samples for rocket

## Binary Classification

In [3]:
compas_training = pickle.load(open("./data/pickle_pandas_tabular_compas_training.sav", "rb"))
compas_testing = pickle.load(open("./data/pickle_pandas_tabular_compas_testing.sav", "rb"))

X_train = compas_training.drop("two_year_recid", axis=1)
y_train = compas_training[["two_year_recid"]]
X_test = compas_testing.drop("two_year_recid", axis=1)
y_test = compas_testing[["two_year_recid"]]

### Booster - binary

In [4]:
dtrain = lightgbm.Dataset(X_train, label=y_train)
dtest = lightgbm.Dataset(X_test, label=y_test)

param = {
    'num_leaves': 31, 
    'objective': 'binary'
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/binary_classification_compas_binary_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

[LightGBM] [Info] Number of positive: 2244, number of negative: 2693
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44
[LightGBM] [Info] Number of data points in the train set: 4937, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.454527 -> initscore=-0.182396
[LightGBM] [Info] Start training from score -0.182396
type: <class 'lightgbm.basic.Booster'>


### Booster - cross-entropy

In [5]:
param = {
    'num_leaves': 31, 
    'objective': 'cross-entropy'
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/binary_classification_compas_crossentropy_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))

[LightGBM] [Fatal] Unknown objective type name: cross-entropy


LightGBMError: Unknown objective type name: cross-entropy

### LGBMClassifier

In [6]:
classifier = lightgbm.LGBMClassifier().fit(X_train, y_train)  # train using booster class
predictions = classifier.predict(X_test)
print("type: {0}".format(type(classifier)))
print("score: {0}".format(classifier.score(X_test, y_test)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.sklearn.LGBMClassifier\n")
pickle.dump(classifier, open("{0}/binary_classification_compas_lightgbm.sklearn.LGBMClassifier.sav".format(save_dir), "wb+"))

type: <class 'lightgbm.sklearn.LGBMClassifier'>
score: 0.6647773279352227


### DaskLGBMClassifier

In [4]:
import dask, pandas
classifier = lightgbm.DaskLGBMClassifier().fit(X_train, y_train)  # train using booster class
predictions = classifier.predict(X_test)
print("type: {0}".format(type(classifier)))
print("score: {0}".format(classifier.score(X_test, y_test)))

# with open("algorithm.txt", "a+") as f:
#     f.write("lightgbm.sklearn.LGBMClassifier\n")
# pickle.dump(classifier, open("./models/binary_classification_compas_lightgbm.sklearn.LGBMClassifier.sav", "wb+"))

LightGBMError: dask, pandas and scikit-learn are required for lightgbm.dask

## Multiclass Classification

In [7]:
loan_training = pickle.load(open("./data/pickle_pandas_tabular_loan_training.sav", "rb")).sample(5000)
loan_testing = pickle.load(open("./data/pickle_pandas_tabular_loan_testing.sav", "rb")).sample(2000)

multiclass_X_train = loan_training.drop("Interest_Rate", axis=1)
multiclass_y_train = loan_training[["Interest_Rate"]]
multiclass_X_test = loan_testing.drop("Interest_Rate", axis=1)
multiclass_y_test = loan_testing[["Interest_Rate"]]

### Booster - softmax

In [8]:
dtrain = lightgbm.Dataset(multiclass_X_train, label=multiclass_y_train)
dtest = lightgbm.Dataset(multiclass_X_test, label=multiclass_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'softmax',
    'num_class': 3,
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/multiclass_classification_loan_softmax_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 5000, number of used features: 12
[LightGBM] [Info] Start training from score -1.589635
[LightGBM] [Info] Start training from score -0.853786
[LightGBM] [Info] Start training from score -0.993712
type: <class 'lightgbm.basic.Booster'>


### Booster - ova

In [9]:
dtrain = lightgbm.Dataset(multiclass_X_train, label=multiclass_y_train)
dtest = lightgbm.Dataset(multiclass_X_test, label=multiclass_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'ova',
    'num_class': 3,
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/multiclass_classification_loan_onevsall_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

[LightGBM] [Info] Number of positive: 1020, number of negative: 3980
[LightGBM] [Info] Number of positive: 2129, number of negative: 2871
[LightGBM] [Info] Number of positive: 1851, number of negative: 3149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 5000, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.204000 -> initscore=-1.361479
[LightGBM] [Info] Start training from score -1.361479
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.425800 -> initscore=-0.299008
[LightGBM] [Info] Start training from score -0.299008
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370200 -> initscore=-0.531359
[LightGBM] [Info] Start training from score -0.531359
type: <class 'lightgbm.basic.Booster'>


### LGBMClassifier

In [10]:
classifier = lightgbm.LGBMClassifier().fit(multiclass_X_train, multiclass_y_train)  # train using booster class
predictions = classifier.predict(multiclass_X_test)
print("type: {0}".format(type(classifier)))
print("score: {0}".format(classifier.score(multiclass_X_test, multiclass_y_test)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.sklearn.LGBMClassifier\n")
pickle.dump(classifier, open("{0}/multiclass_classification_loan_lightgbm.sklearn.LGBMClassifier.sav".format(save_dir), "wb+"))

type: <class 'lightgbm.sklearn.LGBMClassifier'>
score: 0.5035


## Regression

In [11]:
insurance_training = pickle.load(open("./data/pickle_pandas_tabular_insurance_training.sav", "rb"))
insurance_testing = pickle.load(open("./data/pickle_pandas_tabular_insurance_testing.sav", "rb"))
insurance_testing.describe()

regression_X_train = insurance_training.drop("charges", axis=1)
regression_y_train = insurance_training[["charges"]]
regression_X_test = insurance_testing.drop("charges", axis=1)
regression_y_test = insurance_testing[["charges"]]

### Booster - regression

In [12]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'regression',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_mse_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 13296.106303
type: <class 'lightgbm.basic.Booster'>


### Booster - regression_l1

In [13]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'regression_l1',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_mae_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 9301.893555
type: <class 'lightgbm.basic.Booster'>


### Booster - Huber loss

In [14]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'huber',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_huber_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 13296.106303
type: <class 'lightgbm.basic.Booster'>


### Booster - Fair

In [15]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'fair',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_fair_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 13296.106303
type: <class 'lightgbm.basic.Booster'>


### Booster - Poisson

In [16]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'poisson',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_poisson_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 9.495227
type: <class 'lightgbm.basic.Booster'>


### Booster - Quantile

In [17]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'quantile',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_quantile_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 35069.371094
type: <class 'lightgbm.basic.Booster'>


### Booster - MAPE

In [18]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'mape',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{0}/regression_insurance_mape_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 3756.621582
type: <class 'lightgbm.basic.Booster'>


### Booster - Gamma

In [22]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'gamma',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("{/regression_insurance_gamma_lightgbm.basic.Booster.sav".format(save_dir), "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 9.495227
type: <class 'lightgbm.basic.Booster'>


### Booster - Tweedie

In [23]:
dtrain = lightgbm.Dataset(regression_X_train, label=regression_y_train)
dtest = lightgbm.Dataset(regression_X_test, label=regression_y_test)

param = {
    'num_leaves': 31, 
    'objective': 'tweedie',
}

num_round = 50
bst = lightgbm.train(param, dtrain, num_round)
print("type: {0}".format(type(bst)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.basic.Booster\n")

pickle.dump(bst, open("./models/regression_insurance_tweedie_lightgbm.basic.Booster.sav", "wb+"))
# bst.save_model("test.txt")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 6
[LightGBM] [Info] Start training from score 9.495227
type: <class 'lightgbm.basic.Booster'>


### LGBMRegressor

In [24]:
regressor = lightgbm.LGBMRegressor().fit(regression_X_train, regression_y_train)  # train using booster class
predictions = regressor.predict(regression_X_test)
print("type: {0}".format(type(regressor)))
print("score: {0}".format(regressor.score(regression_X_test, regression_y_test)))

with open("algorithm.txt", "a+") as f:
    f.write("lightgbm.sklearn.LGBMRegressor\n")
pickle.dump(regressor, open("./models/regression_insurance_lightgbm.sklearn.LGBMRegressor.sav", "wb+"))

type: <class 'lightgbm.sklearn.LGBMRegressor'>
score: 0.800133848542777
