[ci] [python-package] enable ruff-format on tests and examples (#6317)
jameslamb committed Feb 21, 2024
1 parent b60068c commit 1b792e7
Showing 30 changed files with 3,230 additions and 3,857 deletions.
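
Almost all of the diff below is mechanical restyling: ruff-format rewrites single-quoted strings as double-quoted ones, adds trailing commas, and either collapses argument lists onto one line or expands them one-per-line. A minimal, self-contained sketch of the pattern (illustrative only, not code from this commit):

    # before this commit: single quotes, hand-aligned continuation lines
    greeting = 'hello'
    total = sum([1,
                 2,
                 3])

    # after ruff-format: double quotes, one line when it fits
    greeting = "hello"
    total = sum([1, 2, 3])

    print(greeting, total)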
14 changes: 8 additions & 6 deletions .pre-commit-config.yaml
@@ -7,19 +7,21 @@ exclude: |
   )$
 repos:
-  - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        name: isort (python)
-        args: ["--settings-path", "python-package/pyproject.toml"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.2.1
     hooks:
       # Run the linter.
       - id: ruff
         args: ["--config", "python-package/pyproject.toml"]
+        types_or: [python, jupyter]
       # Run the formatter.
       - id: ruff-format
         args: ["--config", "python-package/pyproject.toml"]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--settings-path", "python-package/pyproject.toml"]
+        types_or: [python, jupyter]
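
The types_or: [python, jupyter] lines are what extend these hooks beyond .py files: pre-commit passes a file to a hook when it matches any of the listed types, so notebooks under examples/ are covered as well. A conceptual sketch of that filter (a simplification for illustration, not pre-commit's actual implementation, which tags files by more than their suffix):

    # Conceptual sketch only: roughly what types_or: [python, jupyter] selects.
    # A file reaches the hook if it matches ANY listed type; suffix matching
    # approximates pre-commit's richer file tagging.
    from pathlib import Path

    def matches_types_or(path, suffixes=(".py", ".ipynb")):
        return Path(path).suffix in suffixes

    candidates = ["examples/demo.py", "examples/demo.ipynb", "docs/guide.md"]
    print([p for p in candidates if matches_types_or(p)])
    # -> ['examples/demo.py', 'examples/demo.ipynb']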
180 changes: 88 additions & 92 deletions examples/python-guide/advanced_example.py
@@ -10,13 +10,13 @@

 import lightgbm as lgb

-print('Loading data...')
+print("Loading data...")
 # load or create your dataset
-binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
-df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
-df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
-W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
-W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
+binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
+df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
+df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
+W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
+W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]

 y_train = df_train[0]
 y_test = df_test[0]
@@ -27,72 +27,72 @@

 # create dataset for lightgbm
 # if you want to re-use data, remember to set free_raw_data=False
-lgb_train = lgb.Dataset(X_train, y_train,
-                        weight=W_train, free_raw_data=False)
-lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
-                       weight=W_test, free_raw_data=False)
+lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
+lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)

 # specify your configurations as a dict
 params = {
-    'boosting_type': 'gbdt',
-    'objective': 'binary',
-    'metric': 'binary_logloss',
-    'num_leaves': 31,
-    'learning_rate': 0.05,
-    'feature_fraction': 0.9,
-    'bagging_fraction': 0.8,
-    'bagging_freq': 5,
-    'verbose': 0
+    "boosting_type": "gbdt",
+    "objective": "binary",
+    "metric": "binary_logloss",
+    "num_leaves": 31,
+    "learning_rate": 0.05,
+    "feature_fraction": 0.9,
+    "bagging_fraction": 0.8,
+    "bagging_freq": 5,
+    "verbose": 0,
 }

 # generate feature names
-feature_name = [f'feature_{col}' for col in range(num_feature)]
+feature_name = [f"feature_{col}" for col in range(num_feature)]

-print('Starting training...')
+print("Starting training...")
 # feature_name and categorical_feature
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                valid_sets=lgb_train,  # eval training data
-                feature_name=feature_name,
-                categorical_feature=[21])
-
-print('Finished first 10 rounds...')
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    valid_sets=lgb_train,  # eval training data
+    feature_name=feature_name,
+    categorical_feature=[21],
+)
+
+print("Finished first 10 rounds...")
 # check feature name
-print(f'7th feature name is: {lgb_train.feature_name[6]}')
+print(f"7th feature name is: {lgb_train.feature_name[6]}")

-print('Saving model...')
+print("Saving model...")
 # save model to file
-gbm.save_model('model.txt')
+gbm.save_model("model.txt")

-print('Dumping model to JSON...')
+print("Dumping model to JSON...")
 # dump model to JSON (and save to file)
 model_json = gbm.dump_model()

-with open('model.json', 'w+') as f:
+with open("model.json", "w+") as f:
     json.dump(model_json, f, indent=4)

 # feature names
-print(f'Feature names: {gbm.feature_name()}')
+print(f"Feature names: {gbm.feature_name()}")

 # feature importances
-print(f'Feature importances: {list(gbm.feature_importance())}')
+print(f"Feature importances: {list(gbm.feature_importance())}")

-print('Loading model to predict...')
+print("Loading model to predict...")
 # load model to predict
-bst = lgb.Booster(model_file='model.txt')
+bst = lgb.Booster(model_file="model.txt")
 # can only predict with the best iteration (or the saving iteration)
 y_pred = bst.predict(X_test)
 # eval with loaded model
 auc_loaded_model = roc_auc_score(y_test, y_pred)
 print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")

-print('Dumping and loading model with pickle...')
+print("Dumping and loading model with pickle...")
 # dump model with pickle
-with open('model.pkl', 'wb') as fout:
+with open("model.pkl", "wb") as fout:
     pickle.dump(gbm, fout)
 # load model with pickle to predict
-with open('model.pkl', 'rb') as fin:
+with open("model.pkl", "rb") as fin:
     pkl_bst = pickle.load(fin)
 # can predict with any iteration when loaded in pickle way
 y_pred = pkl_bst.predict(X_test, num_iteration=7)
@@ -104,46 +104,46 @@
 # init_model accepts:
 # 1. model file name
 # 2. Booster()
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model='model.txt',
-                valid_sets=lgb_eval)
+gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)

-print('Finished 10 - 20 rounds with model file...')
+print("Finished 10 - 20 rounds with model file...")

 # decay learning rates
 # reset_parameter callback accepts:
 # 1. list with length = num_boost_round
 # 2. function(curr_iter)
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                valid_sets=lgb_eval,
-                callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    valid_sets=lgb_eval,
+    callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
+)

-print('Finished 20 - 30 rounds with decay learning rates...')
+print("Finished 20 - 30 rounds with decay learning rates...")

 # change other parameters during training
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                valid_sets=lgb_eval,
-                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    valid_sets=lgb_eval,
+    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
+)

-print('Finished 30 - 40 rounds with changing bagging_fraction...')
+print("Finished 30 - 40 rounds with changing bagging_fraction...")


 # self-defined objective function
 # f(preds: array, train_data: Dataset) -> grad: array, hess: array
 # log likelihood loss
 def loglikelihood(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
+    preds = 1.0 / (1.0 + np.exp(-preds))
     grad = preds - labels
-    hess = preds * (1. - preds)
+    hess = preds * (1.0 - preds)
     return grad, hess

@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
 # Keep this in mind when you use the customization
 def binary_error(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
-    return 'error', np.mean(labels != (preds > 0.5)), False
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    return "error", np.mean(labels != (preds > 0.5)), False


 # Pass custom objective function through params
 params_custom_obj = copy.deepcopy(params)
-params_custom_obj['objective'] = loglikelihood
+params_custom_obj["objective"] = loglikelihood

-gbm = lgb.train(params_custom_obj,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                feval=binary_error,
-                valid_sets=lgb_eval)
+gbm = lgb.train(
+    params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
+)

-print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
+print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")


 # another self-defined eval metric
@@ -183,42 +183,41 @@ def binary_error(preds, train_data):
 # Keep this in mind when you use the customization
 def accuracy(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
-    return 'accuracy', np.mean(labels == (preds > 0.5)), True
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    return "accuracy", np.mean(labels == (preds > 0.5)), True


 # Pass custom objective function through params
 params_custom_obj = copy.deepcopy(params)
-params_custom_obj['objective'] = loglikelihood
+params_custom_obj["objective"] = loglikelihood

-gbm = lgb.train(params_custom_obj,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                feval=[binary_error, accuracy],
-                valid_sets=lgb_eval)
+gbm = lgb.train(
+    params_custom_obj,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    feval=[binary_error, accuracy],
+    valid_sets=lgb_eval,
+)

-print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
+print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")

-print('Starting a new training job...')
+print("Starting a new training job...")


 # callback
 def reset_metrics():
     def callback(env):
         lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
         if env.iteration - env.begin_iteration == 5:
-            print('Add a new valid dataset at iteration 5...')
-            env.model.add_valid(lgb_eval_new, 'new_valid')
+            print("Add a new valid dataset at iteration 5...")
+            env.model.add_valid(lgb_eval_new, "new_valid")

     callback.before_iteration = True
     callback.order = 0
     return callback


-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                valid_sets=lgb_train,
-                callbacks=[reset_metrics()])
+gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])

-print('Finished first 10 rounds with callback function...')
+print("Finished first 10 rounds with callback function...")
23 changes: 6 additions & 17 deletions examples/python-guide/dask/ranking.py
@@ -10,9 +10,9 @@
 if __name__ == "__main__":
     print("loading data")

-    rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
-    X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
-    group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
+    X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
+    group = np.loadtxt(str(rank_example_dir / "rank.train.query"))

     print("initializing a Dask cluster")

@@ -32,25 +32,14 @@
     # a sparse boundary to partition the data
     X = X.toarray()

-    dX = da.from_array(
-        x=X,
-        chunks=[
-            (rows_in_part1, rows_in_part2),
-            (num_features,)
-        ]
-    )
+    dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
     dy = da.from_array(
         x=y,
         chunks=[
             (rows_in_part1, rows_in_part2),
-        ]
-    )
-    dg = da.from_array(
-        x=group,
-        chunks=[
-            (100, group.size - 100)
-        ]
+        ],
     )
+    dg = da.from_array(x=group, chunks=[(100, group.size - 100)])

     print("beginning training")

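For context on the collapsed chunks arguments above: da.from_array partitions the underlying NumPy array into blocks, taking one tuple of block sizes per axis. A small self-contained sketch, independent of this commit (the array shape and block sizes are made up for illustration):

    import dask.array as da
    import numpy as np

    # 300x10 array split into two row-blocks (100 and 200 rows) and a single
    # column-block covering all 10 features, the same shape of chunks
    # argument used in ranking.py above
    X = np.zeros((300, 10))
    dX = da.from_array(x=X, chunks=[(100, 200), (10,)])
    print(dX.chunks)  # ((100, 200), (10,))
    print(dX.numblocks)  # (2, 1)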
