In [1]:
import os
import pandas as pd
import lightgbm as lgb

In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from scipy.stats import randint, uniform

In [18]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
os.listdir("./data")

['preprocessing_test.csv',
 'preprocessing_train.csv',
 'sample_submission.csv',
 'test.csv',
 'train.csv']

In [5]:
train = pd.read_csv("./data/preprocessing_train.csv")
test = pd.read_csv("./data/preprocessing_test.csv")

In [43]:
submission = pd.read_csv('./data/sample_submission.csv')

In [8]:
train.drop("Unnamed: 0", axis=1, inplace=True)
test.drop("Unnamed: 0", axis=1, inplace=True)

In [10]:
train.drop("index", axis=1, inplace=True)
test.drop("index", axis=1, inplace=True)

In [13]:
data = train.drop("credit", axis=1)
target = train.credit

In [14]:
x_train, x_valid, y_train, y_valid = train_test_split(data, target, stratify=target, random_state=123)

In [81]:
params_0 = {
    "early_stopping_rounds" : 50,
    "eval_metric" : "logloss",
    "eval_set" : [(x_valid, y_valid)],
    "eval_names" : ['valid'],
    "verbose" : 100,
}

In [87]:
params_1 = {
    "num_leaves" : randint(10, 150),
    "min_child_samples": randint(10, 500),
    "min_child_weight": [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    "subsample": uniform(loc=0.2, scale=0.8),
    "colsample_bytree": uniform(loc=0.2, scale=0.8),
    "reg_alpha": [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    "reg_lambda": [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    "n_estimators": randint(100, 5000)
}

In [88]:
cv = RepeatedStratifiedKFold(n_repeats=5,random_state=123)

In [89]:
clf = lgb.LGBMClassifier(max_depth=-1, random_state=123, silent=True, n_jobs=-1)

In [90]:
rs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params_1,
    cv=cv,
    refit=True,
    random_state=123,
    verbose=True,
    n_jobs=-1,
    n_iter=200
)

In [91]:
rs.fit(x_train, y_train, **params_0)

Fitting 25 folds for each of 200 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 56.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 85.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 149.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 190.4min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 241.3min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed: 242.5min finished


Training until validation scores don't improve for 50 rounds.
[100]	valid's multi_logloss: 0.625838	valid's multi_logloss: 0.625838
[200]	valid's multi_logloss: 0.62248	valid's multi_logloss: 0.62248
Early stopping, best iteration is:
[155]	valid's multi_logloss: 0.621025	valid's multi_logloss: 0.621025


RandomizedSearchCV(cv=<sklearn.model_selection._split.RepeatedStratifiedKFold object at 0x00000292E4E45780>,
                   error_score='raise-deprecating',
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimato...
                                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000292E4B612E8>,
                                        'reg_alpha': [0, 0.1, 1, 2, 5, 7, 10,
                                  

In [94]:
rs.best_params_

{'colsample_bytree': 0.5827241508249024,
 'min_child_samples': 58,
 'min_child_weight': 0.001,
 'n_estimators': 391,
 'num_leaves': 131,
 'reg_alpha': 0,
 'reg_lambda': 10,
 'subsample': 0.3536241149711741}

In [95]:
params_2 = rs.best_params_

In [96]:
lgb_clf = lgb.LGBMClassifier(**clf.get_params())

In [97]:
lgb_clf.set_params(**params_2)

LGBMClassifier(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.5827241508249024, importance_type='split',
               learning_rate=0.1, max_depth=-1, min_child_samples=58,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=391,
               n_jobs=-1, num_leaves=131, objective=None, random_state=123,
               reg_alpha=0, reg_lambda=10, silent=True,
               subsample=0.3536241149711741, subsample_for_bin=200000,
               subsample_freq=0)

In [98]:
gs = GridSearchCV(
    estimator=lgb_clf,
    refit=True,
    cv=cv,
    verbose=True,
    param_grid={"scale_pos_weight" : [2,4,6,8,10,12,14], "learning_rate": [0.1,0.01,0.03,0.05]}
)

In [1]:
gs.fit(x_train, y_train, **params_0)

In [100]:
gs.best_params_

{'learning_rate': 0.1, 'scale_pos_weight': 2}

In [106]:
model = lgb.LGBMClassifier()

In [107]:
model.fit(train.drop("credit", axis=1), train.credit)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [108]:
pred = model.predict_proba(test)

In [109]:
submission[["0","1","2"]] = pred

In [110]:
submission.to_csv("lgb4_submission.csv", index=False)