In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgbm
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [2]:
train = pd.read_csv("/kaggle/input/30-days-of-ml/train.csv")
test = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv")
sub = pd.read_csv("/kaggle/input/30-days-of-ml/sample_submission.csv")

In [3]:
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [4]:
cat_cols = [col for col in train.columns if 'cat' in col]
X = train.drop(["target", "id"], axis=1)
X_test = test.copy()
X_test.drop(['id'],axis = 1, inplace = True)
y = train.target

In [5]:
SEED = 7770777
kf = KFold(n_splits = 5, shuffle = True, random_state = SEED)

In [6]:
params_lgbm = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.007899156646724397,
    "num_leaves": 77,
    "max_depth": 77,
    "feature_fraction": 0.2256038826485174,
    "bagging_fraction": 0.7705303688019942,
    "min_child_samples": 290,
    "reg_alpha": 9.562925363678952,
    "reg_lambda": 9.355810045480153,
    "max_bin": 772,
    "min_data_per_group": 177,
    "bagging_freq": 1,
    "cat_smooth": 96,
    "cat_l2": 17,
    "verbosity": -1,
    "bagging_seed": SEED,
    "feature_fraction_seed": SEED,
    "seed": SEED
}

In [7]:
preds_lgbm = pd.DataFrame()
X[cat_cols] = X[cat_cols].astype("category")
X_test[cat_cols] = X_test[cat_cols].astype("category")
X.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
1,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
4,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


In [8]:
for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("=" * 50)
    print(f"                   KFold {k+1}")
    print("=" * 50)
    
    X_train, X_valid = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_valid = y.iloc[tr_id], y.iloc[vl_id]
    
    lgbm_train = lgbm.Dataset(X_train, y_train)
    lgbm_valid = lgbm.Dataset(X_valid, y_valid)
    
    model = lgbm.train(params = params_lgbm,
                      train_set = lgbm_train,
                      valid_sets = lgbm_valid,
                      num_boost_round = 100000,
                      early_stopping_rounds = 200,
                      verbose_eval = 1000)
    
    pred_lgbm = model.predict(X_test, num_iteration = model.best_iteration)
    pred_lgbm = pd.DataFrame(pred_lgbm)
    
    #Concatenate predictions
    preds_lgbm = pd.concat([preds_lgbm, pred_lgbm], axis = 1)

                   KFold 1
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.727185
[2000]	valid_0's rmse: 0.723175
[3000]	valid_0's rmse: 0.722125
[4000]	valid_0's rmse: 0.721895
Early stopping, best iteration is:
[4003]	valid_0's rmse: 0.721894
                   KFold 2
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.723547
[2000]	valid_0's rmse: 0.719483
[3000]	valid_0's rmse: 0.718293
[4000]	valid_0's rmse: 0.717966
Early stopping, best iteration is:
[4523]	valid_0's rmse: 0.71792
                   KFold 3
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.721631
[2000]	valid_0's rmse: 0.717691
[3000]	valid_0's rmse: 0.716619
[4000]	valid_0's rmse: 0.716326
Early stopping, best iteration is:
[4190]	valid_0's rmse: 0.716301
                   KFold 4
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.722043
[2000]	valid_0's rmse: 0.717

In [10]:
label = preds_lgbm.mean(axis=1)
X = pd.concat([X, X_test], axis=0).reset_index(drop=True)
y = pd.concat([y, label], axis=0).reset_index(drop=True)

print("X.shape: ", X.shape)
print("y.shape: ", y.shape)

X.shape:  (500000, 24)
y.shape:  (500000,)


In [11]:
X.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
1,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
4,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


In [12]:
y.head()

0    8.113634
1    8.481233
2    8.364351
3    8.049253
4    7.972260
dtype: float64

In [13]:
for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("=" * 50)
    print(f"                   KFold {k+1}")
    print("=" * 50)
    
    X_train, X_valid = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_valid = y.iloc[tr_id], y.iloc[vl_id]
    
    lgbm_train = lgbm.Dataset(X_train, y_train)
    lgbm_valid = lgbm.Dataset(X_valid, y_valid)
    
    model = lgbm.train(params = params_lgbm,
                      train_set = lgbm_train,
                      valid_sets = lgbm_valid,
                      num_boost_round = 100000,
                      early_stopping_rounds = 200,
                      verbose_eval = 1000)
    
    pred_lgbm = model.predict(X_test, num_iteration = model.best_iteration)
    pred_lgbm = pd.DataFrame(pred_lgbm)
    
    #Concatenate predictions
    preds_lgbm = pd.concat([preds_lgbm, pred_lgbm], axis = 1)

                   KFold 1
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.563906
[2000]	valid_0's rmse: 0.558692
[3000]	valid_0's rmse: 0.557079
[4000]	valid_0's rmse: 0.556497
[5000]	valid_0's rmse: 0.55625
[6000]	valid_0's rmse: 0.556136
[7000]	valid_0's rmse: 0.556105
Early stopping, best iteration is:
[6828]	valid_0's rmse: 0.556101
                   KFold 2
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.561963
[2000]	valid_0's rmse: 0.556658
[3000]	valid_0's rmse: 0.554899
[4000]	valid_0's rmse: 0.554181
[5000]	valid_0's rmse: 0.553828
[6000]	valid_0's rmse: 0.553651
[7000]	valid_0's rmse: 0.553574
Early stopping, best iteration is:
[7167]	valid_0's rmse: 0.553569
                   KFold 3
Training until validation scores don't improve for 200 rounds
[1000]	valid_0's rmse: 0.562258
[2000]	valid_0's rmse: 0.557181
[3000]	valid_0's rmse: 0.555572
[4000]	valid_0's rmse: 0.554954
[5000]	valid_0's rmse: 0

In [16]:
pred = preds_lgbm.mean(axis = 1)
sub.target = pred

sub.head()

Unnamed: 0,id,target
0,0,8.071174
1,5,8.339325
2,15,8.391282
3,16,8.463162
4,17,8.137702


In [18]:
sub.to_csv("submission_lgbm.csv", index=False)