In [1]:
import lightgbm as lgb
import optuna.integration.lightgbm as oplgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load the training split (14 continuous features plus `target`) and preview it.
df_train = pd.read_csv("train.csv")
df_train.head(n=5)

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [3]:
# Load the test split (same feature columns, no `target`) and preview it.
df_test = pd.read_csv("test.csv")
df_test.head(n=5)

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0,0.3536,0.73878,0.600939,0.293377,0.285691,0.458006,0.620704,0.422249,0.369203,0.435727,0.55054,0.699134,0.286864,0.364515
1,2,0.907222,0.189756,0.215531,0.869915,0.301333,0.528958,0.390351,0.521112,0.794779,0.79858,0.446475,0.449037,0.916964,0.513002
2,6,0.179287,0.355353,0.623972,0.437812,0.282476,0.320826,0.386789,0.776422,0.222268,0.229102,0.211913,0.222651,0.327164,0.827941
3,7,0.359385,0.181049,0.551368,0.206386,0.280763,0.482076,0.506677,0.362793,0.379737,0.345686,0.445276,0.518485,0.299028,0.598166
4,10,0.335791,0.682607,0.676481,0.219465,0.282861,0.581721,0.748639,0.350158,0.448915,0.506878,0.817721,0.805895,0.790591,0.249275


In [4]:
# Submission template; its `target` column is overwritten with predictions later.
df_sample = pd.read_csv("sample_submission_Play.csv")

In [5]:
# Set the row identifiers aside and strip them from both frames in one step,
# so that "id" can never be picked up as a model feature.
# NOTE: this mutates the frames; re-running the cell without reloading the
# CSVs raises KeyError because "id" is already gone.
train_id = df_train.pop("id")
test_id = df_test.pop("id")

In [6]:
# df_train['contG'] = df_train['cont1'] * df_train['cont7']
# df_test['contG'] = df_test['cont1'] * df_test['cont7']

# df_train['contS'] = df_train['cont2'] * df_train['cont9']
# df_test['contS'] = df_test['cont2'] * df_test['cont9']

# df_train['contR'] = df_train['cont3'] * df_train['cont11']
# df_test['contR'] = df_test['cont3'] * df_test['cont11']

In [7]:
# Every column other than the label is used as a model input.
is_feature = df_train.columns != "target"
feature_cols = df_train.columns[is_feature].tolist()

In [8]:
# Show the selected feature names as a sanity check (no "id" / "target" expected).
feature_cols

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14']

In [9]:
# Model inputs and label for training.
train_x = df_train[feature_cols]
train_y = df_train["target"]
# Select exactly the training columns, in the training order. The fold models
# are trained on raw arrays without column names, so features are matched by
# position; an extra or re-ordered column in the test frame would otherwise be
# consumed silently instead of failing here with a KeyError.
test_x = df_test[feature_cols]

In [10]:
# 11-way shuffled cross-validation splitter; the fixed seed keeps fold
# membership reproducible across runs.
folds = KFold(
    n_splits=11,
    shuffle=True,
    random_state=2021,
)

In [11]:
class FoldsAverageLGBM:
    """Cross-validated LightGBM ensemble: one booster per fold, averaged at predict time.

    Attributes populated by ``fit``:
        models: boosters returned by ``lgb.train``, one per fold.
        oof_preds: out-of-fold predictions, aligned with the training rows.
        train_x, train_y: the training data converted to NumPy arrays.
    """

    def __init__(self, folds):
        """Remember the splitter to use in ``fit``.

        Args:
            folds: object whose ``split(X)`` yields ``(train_idx, valid_idx)``
                index arrays (e.g. a scikit-learn ``KFold``).
        """
        self.folds = folds
        self.models = []

    def fit(self, lgb_params, train_x, train_y):
        """Train one booster per fold and record out-of-fold predictions.

        Args:
            lgb_params: parameter dict passed unchanged to ``lgb.train``.
            train_x: 2-D features (DataFrame or array-like), one row per sample.
            train_y: 1-D target (Series or array-like) aligned with ``train_x``.

        Side effects:
            Replaces ``self.models`` and sets ``self.oof_preds``,
            ``self.train_x`` and ``self.train_y``.
        """
        self.train_x = np.asarray(train_x)
        self.train_y = np.asarray(train_y)

        # Start from a clean slate so that calling fit() again (e.g. re-running
        # the notebook cell) does not stack a second set of boosters onto the
        # ensemble that predict() averages.
        self.models = []

        # Explicit float buffer: inheriting the target dtype would truncate
        # predictions when the target happens to be integer-typed.
        oof_preds = np.zeros(len(self.train_y), dtype=float)

        # Use the splitter given to the constructor, not a notebook global.
        for tr_idx, va_idx in tqdm(self.folds.split(self.train_x)):
            tr_x, va_x = self.train_x[tr_idx], self.train_x[va_idx]
            tr_y, va_y = self.train_y[tr_idx], self.train_y[va_idx]

            lgb_train_dataset = lgb.Dataset(tr_x, tr_y)
            lgb_valid_dataset = lgb.Dataset(va_x, va_y)
            # NOTE(review): `verbose_eval` is not accepted by newer LightGBM
            # releases; confirm the installed version. The replacement there is
            # `callbacks=[lgb.log_evaluation(100)]`.
            model = lgb.train(
                lgb_params,
                lgb_train_dataset,
                valid_sets=[lgb_valid_dataset],
                verbose_eval=100,
            )
            self.models.append(model)

            # Each row is predicted only by the booster that did not train on it.
            oof_preds[va_idx] = model.predict(va_x)

        self.oof_preds = oof_preds

    def predict(self, test_x):
        """Return the element-wise mean of all fold boosters' predictions.

        Args:
            test_x: features with the same column layout used in ``fit``.

        Raises:
            RuntimeError: if called before ``fit`` has trained any booster.
        """
        if not self.models:
            raise RuntimeError("FoldsAverageLGBM.predict() called before fit()")

        preds = [model.predict(test_x) for model in tqdm(self.models)]
        return np.mean(preds, axis=0)

In [12]:
# Tuned LightGBM hyper-parameters for the RMSE regression objective.
best_lgb_params = dict(
    # task / bookkeeping
    seed=2021,
    objective='regression',
    metric='rmse',
    verbosity=-1,
    feature_pre_filter=False,
    # regularisation and tree shape
    lambda_l1=6.540486456085813,
    lambda_l2=0.01548480538099245,
    num_leaves=256,
    # row / column subsampling
    feature_fraction=0.52,
    bagging_fraction=0.6161835249194311,
    bagging_freq=7,
    min_child_samples=20,
)
# Training schedule: small learning rate, a large iteration budget, and early
# stopping after 1000 rounds without validation improvement.
best_lgb_params.update(
    learning_rate=0.001,
    early_stopping_round=1000,
    num_iterations=18000,
)

In [13]:
# Build the fold-averaging ensemble around the CV splitter defined above.
folds_average_lgbm = FoldsAverageLGBM(folds)

In [14]:
# Train one booster per fold; validation RMSE is logged every 100 iterations.
folds_average_lgbm.fit(best_lgb_params, train_x, train_y)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Training until validation scores don't improve for 1000 rounds
[100]	valid_0's rmse: 0.734943
[200]	valid_0's rmse: 0.732479
[300]	valid_0's rmse: 0.730237
[400]	valid_0's rmse: 0.728243
[500]	valid_0's rmse: 0.726425
[600]	valid_0's rmse: 0.724768
[700]	valid_0's rmse: 0.723263
[800]	valid_0's rmse: 0.721914
[900]	valid_0's rmse: 0.720666
[1000]	valid_0's rmse: 0.719504
[1100]	valid_0's rmse: 0.718442
[1200]	valid_0's rmse: 0.717479
[1300]	valid_0's rmse: 0.716576
[1400]	valid_0's rmse: 0.715749
[1500]	valid_0's rmse: 0.714982
[1600]	valid_0's rmse: 0.714278
[1700]	valid_0's rmse: 0.713606
[1800]	valid_0's rmse: 0.712996
[1900]	valid_0's rmse: 0.712431
[2000]	valid_0's rmse: 0.711893
[2100]	valid_0's rmse: 0.711384
[2200]	valid_0's rmse: 0.71092
[2300]	valid_0's rmse: 0.710481
[2400]	valid_0's rmse: 0.710066
[2500]	valid_0's rmse: 0.709672
[2600]	valid_0's rmse: 0.709294
[2700]	valid_0's rmse: 0.708933
[2800]	valid_0's rmse: 0.708601
[2900]	valid_0's rmse: 0.708286
[3000]	valid_0's rm

[7000]	valid_0's rmse: 0.693288
[7100]	valid_0's rmse: 0.693225
[7200]	valid_0's rmse: 0.693164
[7300]	valid_0's rmse: 0.693105
[7400]	valid_0's rmse: 0.693043
[7500]	valid_0's rmse: 0.692987
[7600]	valid_0's rmse: 0.692931
[7700]	valid_0's rmse: 0.692873
[7800]	valid_0's rmse: 0.692816
[7900]	valid_0's rmse: 0.692761
[8000]	valid_0's rmse: 0.692708
[8100]	valid_0's rmse: 0.692658
[8200]	valid_0's rmse: 0.692609
[8300]	valid_0's rmse: 0.69257
[8400]	valid_0's rmse: 0.692532
[8500]	valid_0's rmse: 0.692501
[8600]	valid_0's rmse: 0.692455
[8700]	valid_0's rmse: 0.692418
[8800]	valid_0's rmse: 0.692382
[8900]	valid_0's rmse: 0.692348
[9000]	valid_0's rmse: 0.692311
[9100]	valid_0's rmse: 0.692274
[9200]	valid_0's rmse: 0.692243
[9300]	valid_0's rmse: 0.692205
[9400]	valid_0's rmse: 0.692174
[9500]	valid_0's rmse: 0.692145
[9600]	valid_0's rmse: 0.692116
[9700]	valid_0's rmse: 0.69209
[9800]	valid_0's rmse: 0.692064
[9900]	valid_0's rmse: 0.692033
[10000]	valid_0's rmse: 0.692
[10100]	vali

[14000]	valid_0's rmse: 0.693246
[14100]	valid_0's rmse: 0.693241
[14200]	valid_0's rmse: 0.69324
[14300]	valid_0's rmse: 0.693237
[14400]	valid_0's rmse: 0.693231
[14500]	valid_0's rmse: 0.693225
[14600]	valid_0's rmse: 0.693221
[14700]	valid_0's rmse: 0.693211
[14800]	valid_0's rmse: 0.69321
[14900]	valid_0's rmse: 0.693206
[15000]	valid_0's rmse: 0.693206
[15100]	valid_0's rmse: 0.693198
[15200]	valid_0's rmse: 0.693194
[15300]	valid_0's rmse: 0.693187
[15400]	valid_0's rmse: 0.693186
[15500]	valid_0's rmse: 0.693187
[15600]	valid_0's rmse: 0.693189
[15700]	valid_0's rmse: 0.693183
[15800]	valid_0's rmse: 0.693185
[15900]	valid_0's rmse: 0.693185
[16000]	valid_0's rmse: 0.693185
[16100]	valid_0's rmse: 0.693187
[16200]	valid_0's rmse: 0.693184
[16300]	valid_0's rmse: 0.693181
[16400]	valid_0's rmse: 0.693181
[16500]	valid_0's rmse: 0.693183
[16600]	valid_0's rmse: 0.693179
[16700]	valid_0's rmse: 0.693175
[16800]	valid_0's rmse: 0.693179
[16900]	valid_0's rmse: 0.693181
[17000]	vali

[3700]	valid_0's rmse: 0.700742
[3800]	valid_0's rmse: 0.700518
[3900]	valid_0's rmse: 0.700292
[4000]	valid_0's rmse: 0.700078
[4100]	valid_0's rmse: 0.699882
[4200]	valid_0's rmse: 0.699694
[4300]	valid_0's rmse: 0.699499
[4400]	valid_0's rmse: 0.699319
[4500]	valid_0's rmse: 0.699142
[4600]	valid_0's rmse: 0.698981
[4700]	valid_0's rmse: 0.698825
[4800]	valid_0's rmse: 0.698677
[4900]	valid_0's rmse: 0.698527
[5000]	valid_0's rmse: 0.698385
[5100]	valid_0's rmse: 0.698253
[5200]	valid_0's rmse: 0.698125
[5300]	valid_0's rmse: 0.698006
[5400]	valid_0's rmse: 0.69788
[5500]	valid_0's rmse: 0.697761
[5600]	valid_0's rmse: 0.69766
[5700]	valid_0's rmse: 0.697548
[5800]	valid_0's rmse: 0.697441
[5900]	valid_0's rmse: 0.697342
[6000]	valid_0's rmse: 0.697256
[6100]	valid_0's rmse: 0.697166
[6200]	valid_0's rmse: 0.697065
[6300]	valid_0's rmse: 0.696973
[6400]	valid_0's rmse: 0.696885
[6500]	valid_0's rmse: 0.696808
[6600]	valid_0's rmse: 0.69673
[6700]	valid_0's rmse: 0.696649
[6800]	vali

[10800]	valid_0's rmse: 0.696679
[10900]	valid_0's rmse: 0.69666
[11000]	valid_0's rmse: 0.696637
[11100]	valid_0's rmse: 0.69662
[11200]	valid_0's rmse: 0.696596
[11300]	valid_0's rmse: 0.696574
[11400]	valid_0's rmse: 0.696559
[11500]	valid_0's rmse: 0.696541
[11600]	valid_0's rmse: 0.696526
[11700]	valid_0's rmse: 0.696508
[11800]	valid_0's rmse: 0.696487
[11900]	valid_0's rmse: 0.696473
[12000]	valid_0's rmse: 0.696458
[12100]	valid_0's rmse: 0.696442
[12200]	valid_0's rmse: 0.696425
[12300]	valid_0's rmse: 0.696403
[12400]	valid_0's rmse: 0.696387
[12500]	valid_0's rmse: 0.696367
[12600]	valid_0's rmse: 0.696357
[12700]	valid_0's rmse: 0.696346
[12800]	valid_0's rmse: 0.696324
[12900]	valid_0's rmse: 0.696306
[13000]	valid_0's rmse: 0.696291
[13100]	valid_0's rmse: 0.696281
[13200]	valid_0's rmse: 0.696267
[13300]	valid_0's rmse: 0.696252
[13400]	valid_0's rmse: 0.696239
[13500]	valid_0's rmse: 0.696231
[13600]	valid_0's rmse: 0.696224
[13700]	valid_0's rmse: 0.696211
[13800]	vali

Training until validation scores don't improve for 1000 rounds
[100]	valid_0's rmse: 0.734268
[200]	valid_0's rmse: 0.731724
[300]	valid_0's rmse: 0.729438
[400]	valid_0's rmse: 0.727391
[500]	valid_0's rmse: 0.725511
[600]	valid_0's rmse: 0.723796
[700]	valid_0's rmse: 0.722245
[800]	valid_0's rmse: 0.720843
[900]	valid_0's rmse: 0.71955
[1000]	valid_0's rmse: 0.718356
[1100]	valid_0's rmse: 0.717274
[1200]	valid_0's rmse: 0.716262
[1300]	valid_0's rmse: 0.715338
[1400]	valid_0's rmse: 0.714478
[1500]	valid_0's rmse: 0.713695
[1600]	valid_0's rmse: 0.712972
[1700]	valid_0's rmse: 0.712295
[1800]	valid_0's rmse: 0.711655
[1900]	valid_0's rmse: 0.711063
[2000]	valid_0's rmse: 0.710509
[2100]	valid_0's rmse: 0.709985
[2200]	valid_0's rmse: 0.709495
[2300]	valid_0's rmse: 0.709021
[2400]	valid_0's rmse: 0.708576
[2500]	valid_0's rmse: 0.708157
[2600]	valid_0's rmse: 0.707757
[2700]	valid_0's rmse: 0.707374
[2800]	valid_0's rmse: 0.707016
[2900]	valid_0's rmse: 0.706674
[3000]	valid_0's rm

[7000]	valid_0's rmse: 0.697625
[7100]	valid_0's rmse: 0.697572
[7200]	valid_0's rmse: 0.697509
[7300]	valid_0's rmse: 0.697457
[7400]	valid_0's rmse: 0.697409
[7500]	valid_0's rmse: 0.697356
[7600]	valid_0's rmse: 0.697307
[7700]	valid_0's rmse: 0.697259
[7800]	valid_0's rmse: 0.697214
[7900]	valid_0's rmse: 0.697171
[8000]	valid_0's rmse: 0.697121
[8100]	valid_0's rmse: 0.697082
[8200]	valid_0's rmse: 0.697046
[8300]	valid_0's rmse: 0.697009
[8400]	valid_0's rmse: 0.696973
[8500]	valid_0's rmse: 0.696936
[8600]	valid_0's rmse: 0.696901
[8700]	valid_0's rmse: 0.696866
[8800]	valid_0's rmse: 0.696829
[8900]	valid_0's rmse: 0.696801
[9000]	valid_0's rmse: 0.69677
[9100]	valid_0's rmse: 0.696739
[9200]	valid_0's rmse: 0.696711
[9300]	valid_0's rmse: 0.696686
[9400]	valid_0's rmse: 0.696661
[9500]	valid_0's rmse: 0.696638
[9600]	valid_0's rmse: 0.69661
[9700]	valid_0's rmse: 0.696581
[9800]	valid_0's rmse: 0.696555
[9900]	valid_0's rmse: 0.696532
[10000]	valid_0's rmse: 0.696509
[10100]	v

[14800]	valid_0's rmse: 0.694643
[14900]	valid_0's rmse: 0.694627
[15000]	valid_0's rmse: 0.694624
[15100]	valid_0's rmse: 0.69461
[15200]	valid_0's rmse: 0.694595
[15300]	valid_0's rmse: 0.694586
[15400]	valid_0's rmse: 0.694578
[15500]	valid_0's rmse: 0.69457
[15600]	valid_0's rmse: 0.694568
[15700]	valid_0's rmse: 0.694562
[15800]	valid_0's rmse: 0.69455
[15900]	valid_0's rmse: 0.694539
[16000]	valid_0's rmse: 0.694529
[16100]	valid_0's rmse: 0.694526
[16200]	valid_0's rmse: 0.694517
[16300]	valid_0's rmse: 0.694513
[16400]	valid_0's rmse: 0.694503
[16500]	valid_0's rmse: 0.694495
[16600]	valid_0's rmse: 0.694492
[16700]	valid_0's rmse: 0.694488
[16800]	valid_0's rmse: 0.694478
[16900]	valid_0's rmse: 0.694466
[17000]	valid_0's rmse: 0.694461
[17100]	valid_0's rmse: 0.694455
[17200]	valid_0's rmse: 0.694453
[17300]	valid_0's rmse: 0.694448
[17400]	valid_0's rmse: 0.694444
[17500]	valid_0's rmse: 0.694443
[17600]	valid_0's rmse: 0.69443
[17700]	valid_0's rmse: 0.694424
[17800]	valid_

In [15]:
# Out-of-fold RMSE: every training row is scored by the booster that did not
# see it during training.
oof_mse = mean_squared_error(df_train["target"], folds_average_lgbm.oof_preds)
np.sqrt(oof_mse)

0.6951243938402865

In [None]:
# Test-set prediction: the mean of the per-fold boosters' outputs.
y_pred = folds_average_lgbm.predict(test_x)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

In [None]:
# Fill the submission template with the averaged predictions (the template
# itself is left untouched), write it to disk, and preview the first rows.
sub = df_sample.assign(target=y_pred)

sub.to_csv("submission_lgbm_fold_11_HopefullyBest.csv", index=False)

sub.head(10)