In [1]:
import lightgbm as lgb
import optuna.integration.lightgbm as oplgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Read the training data from the working directory and preview the first rows.
df_train = pd.read_csv("train.csv")
df_train.head(5)

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [3]:
# Read the test data from the working directory and preview the first rows.
df_test = pd.read_csv("test.csv")
df_test.head(5)

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0,0.3536,0.73878,0.600939,0.293377,0.285691,0.458006,0.620704,0.422249,0.369203,0.435727,0.55054,0.699134,0.286864,0.364515
1,2,0.907222,0.189756,0.215531,0.869915,0.301333,0.528958,0.390351,0.521112,0.794779,0.79858,0.446475,0.449037,0.916964,0.513002
2,6,0.179287,0.355353,0.623972,0.437812,0.282476,0.320826,0.386789,0.776422,0.222268,0.229102,0.211913,0.222651,0.327164,0.827941
3,7,0.359385,0.181049,0.551368,0.206386,0.280763,0.482076,0.506677,0.362793,0.379737,0.345686,0.445276,0.518485,0.299028,0.598166
4,10,0.335791,0.682607,0.676481,0.219465,0.282861,0.581721,0.748639,0.350158,0.448915,0.506878,0.817721,0.805895,0.790591,0.249275


In [4]:
# Submission template; it is copied and filled with predictions when the submission is written.
df_sample = pd.read_csv("sample_submission_Play.csv")

In [5]:
# Split the row identifier off from the model inputs.
# ``DataFrame.pop`` removes the column and returns it in one step; the
# membership guard makes the cell safe to re-run, because once "id" has been
# removed the frames are simply left alone instead of raising ``KeyError``.
if "id" in df_train.columns:
    train_id = df_train.pop("id")
if "id" in df_test.columns:
    test_id = df_test.pop("id")

In [6]:
# df_train['contG'] = df_train['cont1'] * df_train['cont7']
# df_test['contG'] = df_test['cont1'] * df_test['cont7']

# df_train['contS'] = df_train['cont2'] * df_train['cont9']
# df_test['contS'] = df_test['cont2'] * df_test['cont9']

# df_train['contR'] = df_train['cont3'] * df_train['cont11']
# df_test['contR'] = df_test['cont3'] * df_test['cont11']

In [7]:
# Model inputs: every column of the training frame except the "target" label.
feature_cols = [name for name in df_train.columns if name != "target"]

In [8]:
# Display the selected feature names as a quick visual check.
feature_cols

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14']

In [9]:
# Build the modelling matrices.
# The test frame is indexed with the same ``feature_cols`` list as the training
# frame so both have identical columns in identical order; the fold models are
# fit on positional arrays (``.values``), so column order is what ties a test
# column to the feature it was trained as.
train_x = df_train[feature_cols]
train_y = df_train["target"]
test_x = df_test[feature_cols]

In [20]:
# Shuffled 10-fold splitter; the fixed seed keeps fold membership the same on every run.
folds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=33,
)

In [21]:
class FoldsAverageLGBM:
    """Cross-validated LightGBM ensemble that averages one booster per fold.

    ``fit`` trains a separate booster on every train/validation split produced
    by ``folds`` and stores the out-of-fold predictions in ``oof_preds``.
    ``predict`` returns the mean of the fold boosters' predictions.

    Parameters
    ----------
    folds
        Splitter object exposing ``split(X)`` that yields
        ``(train_indices, valid_indices)`` pairs, e.g.
        ``sklearn.model_selection.KFold``.

    Attributes
    ----------
    models : list
        Boosters trained by the most recent ``fit`` call, one per fold.
    oof_preds : numpy.ndarray
        Out-of-fold prediction for every training row (set by ``fit``).
    """

    def __init__(self, folds):
        self.folds = folds
        self.models = []

    def fit(self, lgb_params, train_x, train_y):
        """Train one booster per fold and collect out-of-fold predictions.

        Parameters
        ----------
        lgb_params : dict
            Parameters forwarded to ``lgb.train`` for every fold.
        train_x : array-like or pandas.DataFrame
            Feature matrix, one row per sample.
        train_y : array-like or pandas.Series
            Target values aligned with ``train_x``.
        """
        self.train_x = np.asarray(train_x)
        self.train_y = np.asarray(train_y)

        # Start from a clean slate so that calling fit() again does not leave
        # boosters from an earlier run in the ensemble used by predict().
        self.models = []

        # Always a float buffer: inheriting the label dtype would silently
        # truncate predictions whenever the labels are stored as integers.
        oof_preds = np.zeros(len(self.train_y), dtype=np.float64)

        # Use the splitter given to the constructor, not a module-level global.
        for tr_idx, va_idx in tqdm(self.folds.split(self.train_x)):
            tr_x, va_x = self.train_x[tr_idx], self.train_x[va_idx]
            tr_y, va_y = self.train_y[tr_idx], self.train_y[va_idx]

            lgb_train_dataset = lgb.Dataset(tr_x, tr_y)
            lgb_valid_dataset = lgb.Dataset(va_x, va_y)
            # A per-fold copy of the params keeps the caller's dict untouched.
            model = lgb.train(
                dict(lgb_params),
                lgb_train_dataset,
                valid_sets=[lgb_valid_dataset],
                verbose_eval=100,
            )
            self.models.append(model)

            oof_preds[va_idx] = model.predict(va_x)

        self.oof_preds = oof_preds

    def predict(self, test_x):
        """Return the mean prediction of all fold boosters for ``test_x``.

        Parameters
        ----------
        test_x : array-like or pandas.DataFrame
            Feature matrix with the same columns, in the same order, as the
            matrix passed to ``fit``.

        Raises
        ------
        RuntimeError
            If no booster has been trained yet.
        """
        if not self.models:
            raise RuntimeError(
                "FoldsAverageLGBM.predict() called before fit(); no fold models are available."
            )

        # The boosters were trained on plain arrays, so predict on one as well.
        features = np.asarray(test_x)
        fold_preds = [model.predict(features) for model in tqdm(self.models)]
        return np.mean(fold_preds, axis=0)

In [12]:
# Parameters handed to ``lgb.train`` for every fold.
#
# Every setting is listed exactly once:
#   * ``max_depth`` is spelled correctly (a misspelled key is not a recognised
#     parameter, so the intended value would never reach the trainer).
#   * ``n_estimators`` is an alias of ``num_iterations`` and ``min_child_samples``
#     is an alias of ``min_data_in_leaf``; giving both names with different
#     values is ambiguous, so only the canonical name is kept.
#   * ``silent`` and ``importance_type`` are arguments of the scikit-learn
#     wrapper rather than training parameters, so they are not listed here.
best_lgb_params = {
    # Reproducibility / runtime
    'random_state': 33,
    'n_jobs': -1,

    # Boosting schedule
    'boosting_type': 'gbdt',
    'learning_rate': 0.0046,
    'num_iterations': 18000,        # upper bound on rounds; early stopping may end sooner
    'early_stopping_round': 1000,   # stop after this many rounds without validation improvement
    'metric': 'rmse',

    # Tree shape
    'num_leaves': 256,
    'max_depth': -1,                # <= 0 means no depth limit
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 0.001,
    'min_gain_to_split': 0.0,

    # Regularisation and row/column sampling
    'lambda_l1': 1.074622455507616e-05,
    'lambda_l2': 2.0521330798729704e-06,
    'bagging_fraction': 0.8206341150202605,
    'bagging_freq': 6,
    'feature_fraction': 0.5,

    # Binning / categorical handling
    'subsample_for_bin': 200000,
    'min_data_per_group': 5,
    'cat_smooth': 1.0,
    'feature_pre_filter': False,
}
#best_lgb_params["learning_rate"] = 0.001

In [13]:
# Create the fold-averaging ensemble around the splitter defined earlier.
folds_average_lgbm = FoldsAverageLGBM(folds)

In [14]:
# Train one booster per fold; validation RMSE is logged every 100 rounds and the
# out-of-fold predictions are stored on folds_average_lgbm.oof_preds.
folds_average_lgbm.fit(best_lgb_params, train_x, train_y)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 1000 rounds
[100]	valid_0's rmse: 0.719856
[200]	valid_0's rmse: 0.712646
[300]	valid_0's rmse: 0.707904
[400]	valid_0's rmse: 0.704813
[500]	valid_0's rmse: 0.702675
[600]	valid_0's rmse: 0.701053
[700]	valid_0's rmse: 0.699775
[800]	valid_0's rmse: 0.698808
[900]	valid_0's rmse: 0.697955
[1000]	valid_0's rmse: 0.697326
[1100]	valid_0's rmse: 0.69678
[1200]	valid_0's rmse: 0.696334
[1300]	valid_0's rmse: 0.696002
[1400]	valid_0's rmse: 0.695732
[1500]	valid_0's rmse: 0.695508
[1600]	valid_0's rmse: 0.69529
[1700]	valid_0's rmse: 0.69514
[1800]	valid_0's rmse: 0.694988
[1900]	valid_0's rmse: 0.69488
[2000]	valid_0's rmse: 0.694783
[2100]	valid_0's rmse: 0.694694
[2200]	valid_0's rmse: 0.694635
[2300]	valid_0's rmse: 0.694553
[2400]	valid_0's rmse: 0.6945
[2500]	valid_0's rmse: 0.694455
[2600]	valid_0's rmse: 0.694399
[2700]	valid_0's rmse: 0.694356
[2800]	valid_0's rmse: 0.694349
[2900]	valid_0's rmse: 0.694315
[3000]	valid_0's rmse: 0

[500]	valid_0's rmse: 0.705053
[600]	valid_0's rmse: 0.703625
[700]	valid_0's rmse: 0.702452
[800]	valid_0's rmse: 0.701548
[900]	valid_0's rmse: 0.700836
[1000]	valid_0's rmse: 0.700274
[1100]	valid_0's rmse: 0.699798
[1200]	valid_0's rmse: 0.699432
[1300]	valid_0's rmse: 0.699108
[1400]	valid_0's rmse: 0.698876
[1500]	valid_0's rmse: 0.698647
[1600]	valid_0's rmse: 0.69847
[1700]	valid_0's rmse: 0.698303
[1800]	valid_0's rmse: 0.698172
[1900]	valid_0's rmse: 0.698057
[2000]	valid_0's rmse: 0.697986
[2100]	valid_0's rmse: 0.69793
[2200]	valid_0's rmse: 0.697904
[2300]	valid_0's rmse: 0.697844
[2400]	valid_0's rmse: 0.697803
[2500]	valid_0's rmse: 0.697742
[2600]	valid_0's rmse: 0.697713
[2700]	valid_0's rmse: 0.697691
[2800]	valid_0's rmse: 0.697657
[2900]	valid_0's rmse: 0.697633
[3000]	valid_0's rmse: 0.697633
[3100]	valid_0's rmse: 0.697629
[3200]	valid_0's rmse: 0.697636
[3300]	valid_0's rmse: 0.697617
[3400]	valid_0's rmse: 0.697575
[3500]	valid_0's rmse: 0.69756
[3600]	valid_0's

In [15]:
# Out-of-fold RMSE over the whole training set (the same metric that is logged during training).
np.sqrt(
    mean_squared_error(
        y_true=df_train["target"],
        y_pred=folds_average_lgbm.oof_preds,
    )
)

0.6955880415125976

In [16]:
# Average the fold models' predictions on the test set.
# This must actually run: the submission cell reads ``y_pred``, so leaving the
# line commented out raises NameError on a fresh Restart & Run All.
y_pred = folds_average_lgbm.predict(test_x)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [19]:
# Fill the submission template with the averaged predictions and save it.
sub = df_sample.copy()
sub["target"] = y_pred

# Written relative to the working directory, like the input CSVs, so the
# notebook runs on any machine instead of depending on one user's Desktop path.
sub.to_csv("submission_lgbm_fold_8_25_01.csv", index=False)

sub.head(10)

Unnamed: 0,id,target
0,0,7.957645
1,2,7.877215
2,6,7.933945
3,7,8.27454
4,10,8.115473
5,14,7.915965
6,16,8.272832
7,17,7.73147
8,18,7.85036
9,19,7.723274
