In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, SCORERS
from xgboost import XGBRegressor,XGBClassifier
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor
import pickle
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#data prep
trainset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/train.csv")
testset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/test.csv")

test = testset.drop(['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'],axis = 1)

Y_cols = trainset.loc[:, ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']]
trainset = trainset.drop(['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'], axis = 1)

train_dhi = Y_cols['Clearsky DHI']
train_dni = Y_cols['Clearsky DNI']
train_ghi = Y_cols['Clearsky GHI']

In [3]:
trainset.dtypes

Year                    int64
Month                   int64
Day                     int64
Hour                    int64
Minute                  int64
Cloud Type              int64
Dew Point             float64
Temperature           float64
Pressure                int64
Relative Humidity     float64
Solar Zenith Angle    float64
Precipitable Water    float64
Wind Direction        float64
Wind Speed            float64
Fill Flag               int64
dtype: object

In [4]:
train_dhi.dtypes

dtype('int64')

In [5]:
class Train:
    def __init__(self, data, target, save_name):
        super().__init__()
        self.data = data
        self.target = target
        self.save_name = save_name
        
    def objective(self, trial):
        param = {
            'verbosity': 1,
            'objective': 'reg:squarederror',
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log = True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),
            'eta': trial.suggest_float('eta', 1e-8, 1.0, log = True),
            'gamma': trial.suggest_float('gamma', 1e-8, 10.0, log = True),
                'max_depth': trial.suggest_int('max_depth', 5, 10),
            'eval_metric': 'rmse',
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 9e-1, log = True),
            'subsample': trial.suggest_float('subsample', 1e-5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-5, 1)
        }
        model = XGBRegressor(tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",**param)
        val = cross_validate(model, self.data, self.target , scoring = 'neg_mean_squared_error', n_jobs = -1)
        loss = np.mean(val['test_score'])
        return loss

    def study(self):
        s = optuna.create_study(direction='maximize')
        s.optimize(self.objective, n_trials=8)
        return s.best_params

    def train_loop(self):
        kf = KFold(shuffle = True, random_state = 42)
        best_loss = 1e18
        oof_preds = pd.DataFrame(columns = ['tar'], index = trainset.index)

        params = self.study()
        model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor",**params)
        print("Training starts...")
        for fold, (train_idx, test_idx) in enumerate(kf.split(self.data, self.target)):
            print("fold_",fold," done !")
            xt = trainset.iloc[train_idx, :]
            yt = train_dhi.iloc[train_idx]
            xv = trainset.iloc[test_idx, :]
            yv = train_dhi.iloc[test_idx]
            model.fit(xt, yt)
            pred = model.predict(xv)
            loss = mean_squared_error(yv, pred)

            oof_preds.loc[test_idx,'tar'] = pred #properly keeping oof preditcions according to test idx

            filename = self.save_name+".json"
            if(loss<best_loss):
#                 model.save_model(self.save_name+".json")
                pickle.dump(model, open(filename, "wb"))
                best_loss = loss
        
        return oof_preds

In [6]:
train = Train(trainset, train_dhi, 'model_DHI_1')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_1",index=False)

[32m[I 2023-04-12 14:38:21,079][0m A new study created in memory with name: no-name-cfbee109-152e-403f-ace8-f9eaa91881e5[0m
[32m[I 2023-04-12 14:39:01,651][0m Trial 0 finished with value: -451.45316439907435 and parameters: {'booster': 'gbtree', 'lambda': 0.04963428514251636, 'alpha': 0.0016729162206134687, 'eta': 0.03411385299897043, 'gamma': 8.778462725252736e-07, 'max_depth': 9, 'n_estimators': 183, 'learning_rate': 0.5902054790496137, 'subsample': 0.9826100621520778, 'colsample_bytree': 0.6853282558656341}. Best is trial 0 with value: -451.45316439907435.[0m
[32m[I 2023-04-12 14:39:14,014][0m Trial 1 finished with value: -1450.9228880118103 and parameters: {'booster': 'gblinear', 'lambda': 9.805004475155398e-07, 'alpha': 4.00362329682322e-05, 'eta': 7.549602456960428e-05, 'gamma': 1.4714213796502855e-08, 'max_depth': 6, 'n_estimators': 158, 'learning_rate': 0.06623118772032435, 'subsample': 0.21241574536949145, 'colsample_bytree': 0.9425640425829985}. Best is trial 0 with v

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [7]:
trainset.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI
0,2009,1,1,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0,-0.089561
1,2009,1,1,0,30,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0,-0.02662
2,2009,1,1,1,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0,0.154262
3,2009,1,1,1,30,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0,2.839067
4,2009,1,1,2,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0,0.73616


In [8]:
trainset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175296 entries, 0 to 175295
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Year                175296 non-null  int64  
 1   Month               175296 non-null  int64  
 2   Day                 175296 non-null  int64  
 3   Hour                175296 non-null  int64  
 4   Minute              175296 non-null  int64  
 5   Cloud Type          175296 non-null  int64  
 6   Dew Point           175296 non-null  float64
 7   Temperature         175296 non-null  float64
 8   Pressure            175296 non-null  int64  
 9   Relative Humidity   175296 non-null  float64
 10  Solar Zenith Angle  175296 non-null  float64
 11  Precipitable Water  175296 non-null  float64
 12  Wind Direction      175296 non-null  float64
 13  Wind Speed          175296 non-null  float64
 14  Fill Flag           175296 non-null  int64  
 15  Clearsky DHI        175296 non-nul

In [9]:
### trainset -> R+DHI
train = Train(trainset, train_ghi, 'model_GHI_1')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_1",index=False)

[32m[I 2023-04-12 14:41:45,295][0m A new study created in memory with name: no-name-d8a67227-66b8-473f-a43d-433305c357f3[0m
[32m[I 2023-04-12 14:42:28,115][0m Trial 0 finished with value: -165863.35703820185 and parameters: {'booster': 'gbtree', 'lambda': 0.01927563686102126, 'alpha': 0.0010027484854845357, 'eta': 0.013062705614748222, 'gamma': 0.003395159801941992, 'max_depth': 9, 'n_estimators': 198, 'learning_rate': 8.966155001496446e-05, 'subsample': 0.8743731506582699, 'colsample_bytree': 0.7632741482270398}. Best is trial 0 with value: -165863.35703820185.[0m
[32m[I 2023-04-12 14:43:23,200][0m Trial 1 finished with value: -654.1563625156328 and parameters: {'booster': 'gbtree', 'lambda': 0.0014465031126322933, 'alpha': 0.302156554458262, 'eta': 0.4071506202095619, 'gamma': 0.0035888255619629114, 'max_depth': 10, 'n_estimators': 156, 'learning_rate': 0.03692498666328662, 'subsample': 0.20302847169343302, 'colsample_bytree': 0.3474825444834563}. Best is trial 1 with value: 

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [10]:
trainset.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI,Clearsky GHI
0,2009,1,1,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0,-0.089561,0.00178
1,2009,1,1,0,30,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0,-0.02662,0.111337
2,2009,1,1,1,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0,0.154262,-0.00206
3,2009,1,1,1,30,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0,2.839067,-0.045307
4,2009,1,1,2,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0,0.73616,-0.320229


In [11]:
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_1')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_1",index=False)

[32m[I 2023-04-12 14:48:29,943][0m A new study created in memory with name: no-name-be1662c6-d127-48ba-b3a8-cd7cb48e06c8[0m
[32m[I 2023-04-12 14:49:06,012][0m Trial 0 finished with value: -205309.74846057166 and parameters: {'booster': 'dart', 'lambda': 0.002286868760881918, 'alpha': 0.02276184766797106, 'eta': 8.194978887106593e-08, 'gamma': 0.05717420911022738, 'max_depth': 8, 'n_estimators': 106, 'learning_rate': 0.0009785302724657277, 'subsample': 0.23452414352504647, 'colsample_bytree': 0.244081834007176}. Best is trial 0 with value: -205309.74846057166.[0m
[32m[I 2023-04-12 14:50:08,412][0m Trial 1 finished with value: -235315.9208585259 and parameters: {'booster': 'dart', 'lambda': 0.09454359216886249, 'alpha': 0.0005142653081430859, 'eta': 0.0003411046218554219, 'gamma': 0.02775492445852555, 'max_depth': 7, 'n_estimators': 155, 'learning_rate': 0.00014187626261200135, 'subsample': 0.7427221921070697, 'colsample_bytree': 0.46732249879248206}. Best is trial 0 with value: 

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [12]:
trainset = trainset.drop(['Clearsky DHI'], axis = 1)
### trainset -> R+GHI+DNI
train = Train(trainset, train_dhi, 'model_DHI_2')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_2",index=False)

[32m[I 2023-04-12 14:54:42,294][0m A new study created in memory with name: no-name-7c4cac69-3248-4aaf-b779-e6c8b1cfaf80[0m


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistake

[32m[I 2023-04-12 14:54:54,969][0m Trial 0 finished with value: -718.1727288376769 and parameters: {'booster': 'gblinear', 'lambda': 0.09582774137922452, 'alpha': 0.5509458853728786, 'eta': 0.035116302003519013, 'gamma': 8.184723658278211e-07, 'max_depth': 10, 'n_estimators': 143, 'learning_rate': 0.004513368426286492, 'subsample': 0.5011867993252058, 'colsample_bytree': 0.44752343717226256}. Best is trial 0 with value: -718.1727288376769.[0m
[32m[I 2023-04-12 14:56:12,591][0m Trial 1 finished with value: -6709.119975650765 and parameters: {'booster': 'dart', 'lambda': 0.2293655812450268, 'alpha': 5.210666485568893e-06, 'eta': 0.03702580624828988, 'gamma': 2.230735764796613e-08, 'max_depth': 5, 'n_estimators': 185, 'learning_rate': 7.519314698827544e-05, 'subsample': 0.6098386649569929, 'colsample_bytree': 0.38975342772425114}. Best is trial 0 with value: -718.1727288376769.[0m
[32m[I 2023-04-12 14:56:29,151][0m Trial 2 finished with value: -1495.5094005407068 and parameters: {

Training starts...
fold_ 0  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


fold_ 1  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


fold_ 2  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some pa

In [13]:
trainset = trainset.drop(['Clearsky GHI'], axis = 1)
### trainset -> R+DHI+DNI
train = Train(trainset, train_ghi, 'model_GHI_2')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_2",index=False)

[32m[I 2023-04-12 14:58:52,186][0m A new study created in memory with name: no-name-83fa3b06-2a59-43ce-8a9f-3c7a3f5e4af1[0m
[32m[I 2023-04-12 15:00:05,824][0m Trial 0 finished with value: -69.61190069434764 and parameters: {'booster': 'dart', 'lambda': 0.09610749263322142, 'alpha': 0.7749943320238222, 'eta': 0.003869409109301468, 'gamma': 1.1434942752668794e-08, 'max_depth': 8, 'n_estimators': 163, 'learning_rate': 0.4044220498966254, 'subsample': 0.911547202164159, 'colsample_bytree': 0.6055350280553088}. Best is trial 0 with value: -69.61190069434764.[0m
[32m[I 2023-04-12 15:00:42,932][0m Trial 1 finished with value: -111.79171907145053 and parameters: {'booster': 'dart', 'lambda': 0.12139804795742189, 'alpha': 0.0009974262408564796, 'eta': 0.00010005025550394787, 'gamma': 0.009371878445425098, 'max_depth': 6, 'n_estimators': 123, 'learning_rate': 0.06672078441641759, 'subsample': 0.8009826546437288, 'colsample_bytree': 0.4219984276793158}. Best is trial 0 with value: -69.611

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [14]:
trainset = trainset.drop(['Clearsky DNI'], axis = 1)
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_2')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_2",index=False)

[32m[I 2023-04-12 15:07:53,244][0m A new study created in memory with name: no-name-e979c3e2-46d6-4065-a411-0f4eea174121[0m
[32m[I 2023-04-12 15:08:00,662][0m Trial 0 finished with value: -955.7183088759908 and parameters: {'booster': 'gbtree', 'lambda': 1.0616672661795683e-05, 'alpha': 0.0035816041364473343, 'eta': 0.4664903479570602, 'gamma': 0.020082850000645226, 'max_depth': 7, 'n_estimators': 119, 'learning_rate': 0.09970786552010218, 'subsample': 0.4269515497959054, 'colsample_bytree': 0.8183051714692066}. Best is trial 0 with value: -955.7183088759908.[0m
[32m[I 2023-04-12 15:08:12,375][0m Trial 1 finished with value: -18085.335421893 and parameters: {'booster': 'gblinear', 'lambda': 0.0011178694946209252, 'alpha': 1.2318079154389973e-06, 'eta': 0.03723925832322917, 'gamma': 7.408671748862204e-07, 'max_depth': 9, 'n_estimators': 132, 'learning_rate': 0.269213830677103, 'subsample': 0.6958557051079624, 'colsample_bytree': 0.8887906242692927}. Best is trial 0 with value: -


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistak

[32m[I 2023-04-12 15:10:27,661][0m Trial 5 finished with value: -38155.12533035218 and parameters: {'booster': 'gblinear', 'lambda': 1.1288610696400487e-05, 'alpha': 0.00010913045979237252, 'eta': 6.868043720054991e-08, 'gamma': 3.7374934234345536e-05, 'max_depth': 5, 'n_estimators': 149, 'learning_rate': 0.0044396359359289575, 'subsample': 0.6988640135609904, 'colsample_bytree': 0.6447821221925454}. Best is trial 0 with value: -955.7183088759908.[0m
[32m[I 2023-04-12 15:11:01,047][0m Trial 6 finished with value: -242186.62094303803 and parameters: {'booster': 'gbtree', 'lambda': 1.1940163820791017e-08, 'alpha': 0.005047326009898305, 'eta': 0.2028793860323499, 'gamma': 0.05738126503405069, 'max_depth': 10, 'n_estimators': 153, 'learning_rate': 6.392111938377506e-05, 'subsample': 0.3413082681510625, 'colsample_bytree': 0.12363647751998515}. Best is trial 0 with value: -955.7183088759908.[0m
[32m[I 2023-04-12 15:11:10,510][0m Trial 7 finished with value: -17916.138999820014 and p

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [15]:
trainset = trainset.drop(['Clearsky DHI'], axis = 1)
### trainset -> R+GHI+DNI
train = Train(trainset, train_dhi, 'model_DHI_3')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_3",index=False)

[32m[I 2023-04-12 15:11:15,400][0m A new study created in memory with name: no-name-4bc86a8a-7815-496f-be8b-985e30c4d95c[0m
[32m[I 2023-04-12 15:12:59,152][0m Trial 0 finished with value: -6761.624820359255 and parameters: {'booster': 'dart', 'lambda': 1.0028716686948123e-06, 'alpha': 0.07978480016273361, 'eta': 0.26274640114447206, 'gamma': 6.5323935003084e-07, 'max_depth': 9, 'n_estimators': 176, 'learning_rate': 5.229310716390823e-05, 'subsample': 0.9012597599632457, 'colsample_bytree': 0.9429471409894913}. Best is trial 0 with value: -6761.624820359255.[0m
[32m[I 2023-04-12 15:14:03,395][0m Trial 1 finished with value: -4142.102903554947 and parameters: {'booster': 'dart', 'lambda': 4.796092733577649e-07, 'alpha': 0.002712144551292186, 'eta': 0.0327073268255046, 'gamma': 1.2167069788549026e-05, 'max_depth': 8, 'n_estimators': 161, 'learning_rate': 0.0032347031820623624, 'subsample': 0.7020679914622521, 'colsample_bytree': 0.08751573087234038}. Best is trial 1 with value: -4


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistak

[32m[I 2023-04-12 15:14:13,712][0m Trial 2 finished with value: -101.11992530731318 and parameters: {'booster': 'gblinear', 'lambda': 0.3382976121828497, 'alpha': 3.496384279736765e-05, 'eta': 9.344829193132588e-07, 'gamma': 0.00053383563178997, 'max_depth': 6, 'n_estimators': 118, 'learning_rate': 0.6152228708612097, 'subsample': 0.2899027647551453, 'colsample_bytree': 0.7170146164739789}. Best is trial 2 with value: -101.11992530731318.[0m
[32m[I 2023-04-12 15:14:30,228][0m Trial 3 finished with value: -245.7077141722296 and parameters: {'booster': 'gblinear', 'lambda': 0.002801653149631, 'alpha': 6.28211887503714e-07, 'eta': 0.00019503768384429918, 'gamma': 4.8956756275431735e-05, 'max_depth': 6, 'n_estimators': 195, 'learning_rate': 0.00663970330006605, 'subsample': 0.7277058932713151, 'colsample_bytree': 0.35479750638639007}. Best is trial 2 with value: -101.11992530731318.[0m
[32m[I 2023-04-12 15:15:24,380][0m Trial 4 finished with value: -6097.819935075784 and parameters

Training starts...
fold_ 0  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


fold_ 1  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


fold_ 2  done !
Parameters: { "colsample_bytree", "gamma", "max_depth", "predictor", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some pa

In [16]:
trainset = trainset.drop(['Clearsky GHI'], axis = 1)
### trainset -> R+DHI+DNI
train = Train(trainset, train_ghi, 'model_GHI_3')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_3",index=False)

[32m[I 2023-04-12 15:17:06,786][0m A new study created in memory with name: no-name-47b986df-c78d-4d2f-b9de-080c71fa9343[0m
[32m[I 2023-04-12 15:17:19,423][0m Trial 0 finished with value: -25107.325957534144 and parameters: {'booster': 'gblinear', 'lambda': 3.5972336409974322e-06, 'alpha': 0.001465463536995171, 'eta': 3.00181490996751e-07, 'gamma': 0.008419641880083121, 'max_depth': 6, 'n_estimators': 136, 'learning_rate': 0.049072402731112436, 'subsample': 0.11098389400451483, 'colsample_bytree': 0.38874256946219427}. Best is trial 0 with value: -25107.325957534144.[0m
[32m[I 2023-04-12 15:17:49,159][0m Trial 1 finished with value: -773.0514653487398 and parameters: {'booster': 'gbtree', 'lambda': 0.03289301013633662, 'alpha': 0.005379768247142283, 'eta': 2.3611285623069776e-06, 'gamma': 0.00045651793869910855, 'max_depth': 9, 'n_estimators': 162, 'learning_rate': 0.33576601901105807, 'subsample': 0.670383939252285, 'colsample_bytree': 0.45016346125709295}. Best is trial 1 wit

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [17]:
trainset = trainset.drop(['Clearsky DNI'], axis = 1)
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_3')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_3",index=False)

[32m[I 2023-04-12 15:19:09,813][0m A new study created in memory with name: no-name-c6c829ae-6821-4569-a2f5-6bdda2e4a21f[0m
[32m[I 2023-04-12 15:19:41,224][0m Trial 0 finished with value: -2402.220949915069 and parameters: {'booster': 'gbtree', 'lambda': 1.1796685892433106e-08, 'alpha': 0.0012953407157937087, 'eta': 5.6764390891731574e-08, 'gamma': 6.6424656505621496e-06, 'max_depth': 10, 'n_estimators': 111, 'learning_rate': 0.47236325937711365, 'subsample': 0.4885335344541809, 'colsample_bytree': 0.5661775176409498}. Best is trial 0 with value: -2402.220949915069.[0m
[32m[I 2023-04-12 15:19:57,360][0m Trial 1 finished with value: -121619.26023947392 and parameters: {'booster': 'gblinear', 'lambda': 6.7773074957478e-07, 'alpha': 0.13779478544664447, 'eta': 0.002660362952624295, 'gamma': 3.0583162533574746e-06, 'max_depth': 7, 'n_estimators': 181, 'learning_rate': 0.0003904737496215719, 'subsample': 0.29231672101630296, 'colsample_bytree': 0.4305900316203956}. Best is trial 0 w

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !
