In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, SCORERS
from xgboost import XGBRegressor,XGBClassifier
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
import pickle
from glob import glob
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Data prep: load the competition CSVs and separate the three targets
# from the feature columns.
target_cols = ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']

trainset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/train.csv")
# The test file also carries the target columns; remove them so only features remain.
testset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/test.csv").drop(columns=target_cols)

# Keep the targets in their own frame, then strip them from the training features.
Y_cols = trainset[target_cols]
trainset = trainset.drop(columns=target_cols)

# One Series per target, unpacked in the same order as target_cols (DHI, DNI, GHI).
train_dhi, train_dni, train_ghi = (Y_cols[name] for name in target_cols)

# Preparing Data for metamodel

In [3]:
# model = pickle.load(open("/kaggle/input/wipro-2-linear-reg/model_DHI_1.json",'rb'))

In [4]:
# model.predict(testset)

In [5]:
def prepare_stack_data(dirc, model_ID , lab, itr, stack_train, stack_test, testset):
    """Add one base model's predictions to the stacking frames.

    Loads the pickled model ``{dirc}model_{lab}_{itr}.json``, predicts on
    ``testset`` and stores the result as a float column named
    ``{model_ID}_{lab}{itr}`` in the test stacking frame. The matching
    out-of-fold predictions are read from the ``tar`` column of
    ``{dirc}oof_pred_{lab}_{itr}`` (retried with a ``.csv`` suffix when that
    file does not exist) and stored under the same column name in the train
    stacking frame. The test predictions are also attached to the test
    features as the float column ``Clearsky {lab}``.

    The three input frames are never modified in place; callers must use the
    returned frames.

    Parameters
    ----------
    dirc : str
        Directory prefix of the model / OOF files. It is concatenated as-is,
        so it must end with a path separator.
    model_ID : str
        Short model tag used as the prefix of the stacking column name.
    lab : str
        Target label used in the file names and column names (e.g. "DHI").
    itr : int
        Iteration number used in the file names and column names.
    stack_train, stack_test : pandas.DataFrame
        Stacking frames collected so far (may be empty).
    testset : pandas.DataFrame
        Features passed to ``model.predict``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(stack_train, stack_test, testset)`` as new frames with the columns
        described above added.

    Raises
    ------
    FileNotFoundError
        If the model file, or both OOF file candidates, are missing.
    KeyError
        If the OOF file has no ``tar`` column.
    """
    stack_col = f"{model_ID}_{lab}{itr}"
    target_col = f"Clearsky {lab}"

    # NOTE(security): pickle.load can execute arbitrary code stored in the file;
    # only point `dirc` at trusted model artifacts.
    with open(dirc + f"model_{lab}_{itr}.json", "rb") as model_file:
        model = pickle.load(model_file)
    preds = model.predict(testset)

    # assign() works on a copy, so the caller's frame is left untouched.
    stack_test = stack_test.assign(**{stack_col: preds}).astype({stack_col: float})

    oof_path = dirc + f"oof_pred_{lab}_{itr}"
    try:
        oof_df = pd.read_csv(oof_path)
    except (FileNotFoundError, IsADirectoryError):
        # Some runs saved the OOF file with an explicit .csv suffix. Only a
        # missing file triggers the retry; parse errors and interrupts propagate.
        oof_df = pd.read_csv(oof_path + ".csv")
    stack_train = stack_train.assign(**{stack_col: oof_df["tar"]}).astype({stack_col: float})

    # Expose this model's test predictions as a feature for later models.
    testset = testset.assign(**{target_col: preds}).astype({target_col: float})

    return stack_train, stack_test, testset

In [6]:
# Start with one empty (train, test) pair of stacking frames per target;
# prepare_stack_data adds one column per base model to each of them.
stack_train_DHI, stack_test_DHI = pd.DataFrame(), pd.DataFrame()
stack_train_GHI, stack_test_GHI = pd.DataFrame(), pd.DataFrame()
stack_train_DNI, stack_test_DNI = pd.DataFrame(), pd.DataFrame()

In [7]:
# XGBoost base models.
dirc = "/kaggle/input/wipro-2-xgboost/"
model_ID = "XGB"

# Three rounds over the targets in the order DHI -> GHI -> DNI.
# From round 2 on, a target's previous prediction is dropped from testset right
# before that target's model runs, so the model predicts from the other
# targets' latest "Clearsky ..." columns and then re-adds its own.
for itr in range(1, 4):
    later_round = itr > 1

    if later_round:
        testset = testset.drop(columns=["Clearsky DHI"])
    stack_train_DHI, stack_test_DHI, testset = prepare_stack_data(
        dirc, model_ID, "DHI", itr, stack_train_DHI, stack_test_DHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky GHI"])
    stack_train_GHI, stack_test_GHI, testset = prepare_stack_data(
        dirc, model_ID, "GHI", itr, stack_train_GHI, stack_test_GHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky DNI"])
    stack_train_DNI, stack_test_DNI, testset = prepare_stack_data(
        dirc, model_ID, "DNI", itr, stack_train_DNI, stack_test_DNI, testset)

In [8]:
# Remove the predictions left on testset by the previous model family so the
# next family starts from the plain feature columns.
testset = testset.drop(columns=["Clearsky DHI", "Clearsky GHI", "Clearsky DNI"])

In [9]:
# Linear-regression base models.
dirc = "/kaggle/input/wipro-2-linear-reg/"
model_ID = "LinReg"

# Three rounds over the targets in the order DHI -> GHI -> DNI.
# From round 2 on, a target's previous prediction is dropped from testset right
# before that target's model runs, so the model predicts from the other
# targets' latest "Clearsky ..." columns and then re-adds its own.
for itr in range(1, 4):
    later_round = itr > 1

    if later_round:
        testset = testset.drop(columns=["Clearsky DHI"])
    stack_train_DHI, stack_test_DHI, testset = prepare_stack_data(
        dirc, model_ID, "DHI", itr, stack_train_DHI, stack_test_DHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky GHI"])
    stack_train_GHI, stack_test_GHI, testset = prepare_stack_data(
        dirc, model_ID, "GHI", itr, stack_train_GHI, stack_test_GHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky DNI"])
    stack_train_DNI, stack_test_DNI, testset = prepare_stack_data(
        dirc, model_ID, "DNI", itr, stack_train_DNI, stack_test_DNI, testset)

In [10]:
# Remove the predictions left on testset by the previous model family so the
# next family starts from the plain feature columns.
testset = testset.drop(columns=["Clearsky DHI", "Clearsky GHI", "Clearsky DNI"])

In [11]:
# Random-forest base models.
dirc = "/kaggle/input/wipro-2-random-forest/"
model_ID = "RF"

# Three rounds over the targets in the order DHI -> GHI -> DNI.
# From round 2 on, a target's previous prediction is dropped from testset right
# before that target's model runs, so the model predicts from the other
# targets' latest "Clearsky ..." columns and then re-adds its own.
for itr in range(1, 4):
    later_round = itr > 1

    if later_round:
        testset = testset.drop(columns=["Clearsky DHI"])
    stack_train_DHI, stack_test_DHI, testset = prepare_stack_data(
        dirc, model_ID, "DHI", itr, stack_train_DHI, stack_test_DHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky GHI"])
    stack_train_GHI, stack_test_GHI, testset = prepare_stack_data(
        dirc, model_ID, "GHI", itr, stack_train_GHI, stack_test_GHI, testset)

    if later_round:
        testset = testset.drop(columns=["Clearsky DNI"])
    stack_train_DNI, stack_test_DNI, testset = prepare_stack_data(
        dirc, model_ID, "DNI", itr, stack_train_DNI, stack_test_DNI, testset)

In [12]:
# dirc = "/kaggle/input/wipro-2/"
# model_ID = "Lin_Reg"
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DHI,stack_test_DHI,testset = prepare_stack_data(dirc,model ,model_ID, "DHI", 1, stack_train_DHI, stack_test_DHI, testset)

# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_GHI,stack_test_GHI,testset = prepare_stack_data(dirc,model,model_ID , "GHI", 1, stack_train_GHI, stack_test_GHI,testset)

# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DNI,stack_test_DNI,testset = prepare_stack_data(dirc,model,model_ID , "DNI", 1, stack_train_DNI, stack_test_DNI,testset)

# testset = testset.drop(["Clearsky DHI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DHI,stack_test_DHI,testset = prepare_stack_data(dirc,model,model_ID , "DHI", 2, stack_train_DHI, stack_test_DHI,testset)

# testset = testset.drop(["Clearsky GHI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_GHI,stack_test_GHI,testset = prepare_stack_data(dirc,model,model_ID , "GHI", 2, stack_train_GHI, stack_test_GHI,testset)

# testset = testset.drop(["Clearsky DNI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DNI,stack_test_DNI,testset = prepare_stack_data(dirc,model,model_ID , "DNI", 2, stack_train_DNI, stack_test_DNI,testset)

# testset = testset.drop(["Clearsky DHI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DHI,stack_test_DHI,testset = prepare_stack_data(dirc,model,model_ID , "DHI", 3, stack_train_DHI, stack_test_DHI,testset)

# testset = testset.drop(["Clearsky GHI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_GHI,stack_test_GHI,testset = prepare_stack_data(dirc,model,model_ID , "GHI", 3, stack_train_GHI, stack_test_GHI,testset)

# testset = testset.drop(["Clearsky DNI"],axis =1)
# model = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, predictor = "gpu_predictor")
# stack_train_DNI,stack_test_DNI,testset = prepare_stack_data(dirc,model,model_ID , "DNI", 3, stack_train_DNI, stack_test_DNI,testset)

In [13]:
# Preview the first five rows of the DHI training (out-of-fold) stacking features.
# NOTE(review): in the output captured below every RF_* value is 0.0 and the
# LinReg_* values are far from the XGB_* ones; confirm the base-model
# artifacts are the intended ones.
stack_train_DHI.head(n=5)

Unnamed: 0,XGB_DHI1,XGB_DHI2,XGB_DHI3,LinReg_DHI1,LinReg_DHI2,LinReg_DHI3,RF_DHI1,RF_DHI2,RF_DHI3
0,-0.089561,-3.9644,0.128305,23.50655,23.264028,23.258006,0.0,0.0,0.0
1,-0.02662,-4.822152,0.010157,15.291682,15.919437,15.880744,0.0,0.0,0.0
2,0.154262,-4.616228,-0.094219,13.819152,14.041488,14.030194,0.0,0.0,0.0
3,2.839067,-5.868361,-0.084351,6.928181,6.717554,6.726092,0.0,0.0,0.0
4,0.73616,-6.009462,0.27856,1.305088,1.757409,1.75215,0.0,0.0,0.0


In [14]:
# Preview the first five rows of the DHI test stacking features.
stack_test_DHI.head(n=5)

Unnamed: 0,XGB_DHI1,XGB_DHI2,XGB_DHI3,LinReg_DHI1,LinReg_DHI2,LinReg_DHI3,RF_DHI1,RF_DHI2,RF_DHI3
0,-4.045102,1.084949,1.039269,25.418788,24.77156,24.774456,0.0,0.0,0.0
1,-3.95762,0.672973,0.741555,16.350404,15.5822,15.575658,0.0,0.0,0.0
2,-3.739697,1.138231,0.598435,11.983752,11.375453,11.372709,0.0,0.0,0.0
3,-4.605823,0.809444,0.310337,5.848397,5.091357,5.084863,0.0,0.0,0.0
4,-4.710551,0.888854,0.018738,-1.622222,-2.227255,-2.234341,0.0,0.0,0.0


In [15]:
# Preview the first five rows of the GHI training (out-of-fold) stacking features.
stack_train_GHI.head(n=5)

Unnamed: 0,XGB_GHI1,XGB_GHI2,XGB_GHI3,LinReg_GHI1,LinReg_GHI2,LinReg_GHI3,RF_GHI1,RF_GHI2,RF_GHI3
0,0.00178,-0.553557,-0.073791,23.267277,23.257899,23.259674,0.0,0.0,0.0
1,0.111337,-0.082075,0.088011,15.875487,15.892967,15.868999,0.0,0.0,0.0
2,-0.00206,0.016008,0.048992,14.030578,14.034217,14.026379,0.0,0.0,0.0
3,-0.045307,-0.175781,-0.002855,6.712169,6.727105,6.724014,0.0,0.0,0.0
4,-0.320229,0.188091,0.023539,1.766179,1.750169,1.753924,0.0,0.0,0.0


In [16]:
# Preview the first five rows of the GHI test stacking features.
stack_test_GHI.head(n=5)

Unnamed: 0,XGB_GHI1,XGB_GHI2,XGB_GHI3,LinReg_GHI1,LinReg_GHI2,LinReg_GHI3,RF_GHI1,RF_GHI2,RF_GHI3
0,0.355365,0.674272,-0.069084,24.756206,24.77849,24.777716,0.0,0.0,0.0
1,0.118394,0.603244,-0.045682,15.572785,15.579769,15.579046,0.0,0.0,0.0
2,-0.312485,0.589465,-0.045682,11.368717,11.375295,11.374966,0.0,0.0,0.0
3,-0.271834,0.491726,-0.045682,5.082207,5.088722,5.087388,0.0,0.0,0.0
4,-0.581673,0.302241,0.012626,-2.228793,-2.232464,-2.232713,0.0,0.0,0.0


In [17]:
# Preview the first five rows of the DNI training (out-of-fold) stacking features.
stack_train_DNI.head(n=5)

Unnamed: 0,XGB_DNI1,XGB_DNI2,XGB_DNI3,LinReg_DNI1,LinReg_DNI2,LinReg_DNI3,RF_DNI1,RF_DNI2,RF_DNI3
0,0.070526,0.085932,0.009572,23.260558,23.258859,23.256621,0.0,0.0,0.0
1,0.036012,-0.043518,0.014392,15.902838,15.901743,15.88732,0.0,0.0,0.0
2,-0.025074,-0.047549,0.014392,14.036753,14.03721,14.032653,0.0,0.0,0.0
3,-0.013463,-0.033017,0.01709,6.723039,6.726538,6.728223,0.0,0.0,0.0
4,0.051376,0.022379,0.020347,1.752698,1.749841,1.749517,0.0,0.0,0.0


In [18]:
# Preview the first five rows of the DNI test stacking features.
stack_test_DNI.head(n=5)

Unnamed: 0,XGB_DNI1,XGB_DNI2,XGB_DNI3,LinReg_DNI1,LinReg_DNI2,LinReg_DNI3,RF_DNI1,RF_DNI2,RF_DNI3
0,0.01115,0.011308,0.016322,24.775132,24.782453,24.772826,0.0,0.0,0.0
1,0.01115,0.012839,0.016322,15.580816,15.583942,15.573857,0.0,0.0,0.0
2,-0.053133,0.003287,0.016322,11.375297,11.377894,11.371334,0.0,0.0,0.0
3,-0.053133,0.003287,0.016322,5.090155,5.092194,5.083503,0.0,0.0,0.0
4,-0.053133,0.003301,0.016322,-2.230003,-2.230591,-2.235526,0.0,0.0,0.0


# Stacking

In [19]:
# Level-2 model for DHI: a linear regression fitted on the base models'
# out-of-fold predictions, then applied to their test-set predictions.
metamodel = LinearRegression().fit(stack_train_DHI, train_dhi)
dhi_final_pred = metamodel.predict(stack_test_DHI)

In [20]:
# Level-2 model for GHI: a linear regression fitted on the base models'
# out-of-fold predictions, then applied to their test-set predictions.
metamodel = LinearRegression().fit(stack_train_GHI, train_ghi)
ghi_final_pred = metamodel.predict(stack_test_GHI)

In [21]:
# Level-2 model for DNI: a linear regression fitted on the base models'
# out-of-fold predictions, then applied to their test-set predictions.
metamodel = LinearRegression().fit(stack_train_DNI, train_dni)
dni_final_pred = metamodel.predict(stack_test_DNI)

In [22]:
# Assemble the submission with the columns in the order DHI, DNI, GHI.
# NOTE(review): the preview captured at the end of this notebook shows negative
# "Clearsky GHI" values; confirm whether predictions should be clipped at 0.
submission = pd.DataFrame(
    {
        'Clearsky DHI': dhi_final_pred,
        'Clearsky DNI': dni_final_pred,
        'Clearsky GHI': ghi_final_pred,
    }
)

In [23]:
# Write the submission file without the row index.
submission.to_csv(path_or_buf='Stacking submit.csv', index=False)

In [24]:
# Read the written file back to check what was actually saved.
sub = pd.read_csv(filepath_or_buffer='Stacking submit.csv')

In [25]:
# Preview the first five rows of the saved submission.
sub.head(n=5)

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,0.818458,104.429824,20.616908
1,0.697591,73.632291,5.626701
2,0.659407,59.657841,-2.941436
3,0.655558,39.698393,-12.488219
4,0.612758,15.229577,-24.625288
