In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, SCORERS
from xgboost import XGBRegressor,XGBClassifier
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor
import pickle
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#data prep
trainset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/train.csv")
testset = pd.read_csv("/kaggle/input/mh-wipro-sustainable-ml-challenge/test.csv")

test = testset.drop(['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'],axis = 1)

Y_cols = trainset.loc[:, ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']]
trainset = trainset.drop(['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI'], axis = 1)

train_dhi = Y_cols['Clearsky DHI']
train_dni = Y_cols['Clearsky DNI']
train_ghi = Y_cols['Clearsky GHI']

In [3]:
trainset.dtypes

Year                    int64
Month                   int64
Day                     int64
Hour                    int64
Minute                  int64
Cloud Type              int64
Dew Point             float64
Temperature           float64
Pressure                int64
Relative Humidity     float64
Solar Zenith Angle    float64
Precipitable Water    float64
Wind Direction        float64
Wind Speed            float64
Fill Flag               int64
dtype: object

In [4]:
train_dhi.dtypes

dtype('int64')

In [5]:
class Train:
    def __init__(self, data, target, save_name):
        super().__init__()
        self.data = data
        self.target = target
        self.save_name = save_name

    def train_loop(self):
        kf = KFold(shuffle = True, random_state = 42)
        best_loss = 1e18
        oof_preds = pd.DataFrame(columns = ['tar'], index = trainset.index)

        model = LinearRegression()
        print("Training starts...")
        for fold, (train_idx, test_idx) in enumerate(kf.split(self.data, self.target)):
            print("fold_",fold," done !")
            xt = trainset.iloc[train_idx, :]
            yt = train_dhi.iloc[train_idx]
            xv = trainset.iloc[test_idx, :]
            yv = train_dhi.iloc[test_idx]
            model.fit(xt, yt)
            pred = model.predict(xv)
            loss = mean_squared_error(yv, pred)

            oof_preds.loc[test_idx,'tar'] = pred #properly keeping oof preditcions according to test idx
            
            filename = self.save_name+".json"
            if(loss<best_loss):
#                 model.save_model(self.save_name+".json")
                pickle.dump(model, open(filename, "wb"))
                best_loss = loss
                
        
        return oof_preds

In [6]:
train = Train(trainset, train_dhi, 'model_DHI_1')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_1.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [7]:
trainset.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI
0,2009,1,1,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0,23.50655
1,2009,1,1,0,30,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0,15.291682
2,2009,1,1,1,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0,13.819152
3,2009,1,1,1,30,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0,6.928181
4,2009,1,1,2,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0,1.305088


In [8]:
trainset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175296 entries, 0 to 175295
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Year                175296 non-null  int64  
 1   Month               175296 non-null  int64  
 2   Day                 175296 non-null  int64  
 3   Hour                175296 non-null  int64  
 4   Minute              175296 non-null  int64  
 5   Cloud Type          175296 non-null  int64  
 6   Dew Point           175296 non-null  float64
 7   Temperature         175296 non-null  float64
 8   Pressure            175296 non-null  int64  
 9   Relative Humidity   175296 non-null  float64
 10  Solar Zenith Angle  175296 non-null  float64
 11  Precipitable Water  175296 non-null  float64
 12  Wind Direction      175296 non-null  float64
 13  Wind Speed          175296 non-null  float64
 14  Fill Flag           175296 non-null  int64  
 15  Clearsky DHI        175296 non-nul

In [9]:
### trainset -> R+DHI
train = Train(trainset, train_ghi, 'model_GHI_1')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_1.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [10]:
trainset.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI,Clearsky GHI
0,2009,1,1,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0,23.50655,23.267277
1,2009,1,1,0,30,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0,15.291682,15.875487
2,2009,1,1,1,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0,13.819152,14.030578
3,2009,1,1,1,30,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0,6.928181,6.712169
4,2009,1,1,2,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0,1.305088,1.766179


In [11]:
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_1')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_1.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [12]:
trainset = trainset.drop(['Clearsky DHI'], axis = 1)
### trainset -> R+GHI+DNI
train = Train(trainset, train_dhi, 'model_DHI_2')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_2.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [13]:
trainset = trainset.drop(['Clearsky GHI'], axis = 1)
### trainset -> R+DHI+DNI
train = Train(trainset, train_ghi, 'model_GHI_2')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_2.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [14]:
trainset = trainset.drop(['Clearsky DNI'], axis = 1)
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_2')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_2.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [15]:
trainset = trainset.drop(['Clearsky DHI'], axis = 1)
### trainset -> R+GHI+DNI
train = Train(trainset, train_dhi, 'model_DHI_3')
pred_df = train.train_loop()

trainset['Clearsky DHI']  = pred_df['tar']
convert_dict = {"Clearsky DHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DHI_3.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [16]:
trainset = trainset.drop(['Clearsky GHI'], axis = 1)
### trainset -> R+DHI+DNI
train = Train(trainset, train_ghi, 'model_GHI_3')
pred_df = train.train_loop()

trainset['Clearsky GHI']  = pred_df['tar']
convert_dict = {"Clearsky GHI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_GHI_3.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !


In [17]:
trainset = trainset.drop(['Clearsky DNI'], axis = 1)
### trainset -> R+DHI+GHI
train = Train(trainset, train_dni, 'model_DNI_3')
pred_df = train.train_loop()

trainset['Clearsky DNI']  = pred_df['tar']
convert_dict = {"Clearsky DNI":float}
trainset = trainset.astype(convert_dict)

pred_df.to_csv("oof_pred_DNI_3.csv",index=False)

Training starts...
fold_ 0  done !
fold_ 1  done !
fold_ 2  done !
fold_ 3  done !
fold_ 4  done !
