In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error , r2_score
import joblib
%matplotlib inline


In [6]:
data = pd.read_csv('./dataset.csv')
data.head()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789
3,01 01 2018 00:30,419.645905,5.659674,516.127569,271.258087
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286


In [7]:
data.shape

(50530, 5)

In [8]:
data.describe()

Unnamed: 0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
count,50530.0,50530.0,50530.0,50530.0
mean,1307.684332,7.557952,1492.175463,123.687559
std,1312.459242,4.227166,1368.018238,93.443736
min,-2.471405,0.0,0.0,0.0
25%,50.67789,4.201395,161.328167,49.315437
50%,825.838074,7.104594,1063.776283,73.712978
75%,2482.507568,10.30002,2964.972462,201.69672
max,3618.73291,25.206011,3600.0,359.997589


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50530 entries, 0 to 50529
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Date/Time                      50530 non-null  object 
 1   LV ActivePower (kW)            50530 non-null  float64
 2   Wind Speed (m/s)               50530 non-null  float64
 3   Theoretical_Power_Curve (KWh)  50530 non-null  float64
 4   Wind Direction (°)             50530 non-null  float64
dtypes: float64(4), object(1)
memory usage: 1.9+ MB


In [10]:
data.isnull().any()

Date/Time                        False
LV ActivePower (kW)              False
Wind Speed (m/s)                 False
Theoretical_Power_Curve (KWh)    False
Wind Direction (°)               False
dtype: bool

DATA PREPROCESSING

In [11]:
data['Date/Time'] = pd.to_datetime(data['Date/Time'],format='%d %m %Y %H:%M')
data['year'] = data['Date/Time'].dt.year 
data['month'] = data['Date/Time'].dt.month 
data['day'] = data['Date/Time'].dt.day
data['Hour'] = data['Date/Time'].dt.hour 
data['minute'] = data['Date/Time'].dt.minute 
data.head()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),year,month,day,Hour,minute
0,2018-01-01 00:00:00,380.047791,5.311336,416.328908,259.994904,2018,1,1,0,0
1,2018-01-01 00:10:00,453.769196,5.672167,519.917511,268.641113,2018,1,1,0,10
2,2018-01-01 00:20:00,306.376587,5.216037,390.900016,272.564789,2018,1,1,0,20
3,2018-01-01 00:30:00,419.645905,5.659674,516.127569,271.258087,2018,1,1,0,30
4,2018-01-01 00:40:00,380.650696,5.577941,491.702972,265.674286,2018,1,1,0,40


In [12]:
data["Date/Time"] = pd.to_datetime(data["Date/Time"], format = "%d %m %Y %H:%M", errors = "coerce")
data

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),year,month,day,Hour,minute
0,2018-01-01 00:00:00,380.047791,5.311336,416.328908,259.994904,2018,1,1,0,0
1,2018-01-01 00:10:00,453.769196,5.672167,519.917511,268.641113,2018,1,1,0,10
2,2018-01-01 00:20:00,306.376587,5.216037,390.900016,272.564789,2018,1,1,0,20
3,2018-01-01 00:30:00,419.645905,5.659674,516.127569,271.258087,2018,1,1,0,30
4,2018-01-01 00:40:00,380.650696,5.577941,491.702972,265.674286,2018,1,1,0,40
...,...,...,...,...,...,...,...,...,...,...
50525,2018-12-31 23:10:00,2963.980957,11.404030,3397.190793,80.502724,2018,12,31,23,10
50526,2018-12-31 23:20:00,1684.353027,7.332648,1173.055771,84.062599,2018,12,31,23,20
50527,2018-12-31 23:30:00,2201.106934,8.435358,1788.284755,84.742500,2018,12,31,23,30
50528,2018-12-31 23:40:00,2515.694092,9.421366,2418.382503,84.297913,2018,12,31,23,40


DATASET SPLITTING

In [14]:
X=data[['Wind Speed (m/s)','Wind Direction (°)']]
X.head()

Unnamed: 0,Wind Speed (m/s),Wind Direction (°)
0,5.311336,259.994904
1,5.672167,268.641113
2,5.216037,272.564789
3,5.659674,271.258087
4,5.577941,265.674286


In [21]:
X_train, X_test,y_train, y_test = train_test_split(X,y ,
                                   random_state=6, 
                                   test_size=0.25)

IMPORTING THE REGRESSION MODELS

In [19]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error
xgr=XGBRegressor()
rf=RandomForestRegressor()
lr=LinearRegression()
dt=DecisionTreeRegressor()
sm=SVR()

FILTERING THE MODELS WITH DATASET

In [22]:
model_xg=xgr.fit(X_train,y_train)
y_xg=model_xg.predict(X_test)
model_rf=rf.fit(X_train,y_train)
y_rf=model_rf.predict(X_test)
model_lr=lr.fit(X_train,y_train)
y_lr=model_lr.predict(X_test)
model_dt=dt.fit(X_train,y_train)
y_dt=model_dt.predict(X_test)
model_sm=sm.fit(X_train,y_train)
y_sm=model_sm.predict(X_test)

CHECKING THE METRICS

In [23]:
print('R2-xgb',r2_score(y_test,y_xg))
print('RMSE-xgb',np.sqrt(mean_squared_error(y_test,y_xg)))

print('R2-rf',r2_score(y_test,y_rf))
print('RMSE-rf',np.sqrt(mean_squared_error(y_test,y_rf)))

print('R2-lr',r2_score(y_test,y_lr))
print('RMSE-lr',np.sqrt(mean_squared_error(y_test,y_lr)))

print('R2-dt',r2_score(y_test,y_dt))
print('RMSE-dt',np.sqrt(mean_squared_error(y_test,y_dt)))

print('R2-svm',r2_score(y_test,y_sm))
print('RMSE-svm',np.sqrt(mean_squared_error(y_test,y_sm)))

R2-xgb 0.9197743106205652
RMSE-xgb 370.6768884049128
R2-rf 0.9091639024761513
RMSE-rf 394.4282510406997
R2-lr 0.8368251429450981
RMSE-lr 528.646547634677
R2-dt 0.8413476524047431
RMSE-dt 521.269159878656
R2-svm 0.005368134807760105
RMSE-svm 1305.1786596858901


HYPERPARAMETER TURNING

In [24]:
params={
 "learning_rate"    : [0.05, 0.01,0.03,0.1, 0.15, 0.2] ,
 "n_estimators"     : [50, 100, 150, 200, 500, 800,1000,1500] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15,20,25],
 "min_child_weight" : [ 1, 3, 5, 7 ,10,15,20,25],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "subsample"        : [ 0.1, 0.2 , 0.3, 0.4,0.6,0.8,1 ],
 "reg_lambda"       : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ,0.6,0.8,1],
 "reg_alpha"        : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7,0.9 ],
 "colsample_bylevel" : [ 0.3, 0.4, 0.5 , 0.7,0.9 ]
    
}

In [25]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [26]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [28]:
random_search=RandomizedSearchCV(xgr,param_distributions=params,n_iter=10,n_jobs=-1,cv=5,verbose=3)
from datetime import datetime
start_time = timer(None)
random_search.fit(X_train,y_train)
timer(start_time)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 3/5] END colsample_bylevel=0.7, colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=3, min_child_weight=10, n_estimators=500, reg_alpha=0.1, reg_lambda=0.2, subsample=0.2;, score=0.906 total time=   1.8s
[CV 1/5] END colsample_bylevel=0.7, colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=3, min_child_weight=10, n_estimators=500, reg_alpha=0.1, reg_lambda=0.2, subsample=0.2;, score=0.908 total time=   1.7s
[CV 2/5] END colsample_bylevel=0.7, colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=3, min_child_weight=10, n_estimators=500, reg_alpha=0.1, reg_lambda=0.2, subsample=0.2;, score=0.904 total time=   1.8s
[CV 4/5] END colsample_bylevel=0.7, colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=3, min_child_weight=10, n_estimators=500, reg_alpha=0.1, reg_lambda=0.2, subsample=0.2;, score=0.910 total time=   1.5s
[CV 5/5] END colsample_bylevel=0.7, colsample_bytree=0.7, gamma=0.0

In [29]:
random_search.best_estimator_

In [30]:
xg=XGBRegressor(colsample_bylevel=0.4, colsample_bytree=0.3, gamma=0.1,
             learning_rate=0.01, max_depth=6, min_child_weight=25,
             n_estimators=1500, reg_alpha=0.1, reg_lambda=0.8, subsample=0.6)
x=xgr.fit(X_train,y_train)
y1=x.predict(X_test)
r2_score(y_test,y1)

0.9197743106205652