## **IMPORTING THE REQUIRED LIBRARIES**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold

## **LOADING THE DATASET**

In [None]:
# Read the wind-farm measurements workbook into a DataFrame and preview the first rows.
df = pd.read_excel(io='Windfarm.xlsx')
df.head()

Unnamed: 0,Time(year-month-day h:m:s),Wind speed at height of 10 meters (m/s),Wind direction at height of 10 meters (˚),Wind speed at height of 30 meters (m/s),Wind direction at height of 30 meters (˚),Wind speed at height of 50 meters (m/s),Wind direction at height of 50 meters (˚),Wind speed - at the height of wheel hub (m/s),Wind speed - at the height of wheel hub (˚),Air temperature (°C),Atmosphere (hpa),Relative humidity (%),Power (MW)
0,2019-01-01 00:00:00,2.209,81.317,1.991,74.814,2.094,77.667,2.494,74.5,-13.484,889.867,76.32,0.254383
1,2019-01-01 00:15:00,1.828,77.46,1.698,75.048,1.757,88.733,1.882,74.367,-13.691,889.575,76.757,0.329703
2,2019-01-01 00:30:00,2.193,86.7,2.313,84.688,2.344,89.1,2.35,89.0,-13.766,889.942,76.981,0.296306
3,2019-01-01 00:45:00,2.654,78.16,2.494,74.939,2.574,87.267,2.808,82.733,-13.691,889.675,76.821,0.18759
4,2019-01-01 01:00:00,2.249,94.297,2.192,91.14,2.558,96.9,2.924,92.967,-13.447,890.025,74.571,0.081005


In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

Time(year-month-day h:m:s)                       False
Wind speed at height of 10 meters (m/s)          False
Wind direction at height of 10 meters (˚)        False
Wind speed at height of 30 meters (m/s)          False
Wind direction at height of 30 meters (˚)        False
Wind speed at height of 50 meters (m/s)          False
Wind direction at height of 50 meters (˚)        False
Wind speed - at the height of wheel hub (m/s)    False
Wind speed - at the height of wheel hub (˚)      False
Air temperature (°C)                             False
Atmosphere (hpa)                                 False
Relative humidity (%)                            False
Power (MW)                                       False
dtype: bool

In [None]:
# Shorten the two verbose column headers (target and timestamp) in a single call.
df.rename(
    columns={
        'Power (MW)': 'Power',
        'Time(year-month-day h:m:s)': 'Time',
    },
    inplace=True,
)
df

Unnamed: 0,Time,Wind speed at height of 10 meters (m/s),Wind direction at height of 10 meters (˚),Wind speed at height of 30 meters (m/s),Wind direction at height of 30 meters (˚),Wind speed at height of 50 meters (m/s),Wind direction at height of 50 meters (˚),Wind speed - at the height of wheel hub (m/s),Wind speed - at the height of wheel hub (˚),Air temperature (°C),Atmosphere (hpa),Relative humidity (%),Power
0,2019-01-01 00:00:00,2.209,81.317,1.991,74.814,2.094,77.667,2.494,74.500,-13.484,889.867,76.320,0.254383
1,2019-01-01 00:15:00,1.828,77.460,1.698,75.048,1.757,88.733,1.882,74.367,-13.691,889.575,76.757,0.329703
2,2019-01-01 00:30:00,2.193,86.700,2.313,84.688,2.344,89.100,2.350,89.000,-13.766,889.942,76.981,0.296306
3,2019-01-01 00:45:00,2.654,78.160,2.494,74.939,2.574,87.267,2.808,82.733,-13.691,889.675,76.821,0.187590
4,2019-01-01 01:00:00,2.249,94.297,2.192,91.140,2.558,96.900,2.924,92.967,-13.447,890.025,74.571,0.081005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70171,2020-12-31 22:45:00,5.738,234.600,6.408,235.454,6.575,242.033,6.680,238.433,-9.108,890.008,53.312,13.633642
70172,2020-12-31 23:00:00,5.748,229.333,6.231,233.423,6.372,239.800,6.448,236.833,-9.978,889.817,53.877,6.869759
70173,2020-12-31 23:15:00,4.468,235.133,4.406,253.261,4.381,263.400,4.399,261.733,-9.808,889.825,54.437,4.352931
70174,2020-12-31 23:30:00,3.730,242.580,3.280,252.492,3.204,268.600,3.446,266.733,-9.633,889.492,54.677,7.185250


## **SPLITTING THE VARIABLES INTO DEPENDENT AND INDEPENDENT VARIABLES**

In [None]:
#dependent variable
y=df['Power']
y

0         0.254383
1         0.329703
2         0.296306
3         0.187590
4         0.081005
           ...    
70171    13.633642
70172     6.869759
70173     4.352931
70174     7.185250
70175    12.384466
Name: Power, Length: 70176, dtype: float64

In [None]:
#independent variable
X=df.drop(columns=['Power','Time'],axis=1)
X.head()

Unnamed: 0,Wind speed at height of 10 meters (m/s),Wind direction at height of 10 meters (˚),Wind speed at height of 30 meters (m/s),Wind direction at height of 30 meters (˚),Wind speed at height of 50 meters (m/s),Wind direction at height of 50 meters (˚),Wind speed - at the height of wheel hub (m/s),Wind speed - at the height of wheel hub (˚),Air temperature (°C),Atmosphere (hpa),Relative humidity (%)
0,2.209,81.317,1.991,74.814,2.094,77.667,2.494,74.5,-13.484,889.867,76.32
1,1.828,77.46,1.698,75.048,1.757,88.733,1.882,74.367,-13.691,889.575,76.757
2,2.193,86.7,2.313,84.688,2.344,89.1,2.35,89.0,-13.766,889.942,76.981
3,2.654,78.16,2.494,74.939,2.574,87.267,2.808,82.733,-13.691,889.675,76.821
4,2.249,94.297,2.192,91.14,2.558,96.9,2.924,92.967,-13.447,890.025,74.571


## **SPLITTING THE DATASET INTO TRAINING DATA AND TESTING DATA**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Display the training feature matrix produced by the train/test split.
X_train

Unnamed: 0,Wind speed at height of 10 meters (m/s),Wind direction at height of 10 meters (˚),Wind speed at height of 30 meters (m/s),Wind direction at height of 30 meters (˚),Wind speed at height of 50 meters (m/s),Wind direction at height of 50 meters (˚),Wind speed - at the height of wheel hub (m/s),Wind speed - at the height of wheel hub (˚),Air temperature (°C),Atmosphere (hpa),Relative humidity (%)
63536,1.631,169.873,1.523,137.859,0.861,142.867,1.412,140.867,9.637,890.017,24.992
21654,4.767,294.853,4.646,292.855,5.020,298.733,5.605,296.467,31.287,889.467,10.667
2093,1.602,259.433,1.888,282.533,3.430,316.200,5.276,325.333,-6.734,889.533,61.360
61223,13.250,288.053,14.901,282.997,14.910,286.033,15.507,284.533,8.494,889.675,58.448
31193,10.090,235.367,11.646,257.968,12.232,262.367,12.430,258.833,-3.739,889.925,60.640
...,...,...,...,...,...,...,...,...,...,...,...
21243,7.177,219.620,8.028,214.812,7.453,218.833,6.667,215.633,21.352,889.583,20.891
45891,1.383,180.110,0.060,89.039,0.506,92.133,1.857,86.000,11.978,889.633,11.776
42613,4.724,249.713,5.846,258.673,6.221,264.933,6.434,262.133,10.316,889.817,41.109
43567,12.263,244.300,13.435,275.286,14.030,277.833,14.614,275.733,2.877,889.792,60.389


In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits so the test set never influences the fit.
# Every model trained below therefore expects inputs that went through `sc`.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## **TRAINING THE MODEL**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a 30-tree random forest with a fixed seed, fitted on the
# scaled training features.
regressor = RandomForestRegressor(random_state=0, n_estimators=30)
regressor.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=30, random_state=0)

## **TESTING THE MODEL**

In [None]:
# Predict power for the held-out (scaled) test features with the baseline forest.
y_pred = regressor.predict(X=X_test)
y_pred

array([33.26070127, 25.18657937,  0.32804497, ...,  1.21720073,
        0.49453063, 70.8779573 ])

In [None]:
# Side-by-side table of observed vs. predicted power for the test rows.
Acc = pd.DataFrame(
    data={
        'Actual_y_value': y_test,
        'Predicted_y_value': y_pred,
    }
)
Acc

Unnamed: 0,Actual_y_value,Predicted_y_value
50585,41.646313,33.260701
1610,26.752834,25.186579
68970,0.186879,0.328045
66444,1.792049,0.463834
38116,0.140692,0.483375
...,...,...
43785,7.865973,8.338973
34494,49.145634,51.113949
30848,1.079351,1.217201
33926,0.900288,0.494531


## **EVALUATING THE MODEL WITH EVALUATION METRICS**

In [None]:
from sklearn import metrics

# Error metrics of the baseline predictions on the held-out test set.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed on the line above

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)


Mean Absolute Error: 4.972486590131098
Mean Squared Error: 67.60709947932519
Root Mean Squared Error: 8.222353645965685


## **HYPERPARAMETER TUNING USING RANDOM SEARCH CV**

In [None]:
# Candidate hyperparameter values sampled by the randomized search.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the string 'auto': for RandomForestRegressor
# 'auto' meant "use all features", but it was deprecated in scikit-learn 1.1
# and removed in 1.3, where candidates using it are rejected at fit time.
max_features = [1.0, 'sqrt']
# Maximum number of levels in each tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or the full training set (False)
bootstrap = [True, False]

# Assemble the search space, keyed by RandomForestRegressor parameter name
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Base estimator to tune. The n_estimators given here is only a placeholder:
# random_grid contains an 'n_estimators' entry, so each sampled candidate
# replaces it.
rf = RandomForestRegressor(n_estimators=10, random_state=0)

# Randomized search: 3 sampled parameter combinations (n_iter), each scored
# with 3-fold cross-validation (cv), using all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=3,
    cv=3,
    verbose=2,
    random_state=4,
    n_jobs=-1,
)

# Run the search on the scaled training data; fit() returns the search object,
# so `result` and `rf_random` refer to the same fitted instance.
result = rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits




## **EVALUATION USING METRICS**

In [None]:
from sklearn.metrics import r2_score

# R^2 on the held-out test set.
# NOTE(review): y_pred was produced by the baseline `regressor`, so this value
# describes the baseline forest, not the tuned `rf_random` model.
r2_score(y_true=y_test, y_pred=y_pred)

0.8849999426996336

In [None]:
# Best score found by the randomized search. Per the scikit-learn docs this is
# the mean cross-validated score of the best candidate (computed on the
# training folds), not a score on the held-out test set.
print('Best Score: %s' % (result.best_score_,))

Best Score: 0.8808203361658858


## **TESTING THE MODEL BY GIVING VALUES**

In [None]:
# One hand-entered sample of raw measurements, in the same column order as X.
sample = pd.DataFrame(
    [[1.9, 74, 2.0, 73, 2.225, 78, 2.56, 75, 10.4, 465, 67]],
    columns=X.columns,
)
# The model was trained on standardized features, so raw values must go through
# the fitted scaler `sc` first; predicting on unscaled values gives a meaningless result.
rf_random.predict(sc.transform(sample))

array([64.02966857])

In [None]:
# One hand-entered sample of raw measurements, in the same column order as X.
sample = pd.DataFrame(
    [[1.9, 74, 2.0, 73, 2.225, 78, 2.56, 75, 10.4, 465, 67]],
    columns=X.columns,
)
# The baseline forest was trained on standardized features, so raw values must go
# through the fitted scaler `sc` first; predicting on unscaled values gives a meaningless result.
regressor.predict(sc.transform(sample))

array([63.76622687])

## **SAVING THE TRAINED MODEL**

In [None]:
import joblib

# The models were trained on standardized features, so the fitted scaler is
# required at inference time; persist it next to the model so that consumers of
# model.pkl can preprocess raw inputs the same way.
joblib.dump(sc, 'scaler.pkl')
# Persist the fitted randomized-search object (file name and content unchanged).
joblib.dump(rf_random, 'model.pkl')

['model.pkl']