# Loading the data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score, mean_absolute_error, max_error

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read SolarPrediction.csv from the Kaggle input directory into the working frame.
# The path is absolute, so it must be adjusted when running outside that directory layout.
mydata = pd.read_csv('/kaggle/input/SolarEnergy/SolarPrediction.csv')

# Data Preprocessing

In [None]:
# Display the freshly loaded frame to inspect its columns before any preprocessing.
mydata

In [None]:
import datetime

# Each timestamp column is parsed exactly once and the resulting index is
# reused for every component, instead of re-parsing the same strings for
# each derived column.

# Date -> Year / Month / Day
date_index = pd.DatetimeIndex(mydata['Data'])
mydata['Year'] = date_index.year
mydata['Month'] = date_index.month
mydata['Day'] = date_index.day

# Time -> Hour / Minute / Second
time_index = pd.DatetimeIndex(mydata['Time'])
mydata['Hour'] = time_index.hour
mydata['Minute'] = time_index.minute
mydata['Second'] = time_index.second

# Sunrise / sunset -> daylight duration
sunrise_index = pd.DatetimeIndex(mydata['TimeSunRise'])
sunset_index = pd.DatetimeIndex(mydata['TimeSunSet'])

# Full sunset-minus-sunrise difference (a timedelta column).
mydata['SunPerDay'] = sunset_index - sunrise_index

# Difference of the hour components only; minutes and seconds are ignored,
# so this is a whole-hour approximation of the daylight duration.
mydata['SunPerDayHours'] = sunset_index.hour - sunrise_index.hour



In [None]:
# The raw timestamp columns and the intermediate SunPerDay column are removed
# in a single call; the frame is still modified in place.
columns_to_remove = ['Time', 'Data', 'TimeSunRise', 'TimeSunSet', 'SunPerDay']
mydata.drop(columns=columns_to_remove, inplace=True)

mydata.head()

In [None]:
# Number of missing values in each column.
mydata.isnull().sum()

The dataset contains no null values.

# Filtering data

In [None]:
# Column names, non-null counts and dtypes of the preprocessed frame.
mydata.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
mydata.describe()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
# with a diverging palette centred on zero.
correlation_matrix = mydata.corr()

fig = plt.figure(figsize=(20, 10))
fig.suptitle('Feature Correlation', fontsize=18)
sns.heatmap(correlation_matrix, cmap='RdBu', center=0, annot=True)

In [None]:
# Remove UNIXTime and Year (the columns the author singled out as
# positively correlated after inspecting the heatmap); done in place.
mydata.drop(columns=['UNIXTime', 'Year'], inplace=True)

In [None]:
# Preview the first rows after UNIXTime and Year have been dropped.
mydata.head()

In [None]:
# Same correlation heatmap as before, recomputed on the reduced column set.
correlation_matrix = mydata.corr()

fig = plt.figure(figsize=(20, 10))
fig.suptitle('Feature Correlation', fontsize=18)
sns.heatmap(correlation_matrix, cmap='RdBu', center=0, annot=True)

In [None]:
# Mean Radiation for each observed Temperature value.
# (seaborn is already imported as `sns` in the first cell.)
fig2 = plt.figure(figsize=(15, 5))
sns.barplot(data=mydata, x='Temperature', y='Radiation')

Radiation tends to increase with temperature, so temperature is an important feature.

In [None]:
# Mean Radiation for each observed Humidity value.
fig3 = plt.figure(figsize=(15, 5))
sns.barplot(data=mydata, x='Humidity', y='Radiation')

# Outliers removal
Applying the z-score to every feature and keeping the rows whose absolute z-score is smaller than our threshold (4).

In [None]:
# Temperature	Pressure	Humidity	WindDirection(Degrees)	Speed	Month	Day	Hour	Minute	Second	SunPerDayHours
threshold = 4

outliers = [] 

for i in mydata['Temperature']: 
    z = (i- np.mean(mydata['Temperature']))/np.std(mydata['Temperature'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Temperature are: ', outliers) 

In [None]:
outliers = [] 

for i in mydata['Pressure']: 
    z = (i- np.mean(mydata['Pressure']))/np.std(mydata['Pressure'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Pressure are: ', outliers)

In [None]:
outliers = [] 

for i in mydata['Humidity']: 
    z = (i- np.mean(mydata['Humidity']))/np.std(mydata['Humidity'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Humidity are: ', outliers)

In [None]:
outliers = [] 

for i in mydata['WindDirection(Degrees)']: 
    z = (i- np.mean(mydata['WindDirection(Degrees)']))/np.std(mydata['WindDirection(Degrees)'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in WindDirection(Degrees): ', outliers)

In [None]:
outliers = [] 

for i in mydata['Speed']: 
    z = (i- np.mean(mydata['Speed']))/np.std(mydata['Speed'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Speed: ', outliers)

In [None]:
outliers = [] 

for i in mydata['Month']: 
    z = (i- np.mean(mydata['Month']))/np.std(mydata['Month'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Month: ', outliers)

In [None]:
threshold = 4
outliers = [] 

for i in mydata['Day']: 
    z = (i- np.mean(mydata['Day']))/np.std(mydata['Day'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Day: ', outliers)

In [None]:
outliers = [] 

for i in mydata['Hour']: 
    z = (i- np.mean(mydata['Hour']))/np.std(mydata['Hour'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Hour: ', outliers)

In [None]:
outliers = [] 

for i in mydata['Minute']: 
    z = (i- np.mean(mydata['Minute']))/np.std(mydata['Minute'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Minute: ', outliers)

In [None]:
outliers = [] 

for i in mydata['Second']: 
    z = (i- np.mean(mydata['Second']))/np.std(mydata['Second'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in Second: ', outliers)

In [None]:
outliers = [] 

for i in mydata['SunPerDayHours']: 
    z = (i- np.mean(mydata['SunPerDayHours']))/np.std(mydata['SunPerDayHours'])
    if z > threshold: 
        outliers.append(i) 
print('The outliers in SunPerDayHours: ', outliers)

In [None]:
# Histogram with a KDE overlay for each continuous weather feature,
# laid out on a 2x3 grid (the sixth slot stays unused).
plt.figure(figsize=(20, 10))

distr = mydata[["Temperature", "Pressure", "Humidity", "WindDirection(Degrees)", "Speed"]]

for position, column in enumerate(distr, start=1):
    axis = plt.subplot(2, 3, position)
    sns.histplot(distr[column], kde=True, ax=axis)

In [None]:
from scipy import stats

# Absolute z-score of every cell, as a plain float array.
abs_z = np.abs(np.asarray(stats.zscore(mydata), dtype=float))
# A column with zero variance produces NaN z-scores; treat those as 0 so a
# constant column cannot cause every row to be discarded.
abs_z = np.nan_to_num(abs_z, nan=0.0)

# Keep only the rows in which every column lies within 4 standard deviations.
rows_within_threshold = (abs_z < 4).all(axis=1)

# The filtered frame is assigned back to `mydata`; previously the result was
# only displayed, so no rows were actually removed before the split.
# NOTE: re-running this cell filters the already-filtered frame again.
mydata = mydata[rows_within_threshold]
mydata

# Splitting the data

In [None]:
# The first column is used as the prediction target; all remaining
# columns form the feature matrix.
labels = mydata.iloc[:, 0]
mydata1 = mydata.iloc[:, 1:]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    mydata1,
    labels,
    test_size=0.2,
    random_state=25,
)

# Models

In [None]:
# One independent, initially empty list per metric; each model's evaluation
# cell appends its score, so the order of entries follows the order of models.
mae_list, mse_list, r2_list, var_list, max_list = ([] for _ in range(5))

## Linear Regression

In [None]:
model1 = LinearRegression()
model1.fit(train_data, train_labels)

# Take the labels from the training frame itself so they always match the
# order and number of the fitted coefficients, even if preprocessing changes.
columns_ = list(train_data.columns)

# Bar chart of the raw regression coefficients, one bar per feature.
plt.figure(figsize=(20,10))
plt.bar(columns_, model1.coef_)

# The coefficients are tabulated under the name 'importance'. The features
# are not scaled before fitting, so magnitudes reflect each feature's units.
feature_importances = pd.DataFrame(model1.coef_,index = columns_,columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

In [None]:
# Fit the linear baseline on the training split and predict the held-out rows.
# `fit` returns the estimator itself, so construction and fitting are chained.
Model1 = LinearRegression().fit(train_data, train_labels)
Pred1 = Model1.predict(test_data)

In [None]:
# Evaluation
mae_list.append(mean_absolute_error(test_labels, Pred1))
mse_list.append(mean_squared_error(test_labels, Pred1))
r2_list.append(r2_score(test_labels, Pred1))
var_list.append(explained_variance_score(test_labels, Pred1))
max_list.append(max_error(test_labels, Pred1))

print('MAE : ', mean_absolute_error(test_labels, Pred1))
print('MSE : ', mean_squared_error(test_labels, Pred1))
print('R^2 : ', r2_score(test_labels, Pred1))
print('Var : ', explained_variance_score(test_labels, Pred1))
print('Max : ', max_error(test_labels, Pred1))

## RandomForestRegressor

In [None]:
# A fixed seed makes the forest, and therefore the importance ranking,
# reproducible across runs.
model2 = RandomForestRegressor(random_state=25)
model2.fit(train_data,train_labels)

# Take the labels from the training frame itself so they always match the
# order and number of the fitted importances.
columns_ = list(train_data.columns)

# Bar chart of the forest's feature importances, one bar per feature.
plt.figure(figsize=(20,10))
plt.bar(columns_, model2.feature_importances_)

feature_importances = pd.DataFrame(model2.feature_importances_,index = columns_,columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

In [None]:
# param_grid={
#             'max_depth': [None, 3, 7],
#             'n_estimators': [100, 250, 500],
#             'max_features': ['auto', 'sqrt', 'log2'],
#             }

# grid_search = GridSearchCV(
#             estimator=RandomForestRegressor(),
#             param_grid=param_grid,
#             cv=5, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)

# grid_result = grid_search.fit(train_data, train_labels)
# results = grid_search.cv_results_
# for mean_score, params in zip(results['mean_test_score'], results['params']):
#     print("Mean score", np.sqrt(-mean_score), "with the following params")
#     print(params)

# print("Best estim: ")
# print(grid_search.best_estimator_)

In [None]:
# max_features=1.0 considers every feature at each split, which is what the
# former 'auto' setting meant for regression forests; 'auto' itself is no
# longer accepted by current scikit-learn releases.
# random_state pins the bootstrap sampling so the reported scores are reproducible.
Model2 = RandomForestRegressor(max_depth = None, n_estimators = 500, max_features=1.0, random_state=25)
Model2.fit(train_data, train_labels)
Pred2 = Model2.predict(test_data)

In [None]:
# Evaluation
mae_list.append(mean_absolute_error(test_labels, Pred2))
mse_list.append(mean_squared_error(test_labels, Pred2))
r2_list.append(r2_score(test_labels, Pred2))
var_list.append(explained_variance_score(test_labels, Pred2))
max_list.append(max_error(test_labels, Pred2))

print('MAE : ', mean_absolute_error(test_labels, Pred2 ))
print('MSE : ', mean_squared_error(test_labels, Pred2 ))
print('R^2 : ', r2_score(test_labels, Pred2 ))
print('Var : ', explained_variance_score(test_labels, Pred2 ))
print('Max : ', max_error(test_labels, Pred2 ))

## XgBoost

In [None]:
model3 = XGBRegressor()
model3.fit(train_data, train_labels)

# Take the labels from the training frame itself so they always match the
# order and number of the fitted importances.
columns_ = list(train_data.columns)

# Bar chart of the booster's feature importances, one bar per feature.
plt.figure(figsize=(20,10))
plt.bar(columns_, model3.feature_importances_)

feature_importances = pd.DataFrame(model3.feature_importances_,index = columns_,columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)


In [None]:
#param_grid = {
#            'min_child_weight': [1, 4, 7],
#            'gamma': [0, 0.5, 1],
#            'subsample': [0.8, 1],
#            'colsample_bytree':[0.8, 1],
#            'max_depth': [3, 6, 9],
#            }

#grid_search = GridSearchCV(
#            estimator=XGBRegressor(objective = 'reg:squarederror'),
#            param_grid=param_grid,
#            cv=5, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)

#grid_result = grid_search.fit(train_data, train_labels)
#results = grid_search.cv_results_
#for mean_score, params in zip(results['mean_test_score'], results['params']):
#    print("Mean score", np.sqrt(-mean_score), "with the following params")
#    print(params)

#print("Best estim: ")
#print(grid_search.best_estimator_)

In [None]:
# Hyper-parameters for the tuned booster, collected in one place and
# unpacked into the constructor.
xgb_settings = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.8,
    'gamma': 0,
    'max_depth': 9,
    'min_child_weight': 4,
    'subsample': 0.8,
}
Model3 = XGBRegressor(**xgb_settings)
Model3.fit(train_data, train_labels)
Pred3 = Model3.predict(test_data)

In [None]:
# Each metric is computed once, recorded in the shared lists, then printed.
xgb_scores = {
    'MAE : ': mean_absolute_error(test_labels, Pred3),
    'MSE : ': mean_squared_error(test_labels, Pred3),
    'R^2 : ': r2_score(test_labels, Pred3),
    'Var : ': explained_variance_score(test_labels, Pred3),
    'Max : ': max_error(test_labels, Pred3),
}

mae_list.append(xgb_scores['MAE : '])
mse_list.append(xgb_scores['MSE : '])
r2_list.append(xgb_scores['R^2 : '])
var_list.append(xgb_scores['Var : '])
max_list.append(xgb_scores['Max : '])

for score_label, score_value in xgb_scores.items():
    print(score_label, score_value)

## Decision Trees

In [None]:
# A fixed seed makes the fitted tree, and therefore the importance ranking,
# reproducible across runs.
model4 = DecisionTreeRegressor(random_state=25)
model4.fit(train_data, train_labels)

# Take the labels from the training frame itself so they always match the
# order and number of the fitted importances.
columns_ = list(train_data.columns)

# Bar chart of the tree's feature importances, one bar per feature.
plt.figure(figsize=(20,10))
plt.bar(columns_, model4.feature_importances_)

feature_importances = pd.DataFrame(model4.feature_importances_,index = columns_,columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

In [None]:
# param_grid = {
#     'max_depth': [None, 3, 5, 10], 
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'random_state': [None, 1, 2, 3, 4], 
#     'min_samples_split': [2, 3, 4]}    

# grid_search = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid=param_grid, cv=5,
#                            scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)
# grid_search.fit(train_data, train_labels)

# results = grid_search.cv_results_
# for mean_score, params in zip(results['mean_test_score'], results['params']):
#     print("Mean score", np.sqrt(-mean_score), "with the following params",params)

# print("Best estim: ")
# grid_search.best_estimator_

In [None]:
# max_features=None considers every feature at each split, which is what the
# former 'auto' setting meant for regression trees; 'auto' itself is no
# longer accepted by current scikit-learn releases.
Model4 = DecisionTreeRegressor(max_depth=10, max_features=None, min_samples_split=4, random_state=1)
Model4.fit(train_data, train_labels)
Pred4 = Model4.predict(test_data)

In [None]:
# Each metric is computed once, recorded in the shared lists, then printed.
tree_scores = {
    'MAE : ': mean_absolute_error(test_labels, Pred4),
    'MSE : ': mean_squared_error(test_labels, Pred4),
    'R^2 : ': r2_score(test_labels, Pred4),
    'Var : ': explained_variance_score(test_labels, Pred4),
    'Max : ': max_error(test_labels, Pred4),
}

mae_list.append(tree_scores['MAE : '])
mse_list.append(tree_scores['MSE : '])
r2_list.append(tree_scores['R^2 : '])
var_list.append(tree_scores['Var : '])
max_list.append(tree_scores['Max : '])

for score_label, score_value in tree_scores.items():
    print(score_label, score_value)

# Model Evaluation

In [None]:
# Header naming the models in the order their scores were appended,
# followed by one line per metric list.
print('      LinearRegression      RandomForest        XGBoost         DecisionTrees')
summary_rows = (
    ('MAE', mae_list),
    ('MSE', mse_list),
    ('R2', r2_list),
    ('Var ', var_list),
    ('Max', max_list),
)
for row_label, row_values in summary_rows:
    print(row_label, row_values)

In [None]:
from math import sqrt
import operator

# Short model names, in the order the scores were appended to the metric lists.
plot_labels = ["LinR", "RanFor", "XGB", "DecTr"]

# One panel per metric, all on a single row.
f, [ax1, ax2, ax3, ax4, ax5] = plt.subplots(nrows=1,ncols=5,figsize=(15,5))
ax1.bar(plot_labels,mae_list,color='r')
ax1.set_title("Mean Absolute Error")
ax2.bar(plot_labels,mse_list,color='g')
ax2.set_title("Mean Squared Error")
ax3.bar(plot_labels,r2_list,color='y')
ax3.set_title("R^2 Error")
ax4.bar(plot_labels,var_list,color='b')
ax4.set_title("Var Error")
# The fifth panel was created but never drawn; plot the max-error values
# and finish the figure inside this cell.
ax5.bar(plot_labels,max_list)
ax5.set_title("Max Error")
plt.show()


#  Further improvements discussed at the presentation

## Comparison between Normalized Data and Un-normalized Data 

After obtaining this result we decided to continue without normalizing the data.

In [None]:
# Scores recorded from earlier runs, listed per model in the order
# LinearRegression, RandomForest, XGBoost, DecisionTrees.

# Without Normalization
MAE_wn =  [145.53663438816486, 30.260374034873056, 36.45372562953167, 42.841940617746474]
MSE_wn = [36765.7328385265, 6494.071158381085, 7011.396019976234, 10347.625717452076]
R2_wn = [0.6308764850087383, 0.9348003103021767, 0.9296064312044026, 0.8961110881859694]
Var_wn = [0.6309326107714461, 0.9348176119897434, 0.9296161064797657, 0.896182079743132]
Max_wn = [1047.1185829606309, 881.2118000000008, 1033.2645776367187, 922.72375]

# With Normalization
MAE = [146.07357742508327, 31.151382492351622, 34.81606086339533, 43.89330016762526]
MSE = [37102.33934259632, 6746.756748558653, 6788.594843960765, 10836.24382995925]
R2 = [0.6253056145329912, 0.9318648926566667, 0.9314423721438481, 0.8905653984471072]
Var = [0.6253289774456783, 0.9318789405433018, 0.9314424169220911, 0.8906053155263609]
Max = [1046.4516532323953, 900.2392800000011, 1080.5108544921875, 1042.6072857142858]

# Element-wise difference (un-normalized minus normalized) for every metric,
# computed once and reused for both the printout and the bar charts.
mae_diff = [without - with_norm for without, with_norm in zip(MAE_wn, MAE)]
mse_diff = [without - with_norm for without, with_norm in zip(MSE_wn, MSE)]
r2_diff = [without - with_norm for without, with_norm in zip(R2_wn, R2)]
var_diff = [without - with_norm for without, with_norm in zip(Var_wn, Var)]
max_diff = [without - with_norm for without, with_norm in zip(Max_wn, Max)]

print('      LinearRegression      RandomForest        XGBoost         DecisionTrees')
print('MAE_wn-MAE', mae_diff)
print('MSE_wn-MSE', mse_diff)
print('R2_wn-R2', r2_diff)
print('Var_wn-Var', var_diff)
print('Max_wn-Max', max_diff)

# These two calls act on `ax5` as left behind by the preceding plotting cell.
ax5.set_title("Max Error")
plt.show()


plot_labels = ["LinR", "RanFor", "XGB", "DecTr"]
from math import sqrt

# One panel per metric difference, all on a single row.
f, [ax1, ax2, ax3, ax4, ax5] = plt.subplots(nrows=1,ncols=5,figsize=(15,5))
ax1.bar(plot_labels, mae_diff, color='r')
ax1.set_title("Mean Absolute Error")
ax2.bar(plot_labels, mse_diff, color='g')
ax2.set_title("Mean Squared Error")
ax3.bar(plot_labels, r2_diff, color='y')
ax3.set_title("R^2 Error")
ax4.bar(plot_labels, var_diff, color='b')
ax4.set_title("Var Error")
ax5.bar(plot_labels, max_diff)
ax5.set_title("Max Error")
plt.show()


## Decision Tree with pruning parameter

In [None]:
 #param_grid = {
#     'max_depth': [None, 3, 5, 10], 
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'random_state': [None, 1, 2, 3, 4], 
#     'min_samples_split': [2, 3, 4],
#     'ccp_alpha': [0.0, 0.001, 0.01, 0.1]}    # must be non-negative floats
# grid_search = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid=param_grid, cv=5,
#                            scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)
# grid_search.fit(train_data, train_labels)
# results = grid_search.cv_results_
# for mean_score, params in zip(results['mean_test_score'], results['params']):
#     print("Mean score", np.sqrt(-mean_score), "with the following params",params)

# print("Best estim: ")
# grid_search.best_estimator_

Best estim: 
DecisionTreeRegressor(
    
    ccp_alpha=0.0, criterion='mse', max_depth=10,
                      
    max_features='auto', max_leaf_nodes=None,
             
    min_impurity_decrease=0.0, min_impurity_split=None,
                      
    min_samples_leaf=1, min_samples_split=3,
                      
    min_weight_fraction_leaf=0.0, presort='deprecated',
                      
    random_state=3, splitter='best')

In [None]:
# Re-create the best estimator reported by the grid search using only
# arguments that current scikit-learn releases still accept:
#   * criterion='mse' was the default squared-error criterion, so it is omitted;
#   * max_features='auto' meant "all features" for regression trees -> None;
#   * min_impurity_split and presort no longer exist as constructor arguments;
#   * the remaining arguments that were listed were left at their defaults.
Model4 = DecisionTreeRegressor(ccp_alpha=0.0, max_depth=10, max_features=None,
                               min_samples_split=3, random_state=3)
Model4.fit(train_data, train_labels)
Pred4 = Model4.predict(test_data)

# Scores of the re-tuned tree on the held-out split.
print('MAE : ', mean_absolute_error(test_labels, Pred4 ))
print('MSE : ', mean_squared_error(test_labels, Pred4 ))
print('R^2 : ', r2_score(test_labels, Pred4 ))
print('Var : ', explained_variance_score(test_labels, Pred4 ))
print('Max : ', max_error(test_labels, Pred4 ))

Old Parameters -------------------------------New Parameters

MAE :  42.841940617746474-----------42.55817503102817

MSE :  10347.625717452076-----------10098.317452316605

R^2 :  0.8961110881859694-----------0.8986141130419502

Var :  0.896182079743132------------0.8986633719110002

Max :  922.72375--------------------922.72375

## XgBoost with learning rate and number of estimators

In [None]:
#param_grid = {
#            'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
#            'min_child_weight': [1, 4, 7],
#            'gamma': [0, 0.5, 1],
#            'subsample': [0.8, 1],
#            'colsample_bytree':[0.8, 1],
#            'max_depth': [3, 6, 9],
#            'n_estimators': [50,100,200]
#            }#

#grid_search = GridSearchCV(
#            estimator=XGBRegressor(objective = 'reg:squarederror'),
#            param_grid=param_grid,
#            cv=5, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)

#grid_result = grid_search.fit(train_data, train_labels)
#results = grid_search.cv_results_
#for mean_score, params in zip(results['mean_test_score'], results['params']):
#    print("Mean score", np.sqrt(-mean_score), "with the following params")
#    print(params)

#print("Best estim: ")
#print(grid_search.best_estimator_)

In [None]:
# Booster re-fitted with an explicit learning rate and number of estimators
# in addition to the previously tuned settings.
tuned_xgb_settings = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.8,
    'gamma': 0,
    'max_depth': 9,
    'min_child_weight': 4,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'n_estimators': 200,
}
Model3 = XGBRegressor(**tuned_xgb_settings)
Model3.fit(train_data, train_labels)
Pred3 = Model3.predict(test_data)

# Scores of the re-tuned booster on the held-out split, each computed once.
tuned_xgb_scores = {
    'MAE : ': mean_absolute_error(test_labels, Pred3 ),
    'MSE : ': mean_squared_error(test_labels, Pred3 ),
    'R^2 : ': r2_score(test_labels, Pred3 ),
    'Var : ': explained_variance_score(test_labels, Pred3 ),
    'Max : ': max_error(test_labels, Pred3 ),
}
for score_label, score_value in tuned_xgb_scores.items():
    print(score_label, score_value)

Old Parameters ------------------ New Parameters

MAE :  36.45372562953167-------34.82895552785271

MSE :  7011.396019976234-------6696.24227124527

R^2 :  0.9296064312044026 ------0.9327705367590278

Var :  0.9296161064797657  -------0.9327796643591775

Max :  1033.2645776367187------1012.6107690429687