## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
data=pd.read_csv("dataset.csv")

In [3]:
data.head()

Unnamed: 0,Year,Month,Sector,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,1973,1,Commerical,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.0,0.57,0.57,0.0,0.0,0.0,0.0
1,1973,1,Electric Power,0.0,0.49,0.0,0.0,0.054,0.157,0.0,0.0,0.211,89.223,0.0,0.0,88.522,0.0
2,1973,1,Industrial,1.04,0.0,0.0,0.0,98.933,0.0,0.0,0.0,98.933,99.973,0.0,0.0,0.0,0.0
3,1973,1,Residential,0.0,0.0,0.0,0.0,30.074,0.0,0.0,0.0,0.0,30.074,0.0,0.0,0.0,0.0
4,1973,1,Transportation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Taking care of duplicate observations if present over here

In [4]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [5]:
data.isnull().sum()

Year                                  0
Month                                 0
Sector                                0
Hydroelectric Power                   0
Geothermal Energy                     0
Solar Energy                          0
Wind Energy                           0
Wood Energy                           0
Waste Energy                          0
Fuel Ethanol, Excluding Denaturant    0
Biomass Losses and Co-products        0
Biomass Energy                        0
Total Renewable Energy                0
Renewable Diesel Fuel                 0
Other Biofuels                        0
Conventional Hydroelectric Power      0
Biodiesel                             0
dtype: int64

In [6]:
missing_values=[feature for feature in data.columns if data[feature].isnull().sum()>1]
for feature in missing_values:
  print(feature)

## Filtering all the numerical features over here

In [7]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

Year
Month
Hydroelectric Power
Geothermal Energy
Solar Energy
Wind Energy
Wood Energy
Waste Energy
Fuel Ethanol, Excluding Denaturant
Biomass Losses and Co-products
Biomass Energy
Total Renewable Energy
Renewable Diesel Fuel
Other Biofuels
Conventional Hydroelectric Power
Biodiesel


In [8]:
data[numerical_features]

Unnamed: 0,Year,Month,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,1973,1,0.000,0.000,0.000,0.000,0.570,0.000,0.000,0.000,0.570,0.570,0.00,0.000,0.000,0.000
1,1973,1,0.000,0.490,0.000,0.000,0.054,0.157,0.000,0.000,0.211,89.223,0.00,0.000,88.522,0.000
2,1973,1,1.040,0.000,0.000,0.000,98.933,0.000,0.000,0.000,98.933,99.973,0.00,0.000,0.000,0.000
3,1973,1,0.000,0.000,0.000,0.000,30.074,0.000,0.000,0.000,0.000,30.074,0.00,0.000,0.000,0.000
4,1973,1,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,2024,1,0.073,1.669,4.267,0.036,7.053,6.233,2.441,0.000,15.728,21.773,0.00,0.000,0.000,0.000
3061,2024,1,0.000,4.667,32.707,119.265,15.071,13.873,0.000,0.000,28.944,257.661,0.00,0.000,72.078,0.000
3062,2024,1,0.308,0.356,0.987,0.035,104.878,14.171,1.533,67.742,188.325,190.011,0.00,0.000,0.000,0.000
3063,2024,1,0.000,3.354,14.897,0.000,34.065,0.000,0.000,0.000,0.000,52.316,0.00,0.000,0.000,0.000


## Filtering the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Sector


In [10]:
data[cat_features]

Unnamed: 0,Sector
0,Commerical
1,Electric Power
2,Industrial
3,Residential
4,Transportation
...,...
3060,Commerical
3061,Electric Power
3062,Industrial
3063,Residential


## Encoding the categorical features over here

In [11]:
data['Sector'].value_counts()

Sector
Commerical        613
Electric Power    613
Industrial        613
Residential       613
Transportation    613
Name: count, dtype: int64

## Using mapping to convert the categories into the numerical features over here

In [12]:
Sector_mapping={}
for index,Sector in enumerate(data['Sector'].unique()):
  Sector_mapping[Sector]=index
print(Sector_mapping)

{'Commerical': 0, 'Electric Power': 1, 'Industrial': 2, 'Residential': 3, 'Transportation': 4}


In [13]:
data['Sector']=data['Sector'].map(Sector_mapping)

## We need to treat year feature as well as a categorical feature over here so therefore encoding that as well

In [14]:
data['Year'].value_counts()

Year
1973    60
1974    60
2001    60
2002    60
2003    60
2004    60
2005    60
2006    60
2007    60
2008    60
2009    60
2010    60
2011    60
2012    60
2013    60
2014    60
2015    60
2016    60
2017    60
2018    60
2019    60
2020    60
2021    60
2022    60
2023    60
2000    60
1999    60
1998    60
1985    60
1975    60
1976    60
1977    60
1978    60
1979    60
1980    60
1981    60
1982    60
1983    60
1984    60
1986    60
1997    60
1987    60
1988    60
1989    60
1990    60
1991    60
1992    60
1993    60
1994    60
1995    60
1996    60
2024     5
Name: count, dtype: int64

In [15]:
Year_mapping={}

for index,Year in enumerate(data['Year'].unique()):
  Year_mapping[Year]=index
print(Year_mapping)

{1973: 0, 1974: 1, 1975: 2, 1976: 3, 1977: 4, 1978: 5, 1979: 6, 1980: 7, 1981: 8, 1982: 9, 1983: 10, 1984: 11, 1985: 12, 1986: 13, 1987: 14, 1988: 15, 1989: 16, 1990: 17, 1991: 18, 1992: 19, 1993: 20, 1994: 21, 1995: 22, 1996: 23, 1997: 24, 1998: 25, 1999: 26, 2000: 27, 2001: 28, 2002: 29, 2003: 30, 2004: 31, 2005: 32, 2006: 33, 2007: 34, 2008: 35, 2009: 36, 2010: 37, 2011: 38, 2012: 39, 2013: 40, 2014: 41, 2015: 42, 2016: 43, 2017: 44, 2018: 45, 2019: 46, 2020: 47, 2021: 48, 2022: 49, 2023: 50, 2024: 51}


In [16]:
data['Year']=data['Year'].map(Year_mapping)

In [17]:
data

Unnamed: 0,Year,Month,Sector,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,0,1,0,0.000,0.000,0.000,0.000,0.570,0.000,0.000,0.000,0.570,0.570,0.00,0.000,0.000,0.000
1,0,1,1,0.000,0.490,0.000,0.000,0.054,0.157,0.000,0.000,0.211,89.223,0.00,0.000,88.522,0.000
2,0,1,2,1.040,0.000,0.000,0.000,98.933,0.000,0.000,0.000,98.933,99.973,0.00,0.000,0.000,0.000
3,0,1,3,0.000,0.000,0.000,0.000,30.074,0.000,0.000,0.000,0.000,30.074,0.00,0.000,0.000,0.000
4,0,1,4,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,51,1,0,0.073,1.669,4.267,0.036,7.053,6.233,2.441,0.000,15.728,21.773,0.00,0.000,0.000,0.000
3061,51,1,1,0.000,4.667,32.707,119.265,15.071,13.873,0.000,0.000,28.944,257.661,0.00,0.000,72.078,0.000
3062,51,1,2,0.308,0.356,0.987,0.035,104.878,14.171,1.533,67.742,188.325,190.011,0.00,0.000,0.000,0.000
3063,51,1,3,0.000,3.354,14.897,0.000,34.065,0.000,0.000,0.000,0.000,52.316,0.00,0.000,0.000,0.000


## Creating the features and labels over here

In [18]:
data['TotalRenewableEnergy']=data['Total Renewable Energy']

In [19]:
data.drop("Total Renewable Energy",axis=1,inplace=True)

In [20]:
data

Unnamed: 0,Year,Month,Sector,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel,TotalRenewableEnergy
0,0,1,0,0.000,0.000,0.000,0.000,0.570,0.000,0.000,0.000,0.570,0.00,0.000,0.000,0.000,0.570
1,0,1,1,0.000,0.490,0.000,0.000,0.054,0.157,0.000,0.000,0.211,0.00,0.000,88.522,0.000,89.223
2,0,1,2,1.040,0.000,0.000,0.000,98.933,0.000,0.000,0.000,98.933,0.00,0.000,0.000,0.000,99.973
3,0,1,3,0.000,0.000,0.000,0.000,30.074,0.000,0.000,0.000,0.000,0.00,0.000,0.000,0.000,30.074
4,0,1,4,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,51,1,0,0.073,1.669,4.267,0.036,7.053,6.233,2.441,0.000,15.728,0.00,0.000,0.000,0.000,21.773
3061,51,1,1,0.000,4.667,32.707,119.265,15.071,13.873,0.000,0.000,28.944,0.00,0.000,72.078,0.000,257.661
3062,51,1,2,0.308,0.356,0.987,0.035,104.878,14.171,1.533,67.742,188.325,0.00,0.000,0.000,0.000,190.011
3063,51,1,3,0.000,3.354,14.897,0.000,34.065,0.000,0.000,0.000,0.000,0.00,0.000,0.000,0.000,52.316


In [24]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Experiment with different Regression algorithms and selecting the one giving more accuracy with respect to performance matrix over here

In [28]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

regressors = {
    "Random Forest": RandomForestRegressor(),
    "Support Vector Machine": SVR(kernel='linear'),
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    # "Decision Tree": DecisionTreeRegressor(),
    "Bayesian Ridge": BayesianRidge(),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

results = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

best_regressor = min(results, key=results.get)
best_mse = results[best_regressor]

print("Best Regressor:", best_regressor)
print("Mean Squared Error:", best_mse)


Learning rate set to 0.047176
0:	learn: 68.8772833	total: 51.3ms	remaining: 51.3s
1:	learn: 65.8759442	total: 54.8ms	remaining: 27.4s
2:	learn: 63.0121448	total: 58.9ms	remaining: 19.6s
3:	learn: 60.4305229	total: 64.8ms	remaining: 16.1s
4:	learn: 57.9147708	total: 74.3ms	remaining: 14.8s
5:	learn: 55.4418640	total: 78.7ms	remaining: 13s
6:	learn: 53.1173618	total: 82.3ms	remaining: 11.7s
7:	learn: 50.9020570	total: 86ms	remaining: 10.7s
8:	learn: 48.7279542	total: 89.6ms	remaining: 9.86s
9:	learn: 46.6586318	total: 93.1ms	remaining: 9.22s
10:	learn: 44.7291529	total: 96.5ms	remaining: 8.68s
11:	learn: 42.8379365	total: 100ms	remaining: 8.23s
12:	learn: 41.0964222	total: 104ms	remaining: 7.87s
13:	learn: 39.3560787	total: 107ms	remaining: 7.55s
14:	learn: 37.8045918	total: 111ms	remaining: 7.27s
15:	learn: 36.3595451	total: 120ms	remaining: 7.4s
16:	learn: 34.8829625	total: 131ms	remaining: 7.6s
17:	learn: 33.4928992	total: 135ms	remaining: 7.36s
18:	learn: 32.2346403	total: 141ms	rema

## Training the model on the training set with the best algorithm over here

In [31]:
regressor=Ridge()
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set over here

In [32]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[  1.93   1.78]
 [171.52 171.47]
 [141.7  141.56]
 ...
 [ 76.99  77.08]
 [ 67.63  67.61]
 [126.34 126.51]]


In [36]:
from sklearn.metrics import r2_score,mean_squared_error
mean_squared_error(y_test,y_pred)

0.13931338106291644

In [37]:
r2_score(y_test,y_pred)

0.9999699488995157

In [39]:
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(actual_vs_predicted)

      Actual   Predicted
0      1.779    1.926688
1    171.471  171.518090
2    141.561  141.704636
3      0.000   -0.038549
4     47.661   47.731307
..       ...         ...
608   54.036   54.216673
609   51.385   51.589801
610   77.077   76.992076
611   67.613   67.631901
612  126.512  126.344552

[613 rows x 2 columns]


In [40]:
actual_vs_predicted['Absolute Difference'] = abs(actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted'])

In [41]:
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,1.779,1.926688,0.147688
1,171.471,171.518090,0.047090
2,141.561,141.704636,0.143636
3,0.000,-0.038549,0.038549
4,47.661,47.731307,0.070307
...,...,...,...
608,54.036,54.216673,0.180673
609,51.385,51.589801,0.204801
610,77.077,76.992076,0.084924
611,67.613,67.631901,0.018901
