In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the CO2 Emission dataset 

CO2Emission = pd.read_csv(r"C:\Users\Admin\Downloads\files\13,14-08-22\MY2021_Fuel_Consumption_Ratings (1).csv", header=0)

# Copy the file to back-up file

CO2Emission_bk = CO2Emission.copy()

# Display first 5 records

CO2Emission.head()

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating
0,2021,Acura,ILX,Compact,2.4,4,AM8,Z,9.9,7.0,8.6,33,199,6,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,Z,11.1,10.8,11.0,26,256,4,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,Z,11.0,8.6,9.9,29,232,5,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,Z,11.3,9.1,10.3,27,242,5,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,Z,11.2,8.0,9.8,29,230,5,7


In [3]:
# Display dataset information

CO2Emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 15 columns):
Year                         883 non-null int64
Make                         883 non-null object
Model                        883 non-null object
Vehicle_Class                883 non-null object
Engine_Size                  883 non-null float64
Cylinders                    883 non-null int64
Transmission                 883 non-null object
Fuel_Type                    883 non-null object
Fuel_Consumption_city        883 non-null float64
Fuel_Consumption_Hwy         883 non-null float64
Fuel_Consumption_Comb        883 non-null float64
Fuel_Consumption_Comb_MPG    883 non-null int64
CO2_Emissions                883 non-null int64
CO2_Rating                   883 non-null int64
Smog_Rating                  883 non-null int64
dtypes: float64(4), int64(6), object(5)
memory usage: 103.6+ KB


In [4]:
# Delecting the 8 columns

CO2Emission = CO2Emission.drop(['Year', 'Make', 'Model', 'Vehicle_Class', 'Transmission',
                                 'Fuel_Type', 'CO2_Rating', 'Smog_Rating'], axis = 1)
CO2Emission.head()

Unnamed: 0,Engine_Size,Cylinders,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions
0,2.4,4,9.9,7.0,8.6,33,199
1,3.5,6,11.1,10.8,11.0,26,256
2,2.0,4,11.0,8.6,9.9,29,232
3,2.0,4,11.3,9.1,10.3,27,242
4,2.0,4,11.2,8.0,9.8,29,230


In [5]:
# Identify the Independent and Target variables

IndepVar = []
for col in CO2Emission.columns:
    if col != 'CO2_Emissions':
        IndepVar.append(col)

TargetVar = 'CO2_Emissions'

x = CO2Emission[IndepVar]
y = CO2Emission[TargetVar]

In [6]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((618, 6), (265, 6), (618,), (265,))

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# Multi Regression Algorithm

In [8]:
# Build the model with Gradient Boosting Regressor

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 22.98
Mean Squared Error (MSE): 881.506
Root Mean Squared Error (RMSE): 29.69
R2_score: 0.760523
Root Mean Squared Log Error (RMSLE): 3.391
Mean Absolute Percentage Error (MAPE): 8.392 %
Adj R Square:  0.758883


In [9]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
128,2021,Buick,Encore (SIDI),SUV: Small,1.4,4,AS6,X,9.7,7.3,8.6,33,203,6,7,203,211.830669
304,2021,Ford,Mustang Convertible (High Performance),Subcompact,2.3,4,M6,X,12.4,9.3,11.0,26,258,4,5,258,268.650791
780,2021,Rolls-Royce,Cullinan Black Badge,Station wagon: Mid-size,6.7,12,AS8,Z,20.1,12.1,16.5,17,386,1,3,386,453.312952
92,2021,BMW,M8 Cabriolet Competition,Subcompact,4.4,8,AS8,Z,16.0,11.0,13.8,20,323,3,3,323,364.129504
193,2021,Chevrolet,Silverado 4WD,Pickup truck: Standard,3.0,6,A10,D,10.6,9.2,10.0,28,268,4,3,268,259.221853


# Compare with all Regression / Regressors

In [11]:
# Load the result dataset

RGRResults = pd.read_csv(r"C:\Users\Admin\Downloads\files\13,14-08-22\RGRResults.csv", header=0)

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [12]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR
#import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import BayesianRidge
#from sklearn.ensemble import BaggingRegressor
#from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelMLR= LinearRegression()
ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
#modelSVR = SVR()
#modelXGR = xgb.XGBRegressor()
ModelKNN = KNeighborsRegressor(n_neighbors=5)
#modelBRR = BayesianRidge()
#modelBGR = BaggingRegressor()
#modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
#                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
#                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
#                                     min_impurity_split=None, init=None, random_state=None, max_features=None,
#                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
#                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Evalution matrix for all the algorithms

#MM = [modelmlg, modeldcr, modelrfr, modelSVR, modelXGR, modelKNN, modelETR, modelBRR, modelBGR, modelGBR]
MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR, ModelKNN]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 22.98
Mean Squared Error (MSE): 881.506
Root Mean Squared Error (RMSE): 29.69
R2_score: 0.760523
Root Mean Squared Log Error (RMSLE): 3.391
Mean Absolute Percentage Error (MAPE): 8.39 %
Adj R Square:  0.758883
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 27.334
Mean Squared Error (MSE): 1347.556
Root Mean Squared Error (RMSE): 36.709
R2_score: 0.633911
Root Mean Squared Log Error (RMSLE): 3.603
Mean Absolute Percentage Error (MAPE): 10.04 %
Adj R Square:  0.631404
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 19.926
Mean Squared Error (MSE): 663.283
Root Mean Squared Error (RMSE): 25.754
R2_score: 0.819807
Root Mean Squared Log Error (RMSLE): 3.249
Mean Absolute Percentage Erro

In [13]:
# Results with comparing the all the algorithms 

#RGRResults.to_csv("D://00 Henotic//SRKR//Datasets//REsults//RGRResults.csv")

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),22.979788,0.758883,29.690159,8.39178,881.505528,3.390816,0.760523
1,DecisionTreeRegressor(),27.333962,0.631404,36.709068,10.041095,1347.55566,3.603024,0.633911
2,"(DecisionTreeRegressor(max_features='auto', ra...",19.926151,0.818573,25.754284,7.45065,663.283164,3.248601,0.819807
3,"(ExtraTreeRegressor(random_state=1600428049), ...",16.525302,0.874942,21.382215,5.979942,457.199111,3.062559,0.875793
4,KNeighborsRegressor(),22.273208,0.788379,27.814838,8.010371,773.665208,3.32557,0.789819


In [14]:
# Predict the values with ET algorithm

y_predF = ModelETR.predict(x_test)

In [15]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
562,2021,Mazda,CX-30 Turbo 4WD,SUV: Small,2.5,4,AS6,X,10.5,7.9,9.3,30,220,5,3,220,233.8
464,2021,Jeep,Renegade,SUV: Small,2.4,4,A9,X,10.8,7.8,9.5,30,222,5,6,222,234.8
584,2021,Mercedes-Benz,A 220 4MATIC Sedan,Subcompact,2.0,4,AM7,Z,9.6,6.9,8.4,34,197,6,5,197,199.6
397,2021,Honda,HR-V AWD,Station wagon: Small,1.8,4,AV,X,8.8,7.5,8.2,34,193,6,5,193,202.2
304,2021,Ford,Mustang Convertible (High Performance),Subcompact,2.3,4,M6,X,12.4,9.3,11.0,26,258,4,5,258,262.0


In [16]:
# Calculate the %of Error

ResultsFinal['%Error'] = round(((ResultsFinal['CO2_Emissions_A']-ResultsFinal['CO2_Emissions_P'])/ResultsFinal['CO2_Emissions_A'])*100,3)

In [17]:
# Display the results

ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P,%Error
814,2021,Toyota,Camry TRD,Mid-size,3.5,6,AS8,X,10.8,7.6,9.4,30,220,5,5,220,237.4,-7.909
732,2021,Porsche,911 Carrera 4S Cabriolet,Minicompact,3.0,6,M7,Z,13.8,9.8,12.0,24,281,4,5,281,304.8,-8.47
821,2021,Toyota,Corolla,Compact,1.8,4,M6,X,8.0,6.0,7.1,40,165,7,5,165,164.6,0.242
340,2021,GMC,Canyon 4WD,Pickup truck: Small,2.8,4,A6,D,12.2,8.4,10.5,27,294,3,3,294,260.6,11.361
322,2021,Ford,Transit Connect Wagon LWB,Special purpose vehicle,2.5,4,AS6,X,12.1,9.0,10.7,26,251,4,5,251,267.8,-6.693
