In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error ,r2_score, mean_absolute_error


In [None]:
# Importing data

In [None]:
ds = pd.read_csv('CO2 Emissions_India.csv')

In [None]:
ds.head()

In [None]:
# Renaming columns

In [None]:
ds.rename(columns={'CO2 Emissions(g/km)':'CO2_emission'}, inplace=True)

In [None]:
ds.head()

# Exploratory Data Analysis

In [None]:
# Checking for the data types and null values

ds.info()

In [None]:
# Checking for total null values if any

ds.isnull().sum()

In [None]:
# Insight of different statistical distribution of features and label

ds.describe().T

In [None]:
# checking for unique variables
print(ds['Make'].unique())

In [None]:
# Collapse the detailed transmission codes into their broader categories.
# Codes that are not listed below are left exactly as they are.
transmission_groups = {
    'Automatic': ['A4','A5','A6','A7','A8','A9','A10'],
    'Automatic of Selective type': ['AS4','AS5','AS6','AS7','AS8','AS9','AS10'],
    'Automated Manual': ['AM5','AM6','AM7','AM8','AM9'],
    'CVT': ['AV','AV6','AV7','AV8','AV10'],
    'Manual': ['M5','M6','M7'],
}
code_to_group = {
    code: group
    for group, codes in transmission_groups.items()
    for code in codes
}
ds['Transmission'] = ds['Transmission'].map(lambda code: code_to_group.get(code, code))

print(ds['Transmission'].unique())

In [None]:
# Replace the single-letter fuel codes with descriptive names.

print(ds['Fuel Type'].value_counts())

fuel_names = {
    'X': 'Regular gasoline',
    'Z': 'Premium gasoline',
    'E': 'Ethanol',
    'D': 'Diesel',
    'N': 'Natural gas',
}
# Codes without an entry in the table are kept unchanged.
ds['Fuel Type'] = ds['Fuel Type'].map(lambda code: fuel_names.get(code, code))

print(ds['Fuel Type'].unique())

In [None]:
print(ds['Vehicle Class'].unique())

In [None]:
ds.shape

In [None]:
ds.head()

In [None]:
# Correlation of every numeric column with the target, weakest first.
# numeric_only=True is required because the frame still holds string columns
# (Make, Model, Vehicle Class, Transmission, Fuel Type); pandas >= 2.0 raises
# on them instead of silently skipping them.
ds.corr(numeric_only=True)['CO2_emission'].sort_values()

In [None]:
# Correlation between the numeric features and the label.
# The string columns are excluded explicitly: DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 unless numeric_only=True is passed.

ds.corr(numeric_only=True)

VISUALISATIONS

In [None]:
# VISUALISATIONS

# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# numeric_only=True keeps the string columns (Make, Model, ...) out of the
# computation; without it pandas >= 2.0 raises instead of dropping them.
corr = ds.corr(numeric_only=True)

plt.rcParams['figure.figsize']=(10,8)
sns.heatmap(corr, cmap='coolwarm', linewidth=0.5, fmt='0.2f', annot=True)

plt.title('Correlation')

In [None]:
# Distribution of the numerical features (one histogram per numeric column).
# We can see that the numerical features are a little right-skewed.

ds.hist(figsize=(10,8),bins=50)

FREQUENCY DISTRIBUTION OF DIFFERENT FEATURES

In [None]:
# Make: number of rows per manufacturer, most frequent first
make_counts = ds.groupby('Make')['Make'].count().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
make_counts.plot.bar(ax=ax, color='red')

ax.set_title('Frequency distribution of cars of different companies', fontsize=25)
ax.set_xlabel('Company', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Model: the 25 most frequent car models
model_counts = ds.groupby('Model')['Model'].count().sort_values(ascending=False)
top_models = model_counts.head(25)

fig, ax = plt.subplots(figsize=(20, 5))
top_models.plot.bar(ax=ax)

ax.set_title('Distribution of models', fontsize=25)
ax.set_xlabel('Models', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Vehicle Class: number of rows per class, most frequent first
class_counts = ds.groupby('Vehicle Class')['Vehicle Class'].count().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
class_counts.plot.bar(ax=ax, color='green')

ax.set_title('Vehicle class distribution', fontsize=25)
ax.set_xlabel('Vehicle Class', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Transmission: number of rows per transmission group, most frequent first
transmission_counts = ds.groupby('Transmission')['Transmission'].count().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
transmission_counts.plot.bar(ax=ax, color='magenta')

ax.set_title('Distribution of transmission', fontsize=25)
ax.set_xlabel('Transmission type', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Fuel Type: number of rows per fuel type, most frequent first
fuel_counts = ds.groupby('Fuel Type')['Fuel Type'].count().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
fuel_counts.plot.bar(ax=ax)

ax.set_title(' Most frequently used Fuel type', fontsize=25)
ax.set_xlabel('Fuel Type', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Cylinders: number of rows per cylinder count, least frequent first
cylinder_counts = ds.groupby('Cylinders')['Cylinders'].count().sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(20, 5))
cylinder_counts.plot.bar(ax=ax, color='purple')

ax.set_title(' Cylinder', fontsize=25)
ax.set_xlabel('Cylinders', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)
plt.xticks(rotation=90)
fig.tight_layout()
plt.show()

 FEATURE DISTRIBUTION WITH RESPECT TO CO2 EMISSION

In [None]:
# Mean CO2 emission per manufacturer: the 25 highest-emitting brands
mean_co2_by_make = ds.groupby('Make')['CO2_emission'].mean().sort_values(ascending=False)
top_makes = mean_co2_by_make.head(25)

fig, ax = plt.subplots(figsize=(20, 5))
top_makes.plot.bar(ax=ax, color='red')

ax.set_title('Car brands wrt CO2 emission', fontsize=25)
ax.set_xlabel('Car brands', fontsize=20)
ax.set_ylabel('CO2 emission', fontsize=20)
plt.xticks(rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Mean CO2 emission per car model: the 25 highest-emitting models
mean_co2_by_model = ds.groupby('Model')['CO2_emission'].mean().sort_values(ascending=False)
top_emitting_models = mean_co2_by_model.head(25)

fig, ax = plt.subplots(figsize=(20, 5))
top_emitting_models.plot.bar(ax=ax, color='blue')

ax.set_title(' Car models generating most of CO2', fontsize=25)
ax.set_xlabel(' Car Models', fontsize=20)
ax.set_ylabel('CO2 emission', fontsize=20)
plt.xticks(rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Mean CO2 emission per vehicle class, highest first
mean_co2_by_class = ds.groupby('Vehicle Class')['CO2_emission'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
mean_co2_by_class.plot.bar(ax=ax, color='m')

ax.set_title('Vehicle class wrt CO2 emission', fontsize=25)
ax.set_xlabel('Vehicle class', fontsize=20)
ax.set_ylabel('Co2 emission', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean CO2 emission per fuel type, highest first
mean_co2_by_fuel = ds.groupby('Fuel Type')['CO2_emission'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
mean_co2_by_fuel.plot.bar(ax=ax, color='g')

ax.set_title('Fuel type wrt CO2 emission', fontsize=25)
ax.set_xlabel('Fuel Type', fontsize=20)
ax.set_ylabel('Co2 emission', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:

# Box plot of CO2 emission per fuel type, boxes ordered by descending median
median_co2_by_fuel = ds.groupby('Fuel Type')['CO2_emission'].median()
fuel_type = median_co2_by_fuel.sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=ds, x='Fuel Type', y='CO2_emission', order=fuel_type, width=0.5, ax=ax)
plt.xticks(rotation=45, horizontalalignment='center')

plt.show()

In [None]:
# Mean CO2 emission per transmission group, highest first
mean_co2_by_transmission = ds.groupby('Transmission')['CO2_emission'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 5))
mean_co2_by_transmission.plot.bar(ax=ax)

ax.set_title('Transmission wrt CO2 emission', fontsize=25)
ax.set_xlabel('Transmission type', fontsize=20)
ax.set_ylabel('CO2 emission', fontsize=20)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# CO2 emission against combined fuel consumption (mpg) for the first 25 rows
first_rows = ds[:25]

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=first_rows, x='Fuel Consumption Comb (mpg)', y='CO2_emission', ax=ax)

plt.xticks(rotation=45)
plt.show()

In [None]:

# Combined fuel consumption (mpg) by number of cylinders.
# sns.catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only produces an extra empty figure. The size is
# set through height/aspect instead (8 in tall, 8 * 1.25 = 10 in wide).
sns.catplot(x='Cylinders', y='Fuel Consumption Comb (mpg)', data=ds, height=8, aspect=1.25)

plt.show()

In [None]:
# CO2 emission by number of cylinders.
# sns.catplot is figure-level and opens its own figure, so no plt.figure(...)
# call is made here (it would only leave an empty figure behind). The size is
# set through height/aspect (8 in tall, 8 * 1.25 = 10 in wide).
sns.catplot(x='Cylinders', y='CO2_emission', data=ds, height=8, aspect=1.25)

plt.show()                                # co2 emission increases with increasing no of cylinders

In [None]:

# Fuel efficiency (combined mpg) per fuel type.
# The 'Fuel Type' column already holds descriptive names (the single-letter
# codes were renamed earlier in this notebook), so seaborn's own tick labels
# are used. A hard-coded label list would silently mislabel the boxes whenever
# the category order in the data differs from the assumed one.
plt.figure(figsize=(10,8))
sns.boxplot(x = 'Fuel Type', y='Fuel Consumption Comb (mpg)', data =ds)

plt.title('Fuel consumption (mpg) by fuel type')
plt.show()                           # efficiency of fuel 

# DATA PREPROCESSING

In [None]:
# DATA PREPROCESSING

ds.head()

In [None]:
ds['Transmission'].value_counts()

In [None]:
ds['Fuel Type'].value_counts()

In [None]:
# Dropping natural gas as there is only one data we have which would not make much difference in modelling

# Rows whose fuel type is Natural gas, and their index labels
is_natural_gas = ds['Fuel Type'].eq('Natural gas')
ds_N = ds[is_natural_gas]
ind = ds_N.index

ds_N

In [None]:
# Remove the Natural gas rows in a single call instead of one drop per label.
# errors='ignore' keeps this cell re-runnable: once the rows are gone, running
# it again is a no-op rather than a KeyError.
ds.drop(index=ind, inplace=True, errors='ignore')

In [None]:
ds[ds['Fuel Type']=='Natural gas']

In [None]:
# Create dummy (one-hot) variables for the categorical features Fuel Type and
# Transmission. drop_first=True omits the first level of each feature so the
# remaining indicator columns are not perfectly collinear.
# Fuel dummies get the 'Fuel_' prefix; transmission dummies keep the bare
# category names as column names.

d_v =pd.get_dummies(ds['Fuel Type'], prefix='Fuel', drop_first=True)
dv = pd.get_dummies(ds["Transmission"], drop_first=True)
d_v.head()


In [None]:
dv.head()

In [None]:
# Append the fuel and transmission dummy columns to the original frame
df = [ds, d_v, dv]
data = pd.concat(objs=df, axis='columns')

data.head()

In [None]:
# The original categorical columns are now represented by their dummy columns
data.drop(columns=['Fuel Type', 'Transmission'], inplace=True)

                         

HANDLING OTHER CATEGORICAL FEATURES HAVING MULTIPLE CATEGORIES (MAKE, MODEL, VEHICLE CLASS)

In [None]:
# Occurrence count of every category, as {category: count} dictionaries
df_freq, mod_freq, veh_freq = (
    data[column].value_counts().to_dict()
    for column in ('Make', 'Model', 'Vehicle Class')
)

In [None]:
# Frequency encoding: replace each category by how often it occurs
frequency_tables = (
    ('Make', df_freq),
    ('Model', mod_freq),
    ('Vehicle Class', veh_freq),
)
for column, counts in frequency_tables:
    data[column] = data[column].map(counts)

In [None]:
data.head()

DIVIDING DATA SET INTO INDEPENDENT AND DEPENDENT VARIABLE

In [None]:
# Target column (label) and the remaining columns (features)
target_column = 'CO2_emission'
y = data[target_column]
X = data.drop(columns=target_column)

In [None]:
X.head()

In [None]:
y.head()

In [None]:
data.shape

FEATURE SELECTION USING CHI-SQUARE TEST

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Score every feature against the target.
# The target (CO2 emission) is continuous, so the univariate F-test for
# regression is used. The chi-square score named in the section heading is a
# classification statistic: it would treat every distinct emission value as a
# separate class, which makes the resulting ranking hard to interpret.
from sklearn.feature_selection import f_regression

ranked_feature = SelectKBest(score_func=f_regression, k='all')
ordered_feature = ranked_feature.fit(X, y)


In [None]:
# One row per feature: its score and its column name
top_feat = pd.DataFrame({
    'score': ordered_feature.scores_,
    'variables': X.columns,
})

In [None]:
top_feat.sort_values(by='score', ascending=False)

CREATING TRAINING SET AND TESTING SET

In [None]:
# Split features (X) and target (y) into training and test sets:
# 20% of the rows are held out for testing, with a fixed seed for a repeatable split.

from sklearn.model_selection import train_test_split

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
y_test.head()

FEATURE SCALING USING STANDARDIZATION 

In [None]:
# STANDARDIZATION
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling parameters on the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train

In [None]:
data['CO2_emission'].mean()

# MODEL IMPLEMENTATION (Approach 1)

# LINEAR REGRESSION

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
model.intercept_

In [None]:
model.coef_

In [None]:
y_pred = model.predict(X_test)
y_pred

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
r2_score(y_test, y_pred)

In [None]:
frames = [y_pred, y_test.values]
result_pred = pd.DataFrame(data=frames)
result_pred = result_pred.T

In [None]:
lin_pred = result_pred.rename(columns={0: 'pred_values',1:'real_values'})
lin_pred['pred_values'] = lin_pred['pred_values'].map(lambda x: round(x,2))

lin_pred

In [None]:
lin_pred['diff'] = abs(lin_pred['pred_values'] - lin_pred['real_values'])

print('mean diff: ', (abs(lin_pred['diff']).mean()))

In [None]:
lin_pred.head(10)

In [None]:
sns.displot(y_pred-y_test)

In [None]:
plt.scatter( y_test,y_pred)
plt.xlabel('y_test')
plt.ylabel('y_pred')

In [None]:
sns.displot(y_pred, bins=20,color='red')
plt.show()
sns.displot(data['CO2_emission'], bins=20)
plt.show()

# DECISION TREE REGRESSION

In [None]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state = 42)

model.fit(X_train, y_train)

In [None]:
dtr_pred = model.predict(X_test)
dtr_pred

In [None]:
np.sqrt(mean_squared_error(y_test,dtr_pred ))

In [None]:
r2_score(y_test, dtr_pred)

In [None]:
frames = [dtr_pred, y_test.values]
result_pred = pd.DataFrame(data=frames)
result_pred = result_pred.T
result_pred.head()

In [None]:
dtr_pred = result_pred.rename(columns={0: 'pred_values', 1:'real_values'})
dtr_pred['pred_values'] = (dtr_pred['pred_values'].map(lambda x: round(x,2)))

dtr_pred['diff'] = abs(dtr_pred['real_values'] -dtr_pred['pred_values'])


print('mean diff: ', abs(dtr_pred['diff']).mean())

In [None]:
dtr_pred.head(10)

# RANDOM FOREST

In [None]:
# Random forest regressor on the scaled training data.
# The seed is fixed so the forest, and therefore the metrics reported below,
# are identical on every Restart & Run All. This matches the seeded
# train/test split and decision tree used earlier in the notebook.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
y_rf_pred = rf_model.predict(X_test)
y_rf_pred

In [None]:
# Test-set metrics of the random forest, printed with four decimals
rf_rmse = np.sqrt(mean_squared_error(y_test, y_rf_pred))
rf_mae = mean_absolute_error(y_test, y_rf_pred)
rf_r2 = r2_score(y_test, y_rf_pred)

print(f'RMSE: {rf_rmse:0.4f}')
print(f'MAE: {rf_mae:0.4f}')
print(f'R2_score: {rf_r2:0.4f}')

In [None]:
frames = [y_rf_pred, y_test.values]
result_pred = pd.DataFrame(data=frames)
result_pred = result_pred.T
result_pred.head()

In [None]:
y_rf_pred = result_pred.rename(columns={0: 'pred_values', 1:'real_values'})
y_rf_pred['pred_values'] = (y_rf_pred['pred_values'].map(lambda x: round(x,2)))

y_rf_pred['diff'] = abs(y_rf_pred['real_values'] -y_rf_pred['pred_values'])


print('mean diff: ', abs(y_rf_pred['diff']).mean())

In [None]:
y_rf_pred.head(10)

# SUPPORT VECTOR MACHINE (LINEAR SVR)

In [None]:
# Linear support vector regression on the scaled training data.
# The seed is fixed so repeated runs of the notebook give the same
# coefficients and therefore the same metrics.
from sklearn.svm import LinearSVR
model = LinearSVR(random_state=42)
model.fit(X_train, y_train)
y_svr_pred = model.predict(X_test)
y_svr_pred

In [None]:
# RMSE of the SVR predictions on the test set.
# Arguments follow the (y_true, y_pred) order used by the other model
# sections; the value itself does not depend on the order.
np.sqrt(mean_squared_error(y_test, y_svr_pred))

In [None]:
# R2 of the SVR predictions on the test set.
# r2_score expects (y_true, y_pred) and is not symmetric: with the arguments
# swapped, the score is normalised by the variance of the predictions instead
# of the variance of the true values, so the reported number is wrong.
r2_score(y_test, y_svr_pred)

In [None]:
frames = [y_svr_pred, y_test.values]
result_pred = pd.DataFrame(data=frames)
result_pred = result_pred.T
result_pred.head()

In [None]:
y_svr_pred = result_pred.rename(columns={0: 'pred_values', 1:'real_values'})
y_svr_pred['pred_values'] = (y_svr_pred['pred_values'].map(lambda x: round(x,2)))

y_svr_pred['diff'] = abs(y_svr_pred['real_values'] -y_svr_pred['pred_values'])


print('mean diff: ', abs(y_svr_pred['diff']).mean())

In [None]:
y_svr_pred.head(10)

#  APPROACH 2

In [None]:
# Fit every model through one shared routine and collect the train/test
# metrics in a single summary frame (`frame`).

def evaluate_regressor(estimator, X_tr, y_tr, X_te, y_te):
    """Fit `estimator` on the training split and score it on both splits.

    Parameters
    ----------
    estimator : object with `fit(X, y)` and `predict(X)` methods.
    X_tr, y_tr : training features and target.
    X_te, y_te : test features and target.

    Returns
    -------
    dict
        Keys `rmse_train`, `mae_train`, `r2_train`, `rmse_test`, `mae_test`,
        `r2_test` (in that order) mapped to the corresponding metric values.
    """
    estimator.fit(X_tr, y_tr)
    scores = {}
    for split, features, target in (('train', X_tr, y_tr), ('test', X_te, y_te)):
        predicted = estimator.predict(features)
        scores['rmse_' + split] = np.sqrt(mean_squared_error(target, predicted))
        scores['mae_' + split] = mean_absolute_error(target, predicted)
        scores['r2_' + split] = r2_score(target, predicted)
    return scores


models = ['LinReg', 'DT', 'RF', 'SVR']

# Short key -> (label shown in the summary, estimator).
# The stochastic estimators are seeded so the table is reproducible.
model_registry = {
    'LinReg': ('Linear Regression', LinearRegression()),
    'DT': ('Decision Tree Regression', DecisionTreeRegressor(random_state=42)),
    'RF': ('Random Forest Regression', RandomForestRegressor(random_state=42)),
    # LinearSVR is a support vector regressor; the previous label
    # 'Simple Vector Regression' named a method that does not exist.
    'SVR': ('Support Vector Regression', LinearSVR(random_state=42)),
}

metric_columns = ['models', 'rmse_train', 'mae_train', 'r2_train',
                  'rmse_test', 'mae_test', 'r2_test']

# Collect one record per model and build the frame once, instead of growing
# it row by row.
rows = []
for key in models:
    label, model = model_registry[key]
    row = {'models': label}
    row.update(evaluate_regressor(model, X_train, y_train, X_test, y_test))
    rows.append(row)

frame = pd.DataFrame(rows, columns=metric_columns)

OVERALL PERFORMANCE OF ALL MODELS IN A DATAFRAME

In [None]:
frame