In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
from google.colab import files
uploaded =files.upload()

Saving CO2 Emissions.csv to CO2 Emissions.csv


In [3]:
data = pd.read_csv('CO2 Emissions.csv')

#Had to drop Model columns as they are having a lot of unseen data for testing the models.And so, getting a very large Mean Squared Error (MSE).

data = data.drop([ 'Model'], axis=1)
categorical_col = ['Make','Vehicle Class', 'Transmission', 'Fuel Type']
numerical_col = ['Engine Size(L)', 'Cylinders','Fuel Consumption City (L/100 km)','Fuel Consumption Hwy (L/100 km)','Fuel Consumption Comb (L/100 km)','Fuel Consumption Comb (mpg)']

X = data[categorical_col + numerical_col]
y = data['CO2 Emissions(g/km)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

#One hot encoding for categorical columns. If found any unknown value during testing time then it will ignore it
onehot_encoder = OneHotEncoder(drop='first', sparse=False)
X_train_encoded = onehot_encoder.fit_transform(X_train[categorical_col])
X_test_encoded = onehot_encoder.transform(X_test[categorical_col])

X_train_encoded = np.concatenate((X_train_encoded, X_train[numerical_col].values), axis=1)
X_test_encoded = np.concatenate((X_test_encoded, X_test[numerical_col].values), axis=1)

# Feature scaling for the combined dataset
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# fitting linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict Training and testing result
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

#Performance metrices for train and test data
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("One-Hot Encoding Results:")
print("Train Data Metrics:")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"R2 Score: {r2_train:.2f}")
print(f"MAE: {mae_train:.2f}")

print("\nTest Data Metrics:")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"R2 Score: {r2_test:.2f}")
print(f"MAE: {mae_test:.2f}")

One-Hot Encoding Results:
Train Data Metrics:
MSE: 21.89
RMSE: 4.68
R2 Score: 0.99
MAE: 2.89

Test Data Metrics:
MSE: 19.95
RMSE: 4.47
R2 Score: 0.99
MAE: 2.92




In [4]:
#Q3 f. Applying PCA on preprocessed data obtained from one-hot encoding.
from sklearn.decomposition import PCA

for n_components in [2,4,6,8,10]:

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

   #fitting linear regression model
    model = LinearRegression()
    model.fit(X_train_pca, y_train)

    # Predict Training and testing result
    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)

    #Performance metrices for train and test data
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    print(f"\nNumber of Components: {n_components}")
    print("Train Data Metrics:")
    print(f"MSE: {mse_train:.2f}")
    print(f"RMSE: {rmse_train:.2f}")
    print(f"R2 Score: {r2_train:.2f}")
    print(f"MAE: {mae_train:.2f}")

    print("\nTest Data Metrics:")
    print(f"MSE: {mse_test:.2f}")
    print(f"RMSE: {rmse_test:.2f}")
    print(f"R2 Score: {r2_test:.2f}")
    print(f"MAE: {mae_test:.2f}")





Number of Components: 2
Train Data Metrics:
MSE: 553.28
RMSE: 23.52
R2 Score: 0.84
MAE: 16.20

Test Data Metrics:
MSE: 563.67
RMSE: 23.74
R2 Score: 0.83
MAE: 16.40

Number of Components: 4
Train Data Metrics:
MSE: 548.54
RMSE: 23.42
R2 Score: 0.84
MAE: 16.11

Test Data Metrics:
MSE: 562.12
RMSE: 23.71
R2 Score: 0.83
MAE: 16.26

Number of Components: 6
Train Data Metrics:
MSE: 538.08
RMSE: 23.20
R2 Score: 0.84
MAE: 15.84

Test Data Metrics:
MSE: 547.64
RMSE: 23.40
R2 Score: 0.83
MAE: 15.96

Number of Components: 8
Train Data Metrics:
MSE: 540.19
RMSE: 23.24
R2 Score: 0.84
MAE: 15.87

Test Data Metrics:
MSE: 551.15
RMSE: 23.48
R2 Score: 0.83
MAE: 16.00

Number of Components: 10
Train Data Metrics:
MSE: 515.37
RMSE: 22.70
R2 Score: 0.85
MAE: 15.65

Test Data Metrics:
MSE: 533.92
RMSE: 23.11
R2 Score: 0.84
MAE: 15.90
