In [1]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [3]:
from google.colab import files
uploaded =files.upload()

Saving CO2 Emissions.csv to CO2 Emissions.csv


In [10]:
# Q3. Part c :Label Based Encoding . Then calculating MSE, RMSE, R2 score, Adjusted R2 score, MAE on Training and Testing data
data = pd.read_csv('CO2 Emissions.csv')

categorical_col = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']
numerical_col = ['Engine Size(L)', 'Cylinders','Fuel Consumption City (L/100 km)','Fuel Consumption Hwy (L/100 km)','Fuel Consumption Comb (L/100 km)','Fuel Consumption Comb (mpg)']

X = data[categorical_col + numerical_col]
y = data['CO2 Emissions(g/km)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

#Label encoder on (training and testing data) for categorical features
label_encoder = LabelEncoder()
for col in categorical_col:
    combined_data = pd.concat([X_train[col], X_test[col]], axis=0)
    label_encoder.fit(combined_data)
    X_train[col] = label_encoder.transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])

#Feature scaling for numerical columns
scaler = StandardScaler()
X_train[numerical_col] = scaler.fit_transform(X_train[numerical_col])
X_test[numerical_col] = scaler.transform(X_test[numerical_col])

#fitting linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict Training and testing result
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

#Performance metrices for train and test data
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("Train Data Metrics:")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"R2 Score: {r2_train:.2f}")
print(f"MAE: {mae_train:.2f}")

print("\nTest Data Metrics:")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"R2 Score: {r2_test:.2f}")
print(f"MAE: {mae_test:.2f}")


Train Data Metrics:
MSE: 285.59
RMSE: 16.90
R2 Score: 0.92
MAE: 11.01

Test Data Metrics:
MSE: 296.58
RMSE: 17.22
R2 Score: 0.91
MAE: 11.22


In [12]:
#Q3. Part d : Performing PCA after doing Label Based Encoding .Then varying no of components and  calculating MSE, RMSE, R2 score, Adjusted R2 score, MAE on Training and Testing data
from sklearn.decomposition import PCA

for n_components in [2, 4, 6,8,10]:

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    #fitting linear regression model
    model = LinearRegression()
    model.fit(X_train_pca, y_train)

    # Predict Training and testing result
    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)

    #Performance metrices for train and test data
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    print(f"\nNumber of Components: {n_components}")
    print("TRAIN DATA METRICES :")
    print(f"MSE: {mse_train:.2f}")
    print(f"RMSE: {rmse_train:.2f}")
    print(f"R2 Score: {r2_train:.2f}")
    print(f"MAE: {mae_train:.2f}")

    print("\nTest DATA METRICES:")
    print(f"MSE: {mse_test:.2f}")
    print(f"RMSE: {rmse_test:.2f}")
    print(f"R2 Score: {r2_test:.2f}")
    print(f"MAE: {mae_test:.2f}")


Number of Components: 2
TRAIN DATA METRICES :
MSE: 3285.24
RMSE: 57.32
R2 Score: 0.05
MAE: 45.26

Test DATA METRICES:
MSE: 3106.68
RMSE: 55.74
R2 Score: 0.06
MAE: 43.97

Number of Components: 4
TRAIN DATA METRICES :
MSE: 2758.28
RMSE: 52.52
R2 Score: 0.20
MAE: 40.93

Test DATA METRICES:
MSE: 2627.42
RMSE: 51.26
R2 Score: 0.20
MAE: 39.73

Number of Components: 6
TRAIN DATA METRICES :
MSE: 306.78
RMSE: 17.52
R2 Score: 0.91
MAE: 12.11

Test DATA METRICES:
MSE: 314.19
RMSE: 17.73
R2 Score: 0.90
MAE: 12.28

Number of Components: 8
TRAIN DATA METRICES :
MSE: 286.56
RMSE: 16.93
R2 Score: 0.92
MAE: 11.03

Test DATA METRICES:
MSE: 298.13
RMSE: 17.27
R2 Score: 0.91
MAE: 11.27

Number of Components: 10
TRAIN DATA METRICES :
MSE: 285.67
RMSE: 16.90
R2 Score: 0.92
MAE: 11.01

Test DATA METRICES:
MSE: 296.45
RMSE: 17.22
R2 Score: 0.91
MAE: 11.22


In [15]:
#Q3. g. Doing L1 & L2 regularization
from sklearn.linear_model import Lasso, Ridge

#L1 (Lasso) regularization
lasso_m = Lasso(alpha=1.0)
lasso_m.fit(X_train, y_train)
lasso_pred = lasso_m.predict(X_test)

#L2 (Ridge) regularization
ridge_m = Ridge(alpha=1.0)
ridge_m.fit(X_train, y_train)
ridge_pred = ridge_m.predict(X_test)

#Performance metrices for train and test data Lasso regularization
l_mse = mean_squared_error(y_test, lasso_pred)
l_rmse = np.sqrt(l_mse)
l_r2 = r2_score(y_test, lasso_pred)
l_adjusted_r2 = 1 - (1 - l_r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
l_mae = mean_absolute_error(y_test, lasso_pred)

#Performance metrices for train and test data Ridge regularization
r_mse = mean_squared_error(y_test, ridge_pred)
r_rmse = np.sqrt(r_mse)
r_r2 = r2_score(y_test, ridge_pred)
r_adjusted_r2 = 1 - (1 - r_r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
r_mae = mean_absolute_error(y_test, ridge_pred)

# Print the results
print("\nLasso Regression Metrics:")
print(f"MSE: {l_mse:.2f}")
print(f"RMSE: {l_rmse:.2f}")
print(f"R-squared: {l_r2:.2f}")
print(f"Adjusted R-squared: {l_adjusted_r2:.2f}")
print(f"MAE: {l_mae:.2f}")

print("\nRidge Regression Metrics:")
print(f"MSE: {r_mse:.2f}")
print(f"RMSE: {r_rmse:.2f}")
print(f"R-squared: {r_r2:.2f}")
print(f"Adjusted R-squared: {r_adjusted_r2:.2f}")
print(f"MAE: {r_mae:.2f}")


Lasso Regression Metrics:
MSE: 296.66
RMSE: 17.22
R-squared: 0.91
Adjusted R-squared: 0.91
MAE: 11.29

Ridge Regression Metrics:
MSE: 296.49
RMSE: 17.22
R-squared: 0.91
Adjusted R-squared: 0.91
MAE: 11.22


  model = cd_fast.enet_coordinate_descent(


In [16]:
#Q3. h. SGD Regressor and calculating MSE, RMSE, R2 score, Adjusted R2 score, MAE on Training and Testing data
from sklearn.linear_model import SGDRegressor

sgd_m = SGDRegressor(max_iter=1000, random_state=40)
sgd_m.fit(X_train, y_train)
sgd_pred = sgd_m.predict(X_test)

#Performance metrices for train and test data
sgd_mse = mean_squared_error(y_test, sgd_pred)
sgd_rmse = np.sqrt(sgd_mse)
sgd_r2 = r2_score(y_test, sgd_pred)
sgd_mae = mean_absolute_error(y_test, sgd_pred)

print("SGDRegressor Metrics:")
print(f"MSE: {sgd_mse:.2f}")
print(f"RMSE: {sgd_rmse:.2f}")
print(f"R2 Score: {sgd_r2:.2f}")
print(f"MAE: {sgd_mae:.2f}")

SGDRegressor Metrics:
MSE: 512766196426194926597933367296.00
RMSE: 716076948676743.12
R2 Score: -155391393010968472550637568.00
MAE: 621530874521764.12
