In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Read 'ToyotaCorolla - MLR.csv' (path relative to the working directory)
# into the DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='ToyotaCorolla - MLR.csv')

In [8]:
# Step 1: Exploratory Data Analysis (EDA)
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line; passing it to print() would emit the report
# first and then a stray "None" after the label.
print("Data Info:")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1436 non-null   int64 
 1   Age_08_04  1436 non-null   int64 
 2   KM         1436 non-null   int64 
 3   Fuel_Type  1436 non-null   object
 4   HP         1436 non-null   int64 
 5   Automatic  1436 non-null   int64 
 6   cc         1436 non-null   int64 
 7   Doors      1436 non-null   int64 
 8   Cylinders  1436 non-null   int64 
 9   Gears      1436 non-null   int64 
 10  Weight     1436 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 123.5+ KB
Data Info:
 None


In [10]:
# Descriptive statistics (count, mean, std, quartiles, ...) as returned by
# DataFrame.describe(), printed under a heading.
summary_stats = data.describe()
print("\nData Summary:\n", summary_stats)


Data Summary:
               Price    Age_08_04             KM           HP    Automatic  \
count   1436.000000  1436.000000    1436.000000  1436.000000  1436.000000   
mean   10730.824513    55.947075   68533.259749   101.502089     0.055710   
std     3626.964585    18.599988   37506.448872    14.981080     0.229441   
min     4350.000000     1.000000       1.000000    69.000000     0.000000   
25%     8450.000000    44.000000   43000.000000    90.000000     0.000000   
50%     9900.000000    61.000000   63389.500000   110.000000     0.000000   
75%    11950.000000    70.000000   87020.750000   110.000000     0.000000   
max    32500.000000    80.000000  243000.000000   192.000000     1.000000   

                cc        Doors  Cylinders        Gears      Weight  
count   1436.00000  1436.000000     1436.0  1436.000000  1436.00000  
mean    1576.85585     4.033426        4.0     5.026462  1072.45961  
std      424.38677     0.952677        0.0     0.188510    52.64112  
min     13

In [12]:
# List the distinct values found in the Fuel_Type and Automatic columns.
fuel_type_levels = data['Fuel_Type'].unique()
print("\nUnique values in Fuel_Type:\n", fuel_type_levels)
automatic_levels = data['Automatic'].unique()
print("Unique values in Automatic:\n", automatic_levels)


Unique values in Fuel_Type:
 ['Diesel' 'Petrol' 'CNG']
Unique values in Automatic:
 [0 1]


In [14]:
# Step 2: Data Preprocessing
# One-hot encode Fuel_Type; drop_first=True omits the first level so the
# dummy columns are not redundant with each other.
# The guard makes this cell safe to re-run: `data` is overwritten with the
# encoded frame, so on a second run Fuel_Type no longer exists and an
# unguarded get_dummies call would raise a KeyError.
if 'Fuel_Type' in data.columns:
    data = pd.get_dummies(data, columns=['Fuel_Type'], drop_first=True)

In [16]:
# Target is the Price column; the feature matrix is every remaining column.
y = data['Price']
X = data.drop(columns=['Price'])

In [18]:
# Sanity check: show the feature column names and their dtypes once the
# Fuel_Type dummies have been added.
encoded_columns = X.columns
print("\nX columns after encoding:\n", encoded_columns)
encoded_dtypes = X.dtypes
print("\nData types after encoding:\n", encoded_dtypes)


X columns after encoding:
 Index(['Age_08_04', 'KM', 'HP', 'Automatic', 'cc', 'Doors', 'Cylinders',
       'Gears', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

Data types after encoding:
 Age_08_04           int64
KM                  int64
HP                  int64
Automatic           int64
cc                  int64
Doors               int64
Cylinders           int64
Gears               int64
Weight              int64
Fuel_Type_Diesel     bool
Fuel_Type_Petrol     bool
dtype: object


In [20]:
# Names of the columns that will be standardized by the preprocessor.
# Columns not listed here are not scaled.
numeric_features = [
    'Age_08_04',
    'KM',
    'HP',
    'cc',
    'Weight',
]

In [22]:
# Column-wise preprocessing: apply StandardScaler to the columns named in
# numeric_features and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    [('num', StandardScaler(), numeric_features)],
    remainder='passthrough',
)

In [24]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [28]:
# Print the per-column dtypes of both splits under their headings.
for split_heading, split_frame in (
    ("\nX_train columns and types:\n", X_train),
    ("X_test columns and types:\n", X_test),
):
    print(split_heading, split_frame.dtypes)


X_train columns and types:
 Age_08_04           int64
KM                  int64
HP                  int64
Automatic           int64
cc                  int64
Doors               int64
Cylinders           int64
Gears               int64
Weight              int64
Fuel_Type_Diesel     bool
Fuel_Type_Petrol     bool
dtype: object
X_test columns and types:
 Age_08_04           int64
KM                  int64
HP                  int64
Automatic           int64
cc                  int64
Doors               int64
Cylinders           int64
Gears               int64
Weight              int64
Fuel_Type_Diesel     bool
Fuel_Type_Petrol     bool
dtype: object


In [30]:
# Step 3: Model Pipeline Setup and Training
# Model 1: Basic Linear Regression with all features
# With caching disabled (the default), Pipeline.fit fits the step objects it
# was handed rather than copies. clone() gives this pipeline its own unfitted
# copy of the `preprocessor` configuration, so the module-level `preprocessor`
# object is never fitted or mutated by this cell.
pipeline_lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression()),
])
pipeline_lr.fit(X_train, y_train)
y_pred_lr = pipeline_lr.predict(X_test)

In [32]:
print("\nModel 1: Basic Linear Regression")
# Report each test-set metric for the linear regression predictions as
# "<label> <value>".
for metric_label, metric_fn in (
    ("MSE:", mean_squared_error),
    ("R²:", r2_score),
    ("MAE:", mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred_lr))


Model 1: Basic Linear Regression
MSE: 2203043.823143704
R²: 0.8348888040611082
MAE: 990.887273919397


In [34]:
# Step 4: Apply Lasso and Ridge Regularization
# Lasso Regression
# With caching disabled (the default), Pipeline.fit fits the step objects it
# was handed rather than copies. clone() gives this pipeline its own unfitted
# copy of the `preprocessor` configuration, so fitting here cannot re-fit a
# transformer object that another pipeline is also holding.
pipeline_lasso = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', Lasso(alpha=0.1)),
])
pipeline_lasso.fit(X_train, y_train)
y_pred_lasso = pipeline_lasso.predict(X_test)

In [36]:
print("\nLasso Regression")
# Report each test-set metric for the Lasso predictions as "<label> <value>".
for metric_label, metric_fn in (
    ("MSE:", mean_squared_error),
    ("R²:", r2_score),
    ("MAE:", mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred_lasso))


Lasso Regression
MSE: 2202262.9135491783
R²: 0.8349473307757008
MAE: 990.8973956603818


In [38]:
# Ridge Regression
# With caching disabled (the default), Pipeline.fit fits the step objects it
# was handed rather than copies. clone() gives this pipeline its own unfitted
# copy of the `preprocessor` configuration, so fitting here cannot re-fit a
# transformer object that another pipeline is also holding.
pipeline_ridge = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', Ridge(alpha=1.0)),
])
pipeline_ridge.fit(X_train, y_train)
y_pred_ridge = pipeline_ridge.predict(X_test)

In [40]:
print("\nRidge Regression")
# Report each test-set metric for the Ridge predictions as "<label> <value>".
for metric_label, metric_fn in (
    ("MSE:", mean_squared_error),
    ("R²:", r2_score),
    ("MAE:", mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred_ridge))


Ridge Regression
MSE: 2198100.663084508
R²: 0.8352592783387948
MAE: 990.7191037857042
