In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [69]:
df= pd.read_csv("https://raw.githubusercontent.com/Micahgs/DATA-science-asiignment-datasets/refs/heads/main/ToyotaCorolla%20-%20MLR.csv")

In [70]:
print("Shape of data:", df.shape)
print("\nMissing values:\n", df.isnull().sum())
print("\nData types:\n", df.dtypes)
print("\nSummary statistics:\n", df.describe())

Shape of data: (1436, 11)

Missing values:
 Price        0
Age_08_04    0
KM           0
Fuel_Type    0
HP           0
Automatic    0
cc           0
Doors        0
Cylinders    0
Gears        0
Weight       0
dtype: int64

Data types:
 Price         int64
Age_08_04     int64
KM            int64
Fuel_Type    object
HP            int64
Automatic     int64
cc            int64
Doors         int64
Cylinders     int64
Gears         int64
Weight        int64
dtype: object

Summary statistics:
               Price    Age_08_04             KM           HP    Automatic  \
count   1436.000000  1436.000000    1436.000000  1436.000000  1436.000000   
mean   10730.824513    55.947075   68533.259749   101.502089     0.055710   
std     3626.964585    18.599988   37506.448872    14.981080     0.229441   
min     4350.000000     1.000000       1.000000    69.000000     0.000000   
25%     8450.000000    44.000000   43000.000000    90.000000     0.000000   
50%     9900.000000    61.000000   63389.50000

# Encoding on Categorical column

In [71]:
df = pd.get_dummies(df, columns=['Fuel_Type'], drop_first=True)

In [72]:
df[['Fuel_Type_Diesel', 'Fuel_Type_Petrol']] = df[['Fuel_Type_Diesel', 'Fuel_Type_Petrol']].astype(int)


In [73]:
print(df.dtypes)


Price               int64
Age_08_04           int64
KM                  int64
HP                  int64
Automatic           int64
cc                  int64
Doors               int64
Cylinders           int64
Gears               int64
Weight              int64
Fuel_Type_Diesel    int64
Fuel_Type_Petrol    int64
dtype: object


# Splitting the data

In [74]:
x = df.drop("Price",axis=1)
y = df['Price']
x

Unnamed: 0,Age_08_04,KM,HP,Automatic,cc,Doors,Cylinders,Gears,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol
0,23,46986,90,0,2000,3,4,5,1165,1,0
1,23,72937,90,0,2000,3,4,5,1165,1,0
2,24,41711,90,0,2000,3,4,5,1165,1,0
3,26,48000,90,0,2000,3,4,5,1165,1,0
4,30,38500,90,0,2000,3,4,5,1170,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1431,69,20544,86,0,1300,3,4,5,1025,0,1
1432,72,19000,86,0,1300,3,4,5,1015,0,1
1433,71,17016,86,0,1300,3,4,5,1015,0,1
1434,70,16916,86,0,1300,3,4,5,1015,0,1


# Normalize numerical features

In [75]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)
X_scaled

Unnamed: 0,Age_08_04,KM,HP,Automatic,cc,Doors,Cylinders,Gears,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol
0,-1.771966,-0.574695,-0.768042,-0.242893,0.997419,-1.085139,0.0,-0.140425,1.758561,2.874807,-2.710874
1,-1.771966,0.117454,-0.768042,-0.242893,0.997419,-1.085139,0.0,-0.140425,1.758561,2.874807,-2.710874
2,-1.718184,-0.715386,-0.768042,-0.242893,0.997419,-1.085139,0.0,-0.140425,1.758561,2.874807,-2.710874
3,-1.610620,-0.547650,-0.768042,-0.242893,0.997419,-1.085139,0.0,-0.140425,1.758561,2.874807,-2.710874
4,-1.395491,-0.801028,-0.768042,-0.242893,0.997419,-1.085139,0.0,-0.140425,1.853577,2.874807,-2.710874
...,...,...,...,...,...,...,...,...,...,...,...
1431,0.702015,-1.279939,-1.035138,-0.242893,-0.652594,-1.085139,0.0,-0.140425,-0.901883,-0.347849,0.368885
1432,0.863362,-1.321120,-1.035138,-0.242893,-0.652594,-1.085139,0.0,-0.140425,-1.091915,-0.347849,0.368885
1433,0.809579,-1.374036,-1.035138,-0.242893,-0.652594,-1.085139,0.0,-0.140425,-1.091915,-0.347849,0.368885
1434,0.755797,-1.376703,-1.035138,-0.242893,-0.652594,-1.085139,0.0,-0.140425,-1.091915,-0.347849,0.368885


# Train test split

In [76]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Building models**

# MLR

In [77]:
lr = LinearRegression()
lr.fit(x_train, y_train)

# Ridge Regression

In [78]:
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)

# Lasso Regression

In [79]:
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)

In [80]:
import pandas as pd

coef_df = pd.DataFrame({
    'Feature': x.columns,
    'Linear Coef': lr.coef_,
    'Ridge Coef': ridge.coef_,
    'Lasso Coef': lasso.coef_
})

print(coef_df)



             Feature   Linear Coef   Ridge Coef   Lasso Coef
0          Age_08_04 -1.208305e+02  -120.779659  -120.827216
1                 KM -1.623141e-02    -0.016324    -0.016241
2                 HP  1.403948e+01    14.141834    14.091453
3          Automatic  1.488309e+02   146.907470   147.287981
4                 cc -3.037219e-02    -0.030509    -0.030575
5              Doors -6.031097e+01   -59.905791   -60.147193
6          Cylinders -1.620037e-12     0.000000     0.000000
7              Gears  5.516007e+02   542.272328   548.658386
8             Weight  2.588496e+01    25.821019    25.868659
9   Fuel_Type_Diesel -6.854876e+01  -128.813072   -64.924880
10  Fuel_Type_Petrol  1.370809e+03  1294.949826  1370.167198


In [81]:


def evaluate_model(model, name):
    y_pred = model.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f" {name}")
    print(f"R² Score : {r2:.3f}")
    print(f"MSE      : {mse:.2f}")
    print(f"RMSE     : {rmse:.2f}\n")

# Evaluate all models
evaluate_model(lr, "Linear Regression")
evaluate_model(ridge, "Ridge Regression")
evaluate_model(lasso, "Lasso Regression")


 Linear Regression
R² Score : 0.835
MSE      : 2203043.82
RMSE     : 1484.27

 Ridge Regression
R² Score : 0.835
MSE      : 2199746.37
RMSE     : 1483.15

 Lasso Regression
R² Score : 0.835
MSE      : 2202270.26
RMSE     : 1484.00



# Interview Questions

# **Q1: What is Normalization & Standardization and how is it helpful**?
Normalization and standardization are techniques used to bring all the numerical features of a dataset to a similar scale, which helps improve the performance and stability of machine learning models.

**Standardization** means adjusting the values of features so that they have a mean of 0 and a standard deviation of 1. It makes the data centered and spread out evenly.


**Normalization** means scaling all values of a feature to fall between 0 and 1. It is commonly used when we want to keep all values within a fixed range.

# **2: What techniques can be used to address multicollinearity in multiple linear regression?**

Multicollinearity happens when two or more independent variables in a regression model are highly related to each other. This can cause problems like unstable or misleading coefficients, which makes it hard to understand which variables are actually important.

**Here are some common techniques to handle multicollinearity:**

Remove one of the correlated features: If two columns are giving similar information, we can keep one and drop the other.

Use VIF (Variance Inflation Factor): This is a statistical method used to detect multicollinearity. If a feature has a high VIF value (usually above 5 or 10), it may be causing multicollinearity and should be considered for removal.

Apply Ridge Regression: Ridge is a variation of linear regression that can handle multicollinearity well by shrinking the coefficients of correlated variables.

Use Lasso Regression: Lasso can help by reducing the importance of less useful variables to zero, which simplifies the model and reduces multicollinearity.

# **Assumptions Made During the Analysis and Their Implications**
**1. Linearity**
We assume that there is a linear relationship between the independent variables (features) and the dependent variable (price). This means we expect the change in price to be directly proportional to the change in features like age, kilometers driven, horsepower, etc.

Implication: If this assumption is violated, linear regression may not be the right model, and we might need to consider non-linear models or add polynomial terms.

**2. Independence of Observations**
Each observation (row in the dataset) is assumed to be independent of the others. In other words, the value of one record should not affect another.

Implication: If the observations are dependent (e.g., multiple rows from the same car), it can lead to biased results.

**3. Homoscedasticity**
We assume that the variance of the errors (residuals) is constant across all levels of the independent variables. This means that the spread of the residuals should not change as the value of the features changes.

Implication: If the residuals increase or decrease in a pattern, it may affect the accuracy of predictions and confidence intervals.

**4. No Multicollinearity**
We assume that the independent variables are not highly correlated with each other. High correlation between features can confuse the model and make the coefficients unstable.

Implication: Multicollinearity can make it difficult to understand which features actually affect the outcome. This was handled using Ridge and Lasso regression, which help to reduce the impact of multicollinearity.
