In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge

# Load the dataset
data = pd.read_csv(r"C:\Users\admin\Downloads\MLR\MLR\ToyotaCorolla - MLR.csv")

# Encode categorical features
le = LabelEncoder()
data['Fuel_Type'] = le.fit_transform(data['Fuel_Type'])
data['Automatic'] = le.fit_transform(data['Automatic'])

# Split the data into training and testing sets
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model 1: All Features
model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

# Model 2: Feature Selection
X_train_reduced = X_train[['Age_08_04', 'KM', 'HP', 'cc']]
X_test_reduced = X_test[['Age_08_04', 'KM', 'HP', 'cc']]
model2 = LinearRegression()
model2.fit(X_train_reduced, y_train)
y_pred2 = model2.predict(X_test_reduced)

# Model 3: Feature Transformation
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
model3 = LinearRegression()
model3.fit(X_train_poly, y_train)
y_pred3 = model3.predict(X_test_poly)

# Evaluate models
mse1 = mean_squared_error(y_test, y_pred1)
r2_1 = r2_score(y_test, y_pred1)
print("Model 1:")
print("Mean Squared Error:", mse1)
print("R-squared:", r2_1)

# Apply Lasso and Ridge Regression
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_test)

ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)


Model 1:
Mean Squared Error: 2096851.1400177048
R-squared: 0.8428476112018002


In [1]:
# Explanation of Normalization & Standardization
print("\nNormalization scales data to a fixed range (0,1), while Standardization transforms data to have zero mean and unit variance.")
print("\nStandardization is useful for regression models to ensure features contribute equally and improve numerical stability.")

# Explanation of Multicollinearity Handling
print("\nTechniques to address multicollinearity:")
print("1. Variance Inflation Factor (VIF) to detect high correlation between features.")
print("2. Removing correlated features based on a correlation matrix.")
print("3. Principal Component Analysis (PCA) to reduce dimensionality.")
print("4. Ridge Regression (L2 Regularization) to shrink coefficients.")
print("5. Lasso Regression (L1 Regularization) to eliminate some features.")


Normalization scales data to a fixed range (0,1), while Standardization transforms data to have zero mean and unit variance.

Standardization is useful for regression models to ensure features contribute equally and improve numerical stability.

Techniques to address multicollinearity:
1. Variance Inflation Factor (VIF) to detect high correlation between features.
2. Removing correlated features based on a correlation matrix.
3. Principal Component Analysis (PCA) to reduce dimensionality.
4. Ridge Regression (L2 Regularization) to shrink coefficients.
5. Lasso Regression (L1 Regularization) to eliminate some features.
