### Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)  
    Download the dataset regarding USA House Price Prediction from the following link:  
    https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view?usp=sharing  
    Load the dataset and Implement 5- fold cross validation for multiple linear regression 
    (using least square error fit).  
    Steps:  
    a) Divide the dataset into input features (all columns except price) and output variable  
    (price)  
    b) Scale the values of input features.  
    c) Divide input and output features into five folds.  
    d) Run five iterations, in each iteration consider one-fold as test set and remaining 
    four sets as training set. Find the beta (𝛽) matrix, predicted values, and R2_score 
    for each iteration using least square error fit.  
    e) Use the best value of (𝛽) matrix (for which R2_score is maximum), to train the regressor for 70% of the data and test the performance for    remaining 30% data

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import r2_score
#Reading the dataset
dataset=pd.read_csv('USA_Housing.csv')
dataset.head()
#Spliting the Label and the Rest of the features
input_dataset=dataset.drop(columns=['Price'],axis=1).values
label_dataset=dataset['Price'].values
#Scaling the dataset
scaler=StandardScaler()
input_dataset=scaler.fit_transform(input_dataset)
#Applying K folds
kf=KFold(n_splits=5,shuffle=True,random_state=42)
best_beta = None
best_r2 = -np.inf
fold_num = 1
for train_idx,test_idx in kf.split(input_dataset):
    X_train,X_test=input_dataset[train_idx],input_dataset[test_idx]
    Y_train,Y_test=label_dataset[train_idx],label_dataset[test_idx]
    #Adding biasing in the dataset the dataset becomes of the form->
    """
    [[1,x11,x12,....]
     [1,x21,x22,....]
     [1,x31,x32,....]]
    """
    X_train_with_intercept = np.c_[np.ones(len(X_train)), X_train]
    X_test_with_intercept = np.c_[np.ones(len(X_test)), X_test]
    #Applying the formula ((X^T * X)^-1) * (X^T)*Y
    beta = np.linalg.inv(X_train_with_intercept.T @ X_train_with_intercept) @ (X_train_with_intercept.T @ Y_train)
    Y_pred=X_test_with_intercept @ beta
    r2=r2_score(Y_test,Y_pred)
    if r2 > best_r2:
        best_r2=r2
        best_beta=beta
    fold_num+=1

X_train70, X_test30, y_train70, y_test30 = train_test_split(input_dataset, label_dataset, test_size=0.3, random_state=42)
X_test30_with_intercept = np.c_[np.ones(len(X_test30)), X_test30]
y_pred30 = X_test30_with_intercept @ best_beta
final_r2 = r2_score(y_test30, y_pred30)
print("\nFinal Evaluation with best beta on 70/30 split")
print("R2 Score on test set:", final_r2)
print("Best Beta Matrix:\n", best_beta)


Final Evaluation with best beta on 70/30 split
R2 Score on test set: 0.9147458156636434
Best Beta Matrix:
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


#### Q2 Concept of Validation set for Multiple Linear Regression (Gradient Descent Optimization)  
    Consider the same dataset of Q1, rather than dividing the dataset into five folds, divide the dataset into training set (56%), validation set (14%), and test set (30%).  
    
    Consider four different values of learning rate i.e. {0.001,0.01,0.1,1}. Compute the values of 
    regression coefficients for each value of learning rate after 1000 iterations.  
    For each set of regression coefficients, compute R2 score for validation and test set and find the best value of regressor coefficients

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
df = pd.read_csv("USA_Housing.csv")

# Separate features and target
X = df.drop(columns=["Price"]).values
y = df["Price"].values.reshape(-1,1)

#Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

#Adding the ones column to the matrix
X = np.c_[np.ones((X.shape[0], 1)), X]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=30/44, random_state=42)

#Gradient Descent fo MLR
def gradient(X,Y,alpha,iterations):
    m,n=X.shape
    beta= np.zeros((n,1))
    for _ in range(iterations):
        y_beta=X @ beta
        err=y_beta-Y
        for j in range(n):
            grad_j = (1.0/m) * np.sum(err * X[:, [j]])
            beta[j,0]=beta[j,0]-alpha*grad_j
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
best_beta = None
best_alpha = None
best_val_r2 = -np.inf

for alpha in learning_rates:
    beta = gradient(X_train, y_train, alpha=alpha, iterations=1000)
    y_val_pred  = X_val  @ beta
    y_test_pred = X_test @ beta  
    val_r2  = r2_score(y_val,  y_val_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    if val_r2 > best_val_r2:
        best_val_r2 = val_r2
        best_beta = beta
        best_alpha = alpha

print("\nBest learning rate (by validation R²):", best_alpha)
print("Best validation R²:", best_val_r2)



Best learning rate (by validation R²): 0.1
Best validation R²: 0.9199649194854793


###   
Download the dataset regarding Car Price Prediction from the following link:  
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data  
1. Load the dataset with following column names ["symboling", "normalized_losses",  
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",  
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",  
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",  
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]  
and replace all ? values with NaN  
2. Replace all NaN values with central tendency imputation. Drop the rows with NaN  
values in price column  
3. There are 10 columns in the dataset with non-numeric values. Convert these values to  
numeric values using following scheme:  
(i) For “num_doors” and “num_cylinders”: convert words (number names) to figures  
for e.g., two to 2  
(ii) For "body_style", "drive_wheels": use dummy encoding scheme  
(iii) For “make”, “aspiration”, “engine_location”,fuel_type: use label encoding  
scheme  
(iv) For fuel_system: replace values containing string pfi to 1 else all values to 0.  
(v) For engine_type: replace values containing string ohc to 1 else all values to 0.  
4. Divide the dataset into input features (all columns except price) and output variable  
(price). Scale all input features.  
5. Train a linear regressor on 70% of data (using inbuilt linear regression function of  
Python) and test its performance on remaining 30% of data.  
6. Reduce the dimensionality of the feature set using inbuilt PCA decomposition and then  
again train a linear regressor on 70% of reduced data (using inbuilt linear regression  
function of Python). Does it lead to any performance improvement on test set?  

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA

# 1. Loading the dataset
cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
        "num_doors", "body_style", "drive_wheels", "engine_location",
        "wheel_base", "length", "width", "height", "curb_weight",
        "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore",
        "stroke", "compression_ratio", "horsepower", "peak_rpm",
        "city_mpg", "highway_mpg", "price"]

df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=cols
)

#Replacing the ? with NAN in the dataset
df.replace("?", np.nan, inplace=True)

#Replacing the categorical variables with mode and other with mean
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

#Filtering out the non null rows
df = df[df['price'].notna()]
numeric_cols = ["symboling", "normalized_losses", "wheel_base", "length", "width",
                "height", "curb_weight", "engine_size", "bore", "stroke",
                "compression_ratio", "horsepower", "peak_rpm",
                "city_mpg", "highway_mpg", "price"]
df[numeric_cols] = df[numeric_cols].astype(float)

#Converting to numeric values
num_map = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
df["num_doors"] = df["num_doors"].replace(num_map)
df["num_cylinders"] = df["num_cylinders"].replace(num_map)

#Dummy encoding
df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

#Label encoding
for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    df[col] = LabelEncoder().fit_transform(df[col])
df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in x.lower() else 0)
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in x.lower() else 0)


#Splitting the dataset
X = df.drop(columns=["price"]).values
y = df["price"].values

# Scaling input features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train Linear Regression without PCA
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Without PCA:")
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# PCA decomposition
pca = PCA(n_components=0.95)  # keeping 95% variance
X_pca = pca.fit_transform(X)

Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_pca, y, test_size=0.3, random_state=42
)

lr_pca = LinearRegression()
lr_pca.fit(Xp_train, yp_train)
yp_pred = lr_pca.predict(Xp_test)

print("\nWith PCA:")
print("R² Score:", r2_score(yp_test, yp_pred))
print("MSE:", mean_squared_error(yp_test, yp_pred))

Without PCA:
R² Score: 0.7895045576733848
MSE: 14448999.01183785

With PCA:
R² Score: 0.7478420860380318
MSE: 17308828.207359686


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df["num_doors"] = df["num_doors"].replace(num_map)
  df["num_cylinders"] = df["num_cylinders"].replace(num_map)
