# HW1
Based on the examples provided, make your own class for implementing locally weighted regression to work with multiple features, and also train and test data. Show an application to a real data set with your implementation, and present the 10-fold cross-validated mean square error.


In [1]:
# Libraries of functions need to be imported
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from sklearn.linear_model import Ridge

## Setting Up
- Importing data
- Setting X and y variables
- Test train split
- Scaling

In [2]:
# Load the cars dataset from 'cars.csv' (path is relative to the notebook's
# working directory) and preview the first rows. Later cells use the MPG
# column as the regression target and the remaining columns as features.
data = pd.read_csv('cars.csv')
data.head()

Unnamed: 0,MPG,CYL,ENG,WGT
0,18.0,8,307.0,3504
1,15.0,8,350.0,3693
2,18.0,8,318.0,3436
3,16.0,8,304.0,3433
4,17.0,8,302.0,3449


In [3]:
# Feature matrix: every column except MPG; target vector: the MPG column.
# Both are pulled out of the DataFrame as plain NumPy arrays.
X = data.drop(columns='MPG').to_numpy()
y = data['MPG'].to_numpy()

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Min-max scaling: learn the per-feature min/max from the training features
# only, then apply that same transformation to both the train and test features
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Creating class to implement Locally Weighted Regression

In [6]:
class Lowess:
    """Locally weighted linear regression for data with multiple features.

    For every query point a separate weighted least-squares linear model
    (with an intercept) is fitted to the training data. Each training row is
    weighted by a kernel applied to its Euclidean distance from the query
    point, after dividing that distance by ``2 * tau``.

    Parameters
    ----------
    tau : float
        Bandwidth. Distances are divided by ``2 * tau`` before the kernel is
        evaluated.
    kernel_method : str
        Name of the kernel method on this class to use: ``'Gaussian'``,
        ``'Tricubic'``, ``'Epanechnikov'`` or ``'Quartic'``. The name is
        resolved with ``getattr`` when weights are computed.
    """

    def __init__(self, tau, kernel_method):
        self.tau = tau
        self.kernel_method = kernel_method

    # Defining Kernels
    def Gaussian(self, x):
        """Standard normal density of ``x``, set to 0 where ``|x| > 4``."""
        return np.where(np.abs(x) > 4, 0, 1 / (np.sqrt(2 * np.pi)) * np.exp(-0.5 * x**2))

    def Tricubic(self, x):
        """Tricube kernel ``(1 - |x|^3)^3``, set to 0 where ``|x| > 1``."""
        return np.where(np.abs(x) > 1, 0, (1 - np.abs(x)**3)**3)

    def Epanechnikov(self, x):
        """Epanechnikov kernel ``3/4 * (1 - x^2)``, set to 0 where ``|x| > 1``."""
        return np.where(np.abs(x) > 1, 0, 3 / 4 * (1 - x**2))

    def Quartic(self, x):
        """Quartic kernel ``15/16 * (1 - x^2)^2``, set to 0 where ``|x| > 1``."""
        return np.where(np.abs(x) > 1, 0, 15 / 16 * (1 - x**2)**2)

    def _compute_weights(self, X_train, x0):
        """Return one kernel weight per training row for the query point ``x0``.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        x0 : array-like of shape (n_features,)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        # Euclidean distance between the query point x0 and every training row
        distances = cdist([x0], X_train, metric='euclidean').flatten()
        # Look up the kernel method named by self.kernel_method
        kernel_function = getattr(self, self.kernel_method)
        # Apply the kernel to the bandwidth-scaled distances
        return kernel_function(distances / (2 * self.tau))

    def _locally_weighted_regression(self, X_train, y_train, x0):
        """Solve the weighted least-squares problem centred on ``x0``.

        Returns the coefficient vector ``theta`` whose first entry is the
        intercept and whose remaining entries follow the column order of
        ``X_train``.

        Note: when every weight is zero (possible with the compactly supported
        kernels and a small ``tau``) the normal matrix is all zeros, so its
        pseudo-inverse and therefore ``theta`` are all zeros as well.
        """
        weights = self._compute_weights(X_train, x0)
        # Prepend a column of ones so theta[0] acts as the intercept
        X_train_augmented = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
        # X^T W computed by broadcasting the weights over the sample axis; this
        # avoids materialising the dense n x n diagonal weight matrix
        weighted_XT = X_train_augmented.T * weights
        # Weighted normal equations; pinv tolerates a singular normal matrix
        theta = np.linalg.pinv(weighted_XT @ X_train_augmented) @ (weighted_XT @ y_train)
        return theta

    def fit_and_predict(self, X_train, y_train, X_test):
        """Fit a local linear model at each test row and return its prediction.

        Parameters
        ----------
        X_train : array-like of shape (n_train, n_features)
        y_train : array-like of shape (n_train,)
        X_test : array-like of shape (n_test, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_test,)
            Prediction for each row of ``X_test``.
        """
        # Accept lists and other array-likes as well as ndarrays
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        X_test = np.asarray(X_test, dtype=float)

        predictions = np.zeros(X_test.shape[0])
        for i, x0 in enumerate(X_test):
            theta = self._locally_weighted_regression(X_train, y_train, x0)
            # Augment the query point with a leading 1 to match the intercept
            X0_augmented = np.hstack(([1], x0))
            predictions[i] = X0_augmented @ theta

        return predictions

    @staticmethod
    def cross_validate(X_train, y_train, model_class, tau, kernel_method, n_splits=10):
        """Return the mean K-fold cross-validated MSE of ``model_class``.

        Declared as a static method so it can be called either on the class
        (``Lowess.cross_validate(...)``) or on an instance without the instance
        being bound to ``X_train``.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Used as given; no scaling is applied inside this function.
        y_train : array-like of shape (n_samples,)
        model_class : callable
            Called as ``model_class(tau=tau, kernel_method=kernel_method)``;
            the result must provide ``fit_and_predict(X_train, y_train, X_test)``.
        tau : float
            Bandwidth forwarded to ``model_class``.
        kernel_method : str
            Kernel name forwarded to ``model_class``.
        n_splits : int, default 10
            Number of folds.

        Returns
        -------
        float
            Mean of the per-fold mean squared errors.
        """
        # Index-based fold selection below requires ndarray inputs
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        # Shuffled folds with a fixed seed so the result is reproducible
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=123)
        errors = []

        for train_index, test_index in kf.split(X_train):
            X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
            y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

            # A fresh model per fold with the requested bandwidth and kernel
            model = model_class(tau=tau, kernel_method=kernel_method)
            y_pred_fold = model.fit_and_predict(X_train_fold, y_train_fold, X_test_fold)

            errors.append(mse(y_test_fold, y_pred_fold))

        return np.mean(errors)

In [7]:
# 10-fold cross-validation of the Gaussian-kernel model on the scaled
# training data; the mean of the per-fold MSEs is reported
mean_mse = Lowess.cross_validate(
    X_train_scaled,
    y_train,
    Lowess,
    tau=.08,
    kernel_method='Gaussian',
    n_splits=10,
)
print(f'10-Fold Cross-Validated Mean Squared Error: {mean_mse:.4f}')

10-Fold Cross-Validated Mean Squared Error: 16.6525
