**CS 4774: Machine Learning Final Project - KMeans Approach**

## Data Preprocessing

We'll utilize pandas, numpy, and sklearn to preprocess our data, imputing numerical features and applying one-hot encoding to categorical features. 

In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# training data:
train_raw = pd.read_csv('data/train.csv')
test_raw = pd.read_csv('data/test.csv')

print("Train data shape: ", train_raw.shape)
print("Test data shape: ", test_raw.shape)
print(train_raw.head())

# get features and targets:
X_train = train_raw.drop("SalePrice", axis=1)
y_train = train_raw["SalePrice"]
X_test = test_raw.copy()  # can just copy as test.csv doesn't have the target

# separate numerical and categorical features:
numFeatures = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
catFeatures = X_train.select_dtypes(include=['object']).columns.tolist()

# utilize pipelines for preprocessing:

numPipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())])

catPipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# combine workflows:
preprocessor = ColumnTransformer([('numerical', numPipeline, numFeatures),
                                  ('categorical', catPipeline, catFeatures)])

# now, fit and transform data:

# use preprocessor to process train and test data:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# convert to pd dataframes:

# need to concatenate processed numerical and categorical features:
numFeature_names = numFeatures
catFeature_names = preprocessor.named_transformers_['categorical'].named_steps['onehot'].get_feature_names_out(catFeatures)

# concatenate
totalFeatures = np.concatenate((numFeature_names, catFeature_names))

# convert to dataframes:
X_train_processed = pd.DataFrame(X_train_processed, columns=totalFeatures)
X_test_processed = pd.DataFrame(X_test_processed, columns=totalFeatures)

print("Processed Train dataset: ", X_train_processed.shape)
print("Processed Test dataset: ", X_test_processed.shape)
print(X_train_processed.head())

# write out preprocessed data:
X_train_processed.to_csv('data/train_processed.csv', index=False)
X_test_processed.to_csv('data/test_processed.csv', index=False)

Train data shape:  (1460, 81)
Test data shape:  (1459, 80)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold 

## Implementation of K Nearest Neighbors

In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

class KNN:
    def __init__(self, k, distance):
        self.k = k
        self.distance = distance
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.pca = None
        
    # fit
    def fit(self, X, y, pca_components=None):
        # fit with PCA (if chosen):
        if pca_components is not None:
            self.pca = PCA(n_components=pca_components)
            self.X_train = self.pca.fit_transform(X)
            
        # normal fit:
        else:
            self.X_train = X if isinstance(X, list) else X
        self.y_train = y if isinstance(y, list) else y
        
        # convert to numpy arrays:
        self.X_train = np.array(self.X_train)
        self.y_train = np.array(self.y_train)
        
    # helper method to calculate distances:
    def calculateDistance(self, p1, p2):
        p1 = np.array(p1, dtype=float)
        p2 = np.array(p2, dtype=float)
        if self.distance.lower() == 'manhattan':
            return np.sum(np.abs(p1 - p2))
        elif self.distance.lower() == 'euclidian':
            return np.sqrt(np.sum(p1 - p2) ** 2) 
        
        else:
            return None  # invalid metric
         
    # predict
    def predict(self, X):
        if self.pca:
            X = self.pca.transform(X)
        else:
            X = X.values if isinstance(X, pd.DataFrame) else X
            
        predictions = []
        
        for x in X:
            # compute distance:
            distances = [self.calculateDistance(x, x_train) for x_train in self.X_train]
            
            # get knns
            knn_indices = np.argsort(distances)[:self.k]
            knn_distances = [distances[i] for i in knn_indices]
            knn_prices = [self.y_train[i] for i in knn_indices]
            
            # calculate prediction using mean price of knns:
            prediction = np.mean(knn_prices)
            
            # append prediction:
            predictions.append(prediction)
            
        return np.array(predictions)
    
    # calculate RMSE:
    def RMSE(self, X, y):
        y_pred = self.predict(X)
        return np.sqrt(mean_squared_error(y, y_pred))


## Application of Model:

In [26]:
from sklearn.model_selection import train_test_split

# load data:
X = pd.read_csv('data/train_processed.csv')
y = pd.read_csv('data/train.csv')['SalePrice']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

bestRMSE_stats = [None, None, None]  # format: RMSE, k, pca

# find the best choice of k:
for k in [2, 3, 4, 5, 6, 7, 8, 9, 10]:
    for pca in [2, 3, 5, 10, 15, 25, 30]:
        knn = KNN(k=k, distance='euclidian')
        knn.fit(X_train, y_train, pca_components=pca)
        RMSE = knn.RMSE(X_train, y_train)
        print(f'k = {k}, pca = {pca} RMSE = {RMSE:.2f}')
        
        if bestRMSE_stats[0] is None:
            bestRMSE_stats = [RMSE, k, pca]
        elif bestRMSE_stats[0] > RMSE:
            bestRMSE_stats = [RMSE, k, pca]

print(f'Best Configuration: k: {bestRMSE_stats[1]} | pca: {bestRMSE_stats[2]} | RMSE: {bestRMSE_stats[0]}')

# make model using optimized configuration
best_model = KNN(k=bestRMSE_stats[1], distance='euclidian')
best_model.fit(X_train, y_train, pca_components=bestRMSE_stats[2])

# run on test set:
X_test = pd.read_csv('data/test_processed.csv')
test_predictions = best_model.predict(X_test)

# write out predictions:
pd.DataFrame(test_predictions, columns=["PredictedPrice"]).to_csv('data/test_predictions.csv', index=False)

k = 2, pca = 2 RMSE = 35699.39
k = 2, pca = 3 RMSE = 40118.34
k = 2, pca = 5 RMSE = 47691.48
k = 2, pca = 10 RMSE = 61460.08
k = 2, pca = 15 RMSE = 55613.78
k = 2, pca = 25 RMSE = 50532.88
k = 2, pca = 30 RMSE = 66349.73
k = 3, pca = 2 RMSE = 41738.26
k = 3, pca = 3 RMSE = 44421.50
k = 3, pca = 5 RMSE = 53819.97
k = 3, pca = 10 RMSE = 59735.02
k = 3, pca = 15 RMSE = 59545.56
k = 3, pca = 25 RMSE = 54666.19
k = 3, pca = 30 RMSE = 65539.69
k = 4, pca = 2 RMSE = 44773.15
k = 4, pca = 3 RMSE = 47179.56
k = 4, pca = 5 RMSE = 53849.89
k = 4, pca = 10 RMSE = 58599.23
k = 4, pca = 15 RMSE = 58336.10
k = 4, pca = 25 RMSE = 58123.12
k = 4, pca = 30 RMSE = 65869.66
k = 5, pca = 2 RMSE = 46417.51
k = 5, pca = 3 RMSE = 47915.27
k = 5, pca = 5 RMSE = 53737.37
k = 5, pca = 10 RMSE = 59294.28
k = 5, pca = 15 RMSE = 58834.85
k = 5, pca = 25 RMSE = 60628.17
k = 5, pca = 30 RMSE = 66384.28
k = 6, pca = 2 RMSE = 47334.39
k = 6, pca = 3 RMSE = 49150.55
k = 6, pca = 5 RMSE = 56003.79
k = 6, pca = 10 RMSE = 