# CSC 466 Lab 1

By Lucas Summers

lsumme01@calpoly.edu

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.datasets import load_iris

# Load iris with pandas containers and keep the combined frame; its 'target'
# column is used as the label in the classification experiments.
iris = load_iris(as_frame=True)
iris_df = iris["frame"]
print(iris_df.head())

In [None]:
from sklearn.datasets import fetch_california_housing

# Load California housing with pandas containers, discard the two coordinate
# columns, and keep only the first 5000 rows.
cali = fetch_california_housing(as_frame=True)
cali_df = cali.frame.drop(columns=['Longitude', 'Latitude']).head(5000)
print(cali_df.head())

In [None]:
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between ``x`` and ``y``.

    Args:
        x: Either a 2-D array of row vectors or a single 1-D vector.
        y: A vector broadcastable against ``x``.

    Returns:
        One distance per row when ``x - y`` is 2-D or higher (summed over
        axis 1), or a single scalar distance when ``x - y`` is 1-D or 0-D.
    """
    diff = np.abs(x - y)
    if np.ndim(diff) < 2:
        # A single pair of vectors has no axis 1; collapse to one scalar.
        return np.sum(diff)
    return np.sum(diff, axis=1)

In [None]:
def cosine_similarity(x, y):
    """Return the cosine similarity between ``x`` and the vector ``y``.

    Args:
        x: Either a 2-D array of row vectors or a single 1-D vector.
        y: A 1-D vector with the same number of features as ``x``.

    Returns:
        One similarity per row of ``x`` when ``x`` is 2-D, or a 0-d array
        when ``x`` is 1-D. Any non-finite result (for example when a vector
        has zero magnitude) is reported as 0.
    """
    dot = np.dot(x, y)
    # Row-wise magnitudes for a matrix, a single magnitude for one vector.
    if np.ndim(x) > 1:
        xmag = np.linalg.norm(x, axis=1)
    else:
        xmag = np.linalg.norm(x)
    ymag = np.linalg.norm(y)

    # Zero-magnitude vectors cause 0/0 or x/0; silence the warnings and
    # map the resulting nan/inf values to 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = dot / (xmag * ymag)
    # np.where also works on scalar results, unlike boolean-mask assignment.
    return np.where(np.isfinite(sim), sim, 0.0)

In [None]:
from collections import Counter
def run_knn(train, labels, d, k, metric, model):
    """Predict the label of a single point ``d`` with k-nearest neighbours.

    Args:
        train: 2-D array of training feature rows.
        labels: Training labels, positionally aligned with ``train``
            (a pandas Series or any array-like).
        d: 1-D feature vector of the point to predict.
        k: Number of neighbours to use; must be at least 1.
        metric: ``'cosine'`` (most similar rows are nearest) or
            ``'manhattan'`` (smallest distances are nearest).
        model: ``'classify'`` for a majority vote or ``'regression'`` for
            the mean of the neighbours' labels.

    Returns:
        The most common neighbour label (ties go to the label seen first,
        nearest neighbour first) or the mean neighbour label.

    Raises:
        ValueError: If ``k`` is below 1, or ``metric`` / ``model`` is not
            one of the supported names.
    """
    # k == 0 would otherwise select every row for cosine (``[-0:]``) and
    # no rows at all for manhattan.
    if k < 1:
        raise ValueError("k must be a positive integer")

    if metric == 'cosine':
        scores = cosine_similarity(train, d)
        # Largest similarities last in the sort; reverse so nearest is first.
        knearest = np.argsort(scores)[-k:][::-1]
    elif metric == 'manhattan':
        scores = manhattan_distance(train, d)
        knearest = np.argsort(scores)[:k]
    else:
        raise ValueError("Metric must be 'cosine' or 'manhattan'")

    # Positional lookup that works for a Series and for plain arrays/lists.
    neighbor_labels = np.asarray(labels)[knearest]

    if model == 'classify':
        return Counter(neighbor_labels).most_common(1)[0][0]
    elif model == 'regression':
        # Average over the neighbours actually found, which can be fewer
        # than k when k exceeds the number of training rows.
        return neighbor_labels.mean()
    else:
        raise ValueError("Model must be 'classify' or 'regression'")

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import pairwise_distances
def run_knn_sk(train, labels, d, k, metric, model):
    """Predict the label of a single point ``d`` with scikit-learn's KNN.

    Builds a ``KNeighborsClassifier`` (``model='classify'``) or a
    ``KNeighborsRegressor`` (``model='regression'``) with ``k`` neighbours
    and the given ``metric``, fits it on ``train`` / ``labels``, and returns
    the prediction for ``d``.

    Raises:
        ValueError: If ``model`` is neither ``'classify'`` nor
            ``'regression'``.
    """
    if model not in ('classify', 'regression'):
        raise ValueError("Model must be 'classify' or 'regression'")

    if model == 'classify':
        estimator = KNeighborsClassifier(n_neighbors=k, metric=metric)
    else:
        estimator = KNeighborsRegressor(n_neighbors=k, metric=metric)

    # predict() expects a 2-D batch, so present the point as a single row.
    query = d.reshape(1, -1)
    estimator.fit(train, labels)
    return estimator.predict(query)[0]

In [None]:
def calc_accuracy(test, pred):
    """Return the fraction of positions where ``pred`` equals ``test``."""
    matches = (test == pred)
    n_correct = np.sum(matches)
    return n_correct / len(test)

In [None]:
def calc_sse(test, pred):
    """Return the sum of squared differences between ``test`` and ``pred``."""
    residuals = test - pred
    squared = residuals ** 2
    return np.sum(squared)

In [None]:
def calc_mse(test, pred):
    """Return the mean squared error: the SSE divided by ``len(test)``."""
    total_squared_error = calc_sse(test, pred)
    n_samples = len(test)
    return total_squared_error / n_samples

In [None]:
def calc_mae(test, pred):
    """Return the mean absolute difference between ``test`` and ``pred``."""
    absolute_errors = np.abs(test - pred)
    return np.mean(absolute_errors)

In [None]:
def evaluate_knn(train_data, train_labels, test_data, test_labels, k, knn, metric='cosine', model='classify'):
    """Predict every test point with ``knn`` and score the predictions.

    Args:
        train_data: Training feature rows, passed through to ``knn``.
        train_labels: Training labels, passed through to ``knn``.
        test_data: Iterable of test feature vectors; ``knn`` is called once
            per element.
        test_labels: True labels, positionally aligned with ``test_data``.
        k: Number of neighbours, passed through to ``knn``.
        knn: Callable ``knn(train_data, train_labels, d, k, metric, model)``
            returning the prediction for one point ``d``.
        metric: Metric name passed through to ``knn``.
        model: ``'classify'`` or ``'regression'``; selects the metrics
            reported.

    Returns:
        ``{'accuracy': ...}`` for classification, or
        ``{'sse': ..., 'mse': ..., 'mae': ...}`` for regression.

    Raises:
        ValueError: If ``model`` is neither ``'classify'`` nor
            ``'regression'``.
    """
    # Reject an unsupported model before doing any prediction work.
    if model not in ('classify', 'regression'):
        raise ValueError("Model must be 'classify' or 'regression'")

    # Let NumPy infer the dtype so the metrics run on native numeric arrays
    # rather than on an object array.
    pred = np.array([
        knn(train_data, train_labels, d, k, metric, model) for d in test_data
    ])

    if model == 'classify':
        return {'accuracy': calc_accuracy(test_labels, pred)}

    return {
        'sse': calc_sse(test_labels, pred),
        'mse': calc_mse(test_labels, pred),
        'mae': calc_mae(test_labels, pred),
    }

In [None]:
def grid_search(train_data, train_labels, test_data, test_labels, knn, model='classify', max_k=10):
    """Evaluate ``knn`` for k = 1..max_k with both metrics and report the best.

    For each k, the cosine configuration is evaluated first, then the
    manhattan one. The best configuration is printed: highest accuracy for
    classification, lowest SSE for regression, with ties going to the
    smaller k and then to the configuration evaluated first.

    Args:
        train_data, train_labels, test_data, test_labels: Passed through to
            ``evaluate_knn``.
        knn: Single-point predictor passed through to ``evaluate_knn``.
        model: ``'classify'`` or ``'regression'``.
        max_k: Largest k to try; values below 1 evaluate nothing.

    Returns:
        A list with one dict per (k, metric) pair holding ``'k'``,
        ``'metric'`` and the scores returned by ``evaluate_knn``. The list
        is empty (and nothing is printed) when ``max_k`` is below 1.

    Raises:
        ValueError: If ``model`` is neither ``'classify'`` nor
            ``'regression'``.
    """
    ranking_keys = {
        # Accuracy descending, then k ascending.
        'classify': lambda r: (-r['accuracy'], r['k']),
        # SSE ascending, then k ascending.
        'regression': lambda r: (r['sse'], r['k']),
    }
    if model not in ranking_keys:
        raise ValueError("Model must be 'classify' or 'regression'")

    results = []
    for k in range(1, max_k + 1):
        for metric in ('cosine', 'manhattan'):
            scores = evaluate_knn(
                train_data, train_labels, test_data, test_labels,
                k, knn, metric=metric, model=model,
            )
            results.append({'k': k, 'metric': metric, **scores})

    # With nothing evaluated there is no best configuration to report.
    if results:
        best = min(results, key=ranking_keys[model])
        print(f"Best KNN {model} model:")
        for name, val in best.items():
            print(f"{name}: {val}")

    return results

In [None]:
def split_train_test(df, target, test_size=0.2, random_state=0):
    """Shuffle ``df`` and split it into train/test features and labels.

    Args:
        df: DataFrame holding the feature columns and the ``target`` column.
        target: Name of the label column.
        test_size: Fraction of rows assigned to the test split, in [0, 1].
        random_state: Seed for the shuffle.

    Returns:
        ``(train_data, test_data, train_labels, test_labels)`` where the
        data items are float NumPy arrays and the label items are pandas
        Series with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    # A private RandomState produces the same permutation as seeding the
    # global NumPy RNG, without overwriting the caller's global random state.
    rng = np.random.RandomState(random_state)

    indices = np.arange(len(df))
    rng.shuffle(indices)
    split_idx = int(len(df) * (1 - test_size))

    train_df = df.iloc[indices[:split_idx]]
    test_df = df.iloc[indices[split_idx:]]

    # Labels stay as Series; reset the index so they align positionally
    # with the rows of the feature arrays.
    train_labels = train_df[target].reset_index(drop=True)
    test_labels = test_df[target].reset_index(drop=True)

    train_data = train_df.drop(columns=target).to_numpy(dtype=float)
    test_data = test_df.drop(columns=target).to_numpy(dtype=float)

    return train_data, test_data, train_labels, test_labels

In [None]:
import matplotlib.pyplot as plt

def plot_results(results, title, model='classify'):
    """Plot the grid-search score against k, one line per distance metric.

    Args:
        results: List of dicts with ``'k'``, ``'metric'`` and either
            ``'accuracy'`` (classification) or ``'sse'`` (regression).
        title: Title shown above the plot.
        model: ``'classify'`` plots accuracy, ``'regression'`` plots SSE;
            any other value draws no lines but still shows the figure.
    """
    by_metric = {
        name: [r for r in results if r['metric'] == name]
        for name in ('cosine', 'manhattan')
    }
    # The x axis is taken from the cosine entries.
    k_values = [r['k'] for r in by_metric['cosine']]

    if model == 'classify':
        score_key, y_label = 'accuracy', 'Accuracy'
    elif model == 'regression':
        score_key, y_label = 'sse', 'Sum Squared Error (SSE)'
    else:
        score_key, y_label = None, None

    if score_key is not None:
        cosine_scores = [r[score_key] for r in by_metric['cosine']]
        manhattan_scores = [r[score_key] for r in by_metric['manhattan']]
        plt.plot(k_values, cosine_scores, label='Cosine', marker='o')
        plt.plot(k_values, manhattan_scores, label='Manhattan', marker='s')
        plt.ylabel(y_label)

    plt.xlabel('k')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

# Custom KNN Classification

In [None]:
# Hold out 20% of iris with a fixed seed, sweep k = 1..10 over both metrics
# with the hand-written KNN, and plot accuracy against k.
train_data, test_data, train_labels, test_labels = split_train_test(
    iris_df, target='target', test_size=0.2, random_state=0,
)
results = grid_search(
    train_data, train_labels, test_data, test_labels,
    knn=run_knn, model='classify', max_k=10,
)
plot_results(results, title='Custom KNN Classify Performance', model='classify')

# scikit-learn KNN Classification

In [None]:
# Same iris split and k sweep as the custom run, but predicting with the
# scikit-learn-backed KNN so the two implementations can be compared.
train_data, test_data, train_labels, test_labels = split_train_test(
    iris_df, target='target', test_size=0.2, random_state=0,
)
results = grid_search(
    train_data, train_labels, test_data, test_labels,
    knn=run_knn_sk, model='classify', max_k=10,
)
plot_results(results, title='SK Learn KNN Classify Performance', model='classify')

# Custom KNN Regression

In [None]:
# Hold out 20% of the housing rows with a fixed seed, sweep k = 1..10 over
# both metrics with the hand-written KNN, and plot SSE against k.
train_data, test_data, train_labels, test_labels = split_train_test(
    cali_df, target='MedHouseVal', test_size=0.2, random_state=0,
)
results = grid_search(
    train_data, train_labels, test_data, test_labels,
    knn=run_knn, model='regression', max_k=10,
)
plot_results(results, title='Custom KNN Regression Performance', model='regression')

# scikit-learn KNN Regression

In [None]:
# Same housing split and k sweep as the custom run, but predicting with the
# scikit-learn-backed KNN so the two implementations can be compared.
train_data, test_data, train_labels, test_labels = split_train_test(
    cali_df, target='MedHouseVal', test_size=0.2, random_state=0,
)
results = grid_search(
    train_data, train_labels, test_data, test_labels,
    knn=run_knn_sk, model='regression', max_k=10,
)
plot_results(results, title='SK Learn Regression Performance', model='regression')