# CSC 466 Lab 0
By Lucas Summers

lsumme01@calpoly.edu

In [None]:
import pandas as pd
import numpy as np
import math
import time

In [None]:
from sklearn.datasets import load_iris

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are labelled with the feature names supplied by the dataset.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Show the first five rows as a quick sanity check.
print(iris_df.head(n=5))

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data as a DataFrame and discard the two
# geographic coordinate columns.
cali = fetch_california_housing(as_frame=True)
cali_df = cali.frame.drop(columns=['Longitude', 'Latitude'])

# Show the first five rows as a quick sanity check.
print(cali_df.head(n=5))

In [None]:
def run_metric(df, metric_handler, metric_type, extremes, metric_func=None):
    """Time a pairwise-metric computation and print its extreme pairs.

    Args:
        df: Data handed to ``metric_handler`` unchanged.
        metric_handler: Callable that produces the pairwise results. It is
            called as ``metric_handler(df, metric_func)`` when ``metric_func``
            is given and as ``metric_handler(df)`` otherwise.
        metric_type: Passed through to ``extremes`` as its second argument.
        extremes: Callable ``extremes(pairs, metric_type)`` returning a
            ``(most_similar, most_dissimilar)`` pair.
        metric_func: Optional per-pair metric forwarded to ``metric_handler``.

    Returns:
        None. The two extreme pairs and the elapsed time are printed.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start_t = time.perf_counter()

    # Compare against None explicitly so a callable object that happens to
    # evaluate as falsy is still forwarded to the handler.
    if metric_func is not None:
        pairs = metric_handler(df, metric_func)
    else:
        pairs = metric_handler(df)

    most_similar, most_dissimilar = extremes(pairs, metric_type)

    # The timed region covers both the metric computation and the search
    # for the extreme pairs.
    end_t = time.perf_counter()

    print("Most Similar Pair (Index, Index, Distance):", most_similar)
    print("Most Dissimilar Pair (Index, Index, Distance):", most_dissimilar)
    print(f"Execution Time: {end_t - start_t:.4f} seconds\n")

# Version 1: Pure-Python pairwise loops

In [None]:
def compute_pairwise(df, metric):
    """Apply ``metric`` to every unordered pair of rows in ``df``.

    Args:
        df: Object supporting ``len()`` and positional row access through
            ``.iloc[i]`` (e.g. a pandas DataFrame).
        metric: Callable taking two rows and returning their metric value.

    Returns:
        A list of ``(i, j, value)`` tuples with ``i < j``, ordered by ``i``
        and then ``j``, where ``i`` and ``j`` are positional row indices.
        The list is empty when ``df`` has fewer than two rows.
    """
    # Extract each row exactly once. Calling ``df.iloc`` inside the inner loop
    # would repeat the extraction for every pair; here the same row objects
    # are reused across all metric calls.
    rows = [df.iloc[i] for i in range(len(df))]

    results = []
    for i, row_i in enumerate(rows):
        for j in range(i + 1, len(rows)):
            results.append((i, j, metric(row_i, rows[j])))
    return results

In [None]:
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between ``x`` and ``y``.

    The inputs are walked in lockstep; if their lengths differ, only the
    shared prefix contributes to the total.
    """
    gaps = [abs(left - right) for left, right in zip(x, y)]
    return sum(gaps)

In [None]:
def cosine_similarity(x, y):
    """Return the cosine similarity between two numeric vectors.

    Args:
        x: Iterable of numbers (one-shot iterators are accepted).
        y: Iterable of numbers (one-shot iterators are accepted).

    Returns:
        ``dot(x, y) / (|x| * |y|)`` as a float, or ``0.0`` when either
        vector has zero magnitude (the ratio is undefined there).

    Note:
        If the inputs differ in length, the dot product only covers the
        shared prefix while each magnitude uses all of its vector's elements.
    """
    # Each vector is traversed twice (dot product, then magnitude), so
    # materialise them first; otherwise a one-shot iterator would be exhausted
    # after the dot product and its magnitude would wrongly come out as zero.
    x = tuple(x)
    y = tuple(y)

    dot = sum(a * b for a, b in zip(x, y))
    mag_x = math.sqrt(sum(a ** 2 for a in x))
    mag_y = math.sqrt(sum(b ** 2 for b in y))

    if mag_x == 0 or mag_y == 0:
        # Return a float so every code path yields the same type.
        return 0.0
    return dot / (mag_x * mag_y)

In [None]:
def find_minmax(pairs, metric_type):
    """Pick the most similar and most dissimilar records from ``pairs``.

    Args:
        pairs: Sequence of records whose element at index 2 is the metric
            value (e.g. ``(i, j, value)`` tuples).
        metric_type: ``"distance"`` (smaller value means more similar) or
            ``"similarity"`` (larger value means more similar).

    Returns:
        A ``(most_similar, most_dissimilar)`` tuple of records from ``pairs``.

    Raises:
        ValueError: If ``metric_type`` is not one of the two supported
            values, or (from ``min``/``max``) if ``pairs`` is empty.
    """
    # Reject unknown types up front; otherwise neither branch would run and
    # the function would fail with an opaque unbound-variable error.
    if metric_type not in ("distance", "similarity"):
        raise ValueError(
            f"metric_type must be 'distance' or 'similarity', got {metric_type!r}"
        )

    def value_of(pair):
        return pair[2]

    lowest = min(pairs, key=value_of)
    highest = max(pairs, key=value_of)

    if metric_type == "distance":
        return lowest, highest
    return highest, lowest

In [None]:
# Version 1 timings: for each dataset, run the Manhattan distance first and
# the cosine similarity second using the pure-Python pairwise loop.
for dataset in (iris_df, cali_df):
    run_metric(dataset, compute_pairwise, "distance", find_minmax, manhattan_distance)
    run_metric(dataset, compute_pairwise, "similarity", find_minmax, cosine_similarity)

# Version 2: Vectorized NumPy

In [None]:
def find_minmax_np(matrix, metric_type):
    """Find the most similar and most dissimilar off-diagonal entries.

    Args:
        matrix: Square 2-D array of pairwise metric values. It is not
            modified; the search runs on a private copy.
        metric_type: ``"distance"`` (smaller value means more similar) or
            ``"similarity"`` (larger value means more similar).

    Returns:
        A ``(most_similar, most_dissimilar)`` tuple, each element being
        ``(row, col, value)``.

    Raises:
        ValueError: If ``metric_type`` is not one of the two supported
            values. NumPy's nan-aware argmin/argmax also raise ValueError
            when no non-NaN entry remains (e.g. a 1x1 matrix).
    """
    # Fail loudly on an unknown type instead of silently returning None.
    if metric_type not in ("distance", "similarity"):
        raise ValueError(
            f"metric_type must be 'distance' or 'similarity', got {metric_type!r}"
        )

    # Work on a copy so the caller's matrix keeps its diagonal.
    work = np.array(matrix)
    # NaN cannot be stored in an integer array, so promote non-inexact dtypes
    # to float before masking; float/complex inputs keep their dtype.
    if not np.issubdtype(work.dtype, np.inexact):
        work = work.astype(float)

    # Mask self-comparisons so they are ignored by the nan-aware searches.
    np.fill_diagonal(work, np.nan)

    low_idx = np.unravel_index(np.nanargmin(work), work.shape)
    high_idx = np.unravel_index(np.nanargmax(work), work.shape)
    lowest = (*low_idx, work[low_idx])
    highest = (*high_idx, work[high_idx])

    if metric_type == "distance":
        return lowest, highest
    return highest, lowest

In [None]:
def manhattan_distance_np(df):
    """Return the matrix of Manhattan distances between all rows of ``df``.

    ``df`` must provide ``to_numpy()`` returning a 2-D array. Entry
    ``[i, j]`` of the result is the L1 distance between rows ``i`` and ``j``.
    Note that the computation materialises an intermediate array with one
    entry per (row, row, column) triple.
    """
    values = df.to_numpy()
    # Broadcasting an (n, 1, d) view against a (1, n, d) view yields the
    # element-wise difference for every pair of rows at once.
    row_gaps = values[:, None, :] - values[None, :, :]
    return np.abs(row_gaps).sum(axis=2)

In [None]:
def cosine_similarity_np(df):
    """Return the matrix of cosine similarities between all rows of ``df``.

    ``df`` must provide ``to_numpy()`` returning a 2-D array. Entry
    ``[i, j]`` is the dot product of rows ``i`` and ``j`` divided by the
    product of their Euclidean norms. Any non-finite result (for example
    from a zero-norm row) is replaced with 0.
    """
    values = df.to_numpy()
    gram = values.dot(values.T)
    norms = np.linalg.norm(values, axis=1)
    # Product of norms for every (i, j) pair.
    scale = norms[:, None] * norms[None, :]

    # Division by a zero norm is expected here; silence the warnings and
    # clean up the resulting inf/nan entries afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = gram / scale

    not_finite = np.logical_not(np.isfinite(sim))
    sim[not_finite] = 0
    return sim

In [None]:
# Version 2 timings: for each dataset, run the NumPy Manhattan distance first
# and the NumPy cosine similarity second.
for dataset in (iris_df, cali_df):
    run_metric(dataset, manhattan_distance_np, "distance", find_minmax_np)
    run_metric(dataset, cosine_similarity_np, "similarity", find_minmax_np)

# Version 3: scikit-learn

In [None]:
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [None]:
def manhattan_distance_sk(df):
    """Return pairwise Manhattan distances between the rows of ``df``.

    Thin wrapper that delegates to scikit-learn's ``pairwise_distances``
    with the ``manhattan`` metric.
    """
    distance_matrix = pairwise_distances(df, metric="manhattan")
    return distance_matrix

In [None]:
def cosine_similarity_sk(df):
    """Return pairwise cosine similarities between the rows of ``df``.

    Thin wrapper that delegates to scikit-learn's
    ``sklearn.metrics.pairwise.cosine_similarity``.
    """
    # Import under a private alias: the bare name ``cosine_similarity`` is
    # also defined in this file as a hand-written two-vector function, so a
    # global lookup would resolve to whichever definition ran most recently.
    from sklearn.metrics.pairwise import cosine_similarity as _sk_cosine_similarity

    return _sk_cosine_similarity(df)

In [None]:
# Version 3 timings: for each dataset, run the scikit-learn Manhattan distance
# first and the scikit-learn cosine similarity second.
for dataset in (iris_df, cali_df):
    run_metric(dataset, manhattan_distance_sk, "distance", find_minmax_np)
    run_metric(dataset, cosine_similarity_sk, "similarity", find_minmax_np)