In [5]:
import pandas as pd
import numpy as np
import random

In [6]:
def sq_w_euc_dist(A, B, w = None):
    """
    Squared Euclidean distance between A and B, optionally weighted.

    Inputs:
        A, B - numeric vectors of matching length that support elementwise
               arithmetic (numpy arrays, pandas Series)
        w - optional per-dimension weights (list, numpy array or Series);
            None or an empty sequence selects the unweighted form

    Output:
        sum(w * (A - B) ** 2) when weights are given, otherwise
        mean((A - B) ** 2), i.e. the same as uniform weights that sum to 1
    """
    sq_diff = (A - B) ** 2
    # Check for "no weights" explicitly: `if w:` raises ValueError for a
    # numpy array / Series holding more than one element.
    if w is None or np.size(w) == 0:
        return np.mean(sq_diff)
    return np.sum(sq_diff * w)

In [7]:
def cos_dist(A, B):
    """
    Cosine distance between vectors A and B: 1 - cos(angle between them).

    Inputs:
        A, B - numeric vectors of matching length

    Output:
        0 when the vectors point the same way, 1 when they are orthogonal,
        2 when they point in opposite directions; nan when either vector
        has zero norm (the angle is undefined)
    """
    norm_prod = np.linalg.norm(A) * np.linalg.norm(B)
    if norm_prod == 0:
        # 0 / 0 would also give nan, but with a RuntimeWarning
        return np.nan
    # The raw ratio is a similarity (larger = closer); subtracting it from 1
    # turns it into a distance, so ascending sorts put the closest first.
    return 1 - np.dot(A, B) / norm_prod

In [8]:
def w_randomizer(n):
    """
    Draw n random weights and rescale them so they sum to 1.

    Inputs:
        n - number of weights to generate

    Output:
        numpy array of n values, each a random.random() draw divided by
        the total of all draws
    """
    draws = np.array([random.random() for _ in range(n)])
    return draws / np.sum(draws)

In [30]:
def find_closest_euc(X, y, n = 1, w = None):
    """
    Rank the rows of X by sq_w_euc_dist to y and report the nearest ones.

    Inputs:
        X - dataframe whose rows are the candidates
        y - target row every candidate is compared against
        n - how many of the closest rows to report
        w - weights forwarded to sq_w_euc_dist (None for unweighted)

    Output:
        list of index labels of X belonging to the n smallest distances,
        nearest first
    """
    # Pair every row label with its distance to the target.
    scored = [
        (label, sq_w_euc_dist(candidate, y, w))
        for label, candidate in X.iterrows()
    ]
    nearest = np.argsort([score for _, score in scored])[:n]
    return [scored[pos][0] for pos in nearest]

In [6]:
def find_approx_closest_euc(X, y, comp, vals, n = 1, w = None):
    """
    Nearest-row search by sq_w_euc_dist that first discards rows of X
    lying too far from y along selected columns.

    Inputs:
        X - dataframe whose rows are the candidates
        y - target row every candidate is compared against
        comp - columns used for the pre-filter
        vals - cutoff(s) for the pre-filter: a row is discarded when
               |X[comp] - y[comp]| >= vals in any of the comp columns
        n - how many of the closest rows to report
        w - weights forwarded to sq_w_euc_dist (None for unweighted)

    Output:
        list of index labels of the n surviving rows with the smallest
        distance, nearest first
    """
    # Pre-filter: flag rows whose absolute gap reaches the cutoff anywhere.
    gap = abs(X[comp] - y[comp])
    too_far = (gap - vals) >= 0
    keep = ~too_far.any(axis = 1)
    survivors = X.loc[keep]

    # Full distance is only computed for the rows that passed the filter.
    scored = [
        (label, sq_w_euc_dist(candidate, y, w))
        for label, candidate in survivors.iterrows()
    ]
    nearest = np.argsort([score for _, score in scored])[:n]
    return [scored[pos][0] for pos in nearest]

In [32]:
def find_closest_cos(X, y, n = 1):
    """
    Rank the rows of X by their cos_dist value against y.

    Inputs:
        X - dataframe whose rows are the candidates
        y - target row every candidate is compared against
        n - how many rows to report

    Output:
        list of index labels of X belonging to the n smallest cos_dist
        values, smallest first

    NOTE(review): the ascending order assumes cos_dist behaves as a
    distance (smaller = closer) - confirm against cos_dist.
    """
    scored = [
        (label, cos_dist(candidate, y))
        for label, candidate in X.iterrows()
    ]
    nearest = np.argsort([score for _, score in scored])[:n]
    return [scored[pos][0] for pos in nearest]