In [7]:
import pandas as pd
import numpy as np
import random

In [8]:
def sq_w_euc_dist(A, B, w = None):
    """
    Squared euclidean-style distance between two vectors.

    Inputs:
        A, B - numeric vectors that support elementwise subtraction
        w - optional per-dimension weights; None means unweighted

    Output:
        mean of the squared differences when w is None, otherwise the
        sum of the squared differences multiplied elementwise by w
    """
    sq_diff = (A - B) ** 2
    return np.mean(sq_diff) if w is None else np.sum(sq_diff * w)

In [9]:
def cos_dist(A, B):
    """
    Cosine distance between two vectors.

    Inputs:
        A, B - 1-D numeric vectors of equal length

    Output:
        1 - cosine similarity: 0 when the vectors point the same way,
        1 when they are orthogonal, 2 when they point in opposite
        directions. Not defined when either vector has zero norm
        (the division below is 0 / 0).
    """
    similarity = np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))
    # Convert similarity (larger = closer) into a distance (smaller = closer)
    # so that an ascending sort puts the most similar vectors first.
    return 1 - similarity

In [10]:
def w_randomizer(n):
    """
    Draw n random weights and normalize them by their sum.

    Inputs:
        n - number of weights to generate

    Output:
        numpy array of n values, each a random.random() draw divided by
        the total of all draws
    """
    draws = np.array([random.random() for _ in range(n)])
    return draws / draws.sum()

In [11]:
def find_closest_euc(X, y, n = 1, w = None):
    """
    Rank the rows of X by sq_w_euc_dist to y and keep the nearest ones.

    Inputs:
        X - dataframe of rows to compare between
        y - target row to compare to
        n - number of closest rows
        w - weights for the euclidean distance function (None = unweighted)

    Output:
        list of the index labels of X for the n rows with the smallest
        distance to y, nearest first
    """
    scored = [(label, sq_w_euc_dist(candidate, y, w)) for label, candidate in X.iterrows()]
    nearest = np.argsort([score for _, score in scored])[:n]
    return [scored[pos][0] for pos in nearest]

In [None]:
def find_closest_cos(X, y, n = 1):
    """
    Rank the rows of X by cos_dist to y and keep the lowest-scoring ones.

    Inputs:
        X - dataframe of rows to compare between
        y - target row to compare to
        n - number of rows to return

    Output:
        list of the index labels of X for the n rows with the smallest
        cos_dist(row, y) values, smallest first
    """
    # Ranking is ascending, so this relies on cos_dist behaving as a
    # distance (smaller value = closer row).
    scored = [(label, cos_dist(candidate, y)) for label, candidate in X.iterrows()]
    lowest = np.argsort([score for _, score in scored])[:n]
    return [scored[pos][0] for pos in lowest]

In [17]:
def find_approx_closest(X, y, comp, vals, n = 1, num_weights = 0):
    """
    Find the rows of X nearest to y under several metrics, pre-filtering
    the euclidean candidates on a subset of dimensions.

    Inputs:
        X - dataframe of rows to compare between
        y - target row to compare to
        comp - list of column labels to pre-compare along
        vals - absolute-difference cutoff(s) for the comp columns; a row is
               dropped from the euclidean searches when any comp column
               differs from y by vals or more
        n - number of closest rows
        num_weights - number of additional euclidean searches to run, each
                      with a fresh set of random weights from w_randomizer

    Output:
        dict mapping a metric name to the list of index labels returned by
        the corresponding search:
            "cos"              - find_closest_cos over all rows of X
            "euc_unweighted"   - find_closest_euc over the filtered rows
            "euc_weighted_<i>" - find_closest_euc over the filtered rows
                                 with random weights, i = 1..num_weights
    """
    # True wherever a compared dimension is at least `vals` away from y.
    too_far = (abs(X[comp] - y[comp]) - vals) >= 0
    drop_rows = too_far.any(axis = 1)

    search_df = X.loc[~drop_rows]

    to_return = {}
    # NOTE(review): the cosine search runs on the unfiltered X while the
    # euclidean searches use search_df - confirm this is intentional.
    to_return["cos"] = find_closest_cos(X, y, n)
    to_return["euc_unweighted"] = find_closest_euc(search_df, y, n)
    for i in range(num_weights):
        to_return[f"euc_weighted_{i + 1}"] = find_closest_euc(search_df, y, n, w_randomizer(len(y)))

    return to_return

In [5]:
# def treatment_effect_grid(yes_df, no_df, main_df, comparables, checks, check_vals, neighbors = 10, random = None):
    
#     search_df = yes_df[comparables].copy()
#     compare_df = no_df[comparables].copy()
    
#     w = None
    
#     if random:
#         w = w_randomizer(len(comparables)) 
    
#     for idx, row in search_df.iterrows():
#         closest = find_approx_closest_euc(compare_df[no_df.date == yes_df.date[idx]].copy(), row,checks,check_vals, neighbors, w)
#         nearest_idx[(yes_df.fips[idx], yes_df.date[idx])] = no_df.fips[closest].values
        
#     low_count = 0
#     for key in nearest_idx:
#         low_count += len(nearest_idx[key]) < neighbors
        
#     treatment_effect = []
#     counterfactual_list = []
#     outcome = []
#     fips_list = []
#     treatment_outcome = "past_week_cases"
#     time_frame = 14

#     for key, val in nearest_idx.items():

#         check_date = key[1] + timedelta(days = time_frame)
#         actual = main_df[(main_df.fips == key[0]) & (main_df.date == check_date)][treatment_outcome].values.item()
#         if np.isnan(actual):
#             continue

#         if len(val) == 0:
#             continue

#         counterfactual = np.nanmean(main_df[(main_df.fips.isin(val)) & (main_df.date == check_date)][treatment_outcome].values)

#         if np.isnan(counterfactual):
#             continue

#         fips_list.append(key[0])
#         counterfactual_list.append(counterfactual)
#         outcome.append(actual)
#         treatment_effect.append(actual - counterfactual)

#     treatment_df = pd.DataFrame({'fips': fips_list, 'effect':treatment_effect, 'actual':outcome, 'counter':counterfactual_list})

#     avg_treatment = np.mean(treatment_df.effect)
#     avg_treatment.sort()
#     treatment_ci = (np.percentile(avg_treatment, 2.5), np.percentile(avg_treatment, 97.5))

#     return [avg_treatment, treatment_ci, low_count]