In [7]:
from src import *

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns

def euclidean(point, data):
    return np.sqrt(np.sum((data - point)**2, axis=1))

def w_inverse_LMAE(lmae):
    """
    Inverse Local MAE
    """
    return 1/lmae

def w_inverse_log_LMAE(residuals):
    """
    Inverse Log Local MAE
    """
    np.log(max(abs(residuals))/mean_absolute_error(residuals))


def mean_absolute_error(actual, predicted):
    """
    Local Mean Absolute Error (LMAE)
    """
    return np.mean(np.abs(predicted - actual))


def root_mean_squared_error(actual, predicted):
    """
    Local Root Mean Squared Error (LRMSE)
    """
    return np.sqrt(np.mean((predicted - actual)**2))


def weighted_mean_absolute_error(actual, predicted, weights):
    """
    Weighted Local Mean Absolute Error (WLMAE)
    """
    return np.average(np.abs(predicted - actual), weights=weights)

In [30]:
import pandas as pd
import numpy as np

# A function which return the most similar elements to a point
def get_k_nearest_neighbors(point, data, k, metric):
    distances = metric(point, data)
    return distances.argsort()[:k]

def get_k_nearest_neighbors_weights(point, data, k, metric, weights):
    distances = metric(point, data)
    return distances.argsort()[:k], weights(distances)

# A function that predicts the value of a point given many different models provided as input using the inverse of the LMAE as weights
def predict_inverse_LMAE(point, data, k, metric):
    neighbors = get_k_nearest_neighbors(point, data, k, metric)
    return np.average(data[neighbors], axis=0, weights=w_inverse_LMAE)

# A function that evalaute the error bias associated to each machine learning forecast diveded by the number of forecasts
def error_bias(data, k, metric):
    error_bias = []
    for i in range(len(data)):
        neighbors = get_k_nearest_neighbors(data[i], data, k, metric)
        error_bias.append(np.sum(data[neighbors] - data[i])/k)
    return error_bias

class KNeighborsSpotter():
    def __init__(self, k=5, dist_metric=euclidean):
        self.k = k
        self.dist_metric = dist_metric
        
    def fit(self, X_val, y_val):
        self.X_val = X_val
        self.y_val = y_val

    def find_similar_neighbors(self, x):

        distances = self.dist_metric(x, self.X_val)
        y_sorted = [y for _, y in sorted(zip(distances, self.y_val, self.X_val))]
        return y_sorted[:self.k]

    def predict(self, X_test, pred_columns, target_column, weight_function=w_inverse_LMAE, bias=False):

        weights = []
        biases = []

        for x in X_test.iterrows():
            neighbors = self.find_similar_neighbors(x)
            
            for column in pred_columns:
                preds_val = neighbors[column]
                target_val = neighbors[target_column]
                w = weight_function(target_val, preds_val)
                weights.append(w)
                biases = (target_val - preds_val) / len(target_val)
        if bias:
            return (X_test[pred_columns] * np.array(weights).T) + biases/ sum(weights)
        else:
            return (X_test[pred_columns]*np.array(weights).T) / sum(weights)

In [14]:
import numpy as np

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeRegressor
import numpy as np

california_housing = fetch_california_housing(as_frame=True)

training = california_housing.frame[california_housing.feature_names]
target = california_housing.frame[california_housing.target_names]

X_train, X_test, y_train, y_test = train_test_split(training, target, random_state=1234, test_size=0.20)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=1234, test_size=0.20)

In [27]:
alphas = np.logspace(-3, 1, num=30)
model = make_pipeline(StandardScaler(), 
                      RidgeCV(alphas=alphas))

tree_one = DecisionTreeRegressor(max_depth=3, random_state=0)
tree_two = DecisionTreeRegressor(max_depth=10, random_state=0)

model = make_pipeline(StandardScaler(), 
                      RidgeCV(alphas=alphas))

tree_one.fit(X_train, y_train)
tree_two.fit(X_train, y_train)

X_validation["one_preds"]=tree_one.predict(X_validation[california_housing.feature_names])
X_validation["two_preds"]=tree_two.predict(X_validation[california_housing.feature_names])

In [34]:
ensemble = KNeighborsSpotter(5)
ensemble.fit(X_validation, y_validation)

In [35]:
ensemble(X_test, ["one_preds", "two_preds"], [california_housing.target_names])

TypeError: 'KNeighborsSpotter' object is not callable