In [None]:
import numpy as np
import pickle
import pandas as pd 
from scipy import spatial
from sklearn.neighbors import KDTree, BallTree, NearestNeighbors
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

from scipy.spatial import distance

from operator import methodcaller
from sklearn.metrics.pairwise import cosine_distances

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import haversine_distances

from sklearn.metrics.pairwise import laplacian_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import manhattan_distances

from sklearn.metrics.pairwise import rbf_kernel

# Pairwise metric callables collected in one place.
# Naming convention relied on by knn_in_time_and_distance: a callable whose
# __name__ contains "distances" is used as a distance directly, while the
# *_kernel callables are similarities and get converted with 1 - kernel.
distances = [
    cosine_distances,
    euclidean_distances,
    laplacian_kernel,
    manhattan_distances,
    rbf_kernel,
]

In [None]:
# Set up
# Load the two pickled frames used throughout the notebook: df_c is passed
# below as the comp frame and df_a as the appraisal frame.
# NOTE: unpickling can execute arbitrary code embedded in the file -- only
# load pickles that were produced by a trusted step of this project.
df_c = pd.read_pickle("./OUT_dfs/df_c2.pkl")
df_a = pd.read_pickle("./OUT_dfs/df_a2.pkl")

In [None]:
# Show the column names available in the comp frame.
df_c.columns

In [None]:
def table(d):
    """
    Build a top-k hit-rate table from the ranks at which the actual comps were found.

    Accepts a dictionary in the form of appr id as int : [list of ranks], where each
    rank is the 0-based position of one appraiser-chosen comp in the sorted candidate
    list. A list may hold fewer than 3 ranks (the missing slots count as misses).
    If a list holds more than 3 ranks, only the first 3 are used so that one
    over-long entry cannot break the whole table.

    Returns a pandas dataframe with rows top_3/5/10/25/50/100 and columns
    1_comps/2_comps/3_comps. Cell (top_k, i_comps) is the share of appraisals
    whose i-th listed rank is strictly below k. An empty dictionary yields a
    table filled with NaN (no appraisals to average over).
    """
    row_labels = ['top_3', 'top_5', 'top_10', 'top_25', 'top_50', 'top_100']
    col_labels = ['1_comps', '2_comps', '3_comps']
    cutoffs = np.array([3, 5, 10, 25, 50, 100])
    n_slots = len(col_labels)

    if len(d) == 0:
        # Nothing to average: return NaN explicitly instead of dividing by zero.
        return pd.DataFrame(index=row_labels, columns=col_labels,
                            data=np.full((cutoffs.size, n_slots), np.nan))

    output = np.zeros((cutoffs.size, n_slots), dtype=int)
    for value in d.values():
        # Keep at most n_slots ranks; shorter lists simply leave trailing slots at 0.
        entry = np.asarray(value).ravel()[:n_slots]
        # (n_cutoffs, len(entry)) boolean matrix: is rank j inside the top-k of row i?
        hits = entry[np.newaxis, :] < cutoffs[:, np.newaxis]
        output[:, :entry.size] += hits

    return pd.DataFrame(index=row_labels, columns=col_labels, data=output / len(d))


In [None]:
# Draw fixed-size, reproducible appraisal samples (seeded split).
# NOTE(review): test_a is not referenced by any cell visible in this notebook
# section; the evaluation loop iterates over train_a.
train_a, test_a = train_test_split(
    df_a,
    train_size=5000,
    test_size=100,
    random_state=6666699,
)

In [None]:
%%time
# Build a ball tree over the comp coordinates with haversine (great-circle)
# distance as the metric. Row order of the tree matches the row order of df_c.
# NOTE(review): sklearn's haversine metric expects [latitude, longitude] in
# radians -- confirm df_c stores these two columns in radians.
distance_tree = BallTree(
    df_c[['APPRLATITUDE', 'APPRLONGITUDE']].values,
    leaf_size=2,
    metric='haversine',
)


In [None]:
#migrate this to a file to be imported and reused

def knn_in_time_and_distance(appr_id, appraisal_df, comp_df, column_selection, distance_metric,
                             tree_object, k=8000, day_distance=760):
    """
    Rank candidate comps for one appraisal by a combined feature/time/geo distance.

    Given one appraisal id plus the appraisal and comp dataframes, the function first
    fetches the k geographically nearest comps from ``tree_object`` (haversine distance).
    It then keeps only comps sold strictly before the appraisal date and strictly within
    ``day_distance`` days of it (default 760 days, roughly two years). The surviving comps
    are min-max scaled together with the appraisal and scored with ``distance_metric``.

    INPUTS
    appr_id: chosen appraisal id (matched against appraisal_df["SUBJ_APPR_ID"]); if several
        rows match, the first one is used
    appraisal_df: dataframe of appraisals with lat and long in radians
    comp_df: dataframe of comps with lat and long in radians; ``tree_object`` must have been
        built on this frame's ['APPRLATITUDE', 'APPRLONGITUDE'] in the same row order,
        because the indices returned by the tree are used positionally on comp_df
    column_selection: list of transformed feature columns present in both frames,
        without saledate or long/lat
    distance_metric: callable as imported from sklearn.metrics.pairwise. If its __name__
        contains "distances" the result is used directly, otherwise it is treated as a
        similarity kernel and converted with 1 - kernel
    tree_object: fitted neighbour tree exposing ``query(points, k=...)``
    k: number of nearest neighbours in distance (capped at the number of rows in comp_df)
    day_distance: look-back window in days prior to the appraisal date

    RETURNS
    A new comp dataframe (inputs are not modified) with at most k rows, the columns in
    ``column_selection`` min-max scaled, and these extra columns:
    Distance (in km) - scaled geographic distance of this comp from the appraisal
    Distance (in days) - scaled time gap between this comp's sale and the appraisal
    Actual_comp - binary column with 1 indicating the comp has been picked by the appraiser
    TOTAL_DISTANCE - combined distance of the comp to the appraisal (smaller is closer)
    If no comp falls inside the time window, an empty frame with these columns is returned.

    RAISES
    IndexError if ``appr_id`` is not present in ``appraisal_df``.
    """
    # Work on copies so that neither input frame picks up the helper columns.
    query_appraisal = appraisal_df[appraisal_df["SUBJ_APPR_ID"] == appr_id].copy()
    if query_appraisal.empty:
        raise IndexError(f"appraisal id {appr_id!r} not found in appraisal_df['SUBJ_APPR_ID']")
    # The appraisal is at distance zero from itself in both time and space.
    query_appraisal["Distance (in days)"] = 0
    query_appraisal["Distance (in km)"] = 0

    query_appr_saledate = query_appraisal["SALEDATE"].iloc[0]
    query_appraisal_compidsnew = query_appraisal[["_COMP_ID1", "_COMP_ID2", "_COMP_ID3"]].values.tolist()[0]
    query_point = query_appraisal[['APPRLATITUDE', 'APPRLONGITUDE']].values[0]

    # Query the tree to get the k nearest comps; the tree cannot return more
    # neighbours than it holds, so cap k instead of letting the query raise.
    k = min(k, len(comp_df))
    geo_distances, indices = tree_object.query([query_point], k=k)
    result_df = comp_df.iloc[indices[0]].copy()
    earth_radius_m = 6371008.7714  # mean Earth radius in metres; haversine output is in radians
    result_df['Distance (in km)'] = geo_distances[0] * earth_radius_m / 1000

    # Filter out anything on/after the appraisal date or older than the look-back window.
    window_start = query_appr_saledate - pd.Timedelta(day_distance, unit='d')
    in_window = (result_df['SALEDATE'] < query_appr_saledate) & (result_df['SALEDATE'] > window_start)
    result_df = result_df[in_window].copy()

    result_df["Distance (in days)"] = (query_appr_saledate - result_df['SALEDATE']).dt.days
    result_df["Actual_comp"] = np.where(result_df["UNIQUECOMPIDNEW"].isin(query_appraisal_compidsnew), 1, 0)

    if result_df.empty:
        # Nothing to scale or score; hand back an empty frame with the full schema.
        result_df['TOTAL_DISTANCE'] = np.array([], dtype=float)
        return result_df

    # Scale: fit on the candidate comps, then map the appraisal into the same space.
    scaler = MinMaxScaler()
    cs = list(column_selection) + ["Distance (in days)", 'Distance (in km)']
    result_df[cs] = scaler.fit_transform(result_df[cs])
    query_appraisal[cs] = scaler.transform(query_appraisal[cs])

    # Pairwise result has shape (n_comps, n_query_rows); column 0 is the first
    # (normally only) appraisal row, flattened to one value per comp.
    pairwise = np.asarray(distance_metric(result_df[cs].values, query_appraisal[cs].values))[:, 0]

    if "distances" in distance_metric.__name__:
        result_df['TOTAL_DISTANCE'] = pairwise
    else:
        # Kernels are similarities: flip them so that smaller still means closer.
        result_df['TOTAL_DISTANCE'] = 1 - pairwise

    return result_df

    

In [None]:
%%time
# Feature columns fed into the similarity metric (order preserved).
cols = [
    # rooms and areas
    'TOTALRM', 'BDRM', 'BLGRDTOTALSQFT', 'BLGRDFINISHSQFT', 'BLGRDRECRM',
    'BLGRDBEDRM', 'BLGRDOTHERRM', 'GROSSLIVINGAREA', 'SITEAREASQFT',
    'ACTUALAGE',
    # bathrooms and stories
    'FULL_BATH', 'FULL_BLGRDBATHRM', 'HALF_BATH', 'HALF_BLGRDBATHRM',
    'Number_of_stories_no_imputation',
    # location flags
    'LOCADJPARK', 'LOCADJPOWERLINE', 'LOCBUSYROAD', 'LOCCOMMERCIAL',
    'LOCGOLFCOURSE', 'LOCINDUSTRIAL', 'LOCLANDFILL', 'LOCPUBLICTRAN',
    'LOCRESIDENTIAL',
    # view flags
    'VIEWTYPECITYSKYLINE', 'VIEWTYPECITYSTREET', 'VIEWTYPEGOLFCOURSE',
    'VIEWTYPEINDUSTRIAL', 'VIEWTYPELIMITED', 'VIEWTYPEMOUNTAIN',
    'VIEWTYPEPARK', 'VIEWTYPEPASTORAL', 'VIEWTYPEPOWERLINE',
    'VIEWTYPERESIDENTIAL', 'VIEWTYPEWATER', 'VIEWTYPEWOOD',
    # overall ratings
    'CONDITION_ALL', 'QUALITYOFCONST_ALL', 'VIEWRTG_ALL', 'LOCRTG_ALL',
]

# For every sampled appraisal: rank its candidate comps by TOTAL_DISTANCE and
# record the rank positions at which the appraiser's actual comps appear.
res_dict = {}
for appraisal in train_a["SUBJ_APPR_ID"].tolist():
    result_df2 = knn_in_time_and_distance(
        appraisal,
        df_a,
        df_c,
        cols,
        manhattan_distances,
        tree_object=distance_tree,
        k=8000,
        day_distance=760,
    )

    ranked_result_df2 = (
        result_df2[result_df2["Actual_comp"] >= 0]
        .sort_values(by=["TOTAL_DISTANCE"])
        .reset_index()
    )

    # After reset_index the index is the 0-based rank in the sorted list.
    is_actual_comp = ranked_result_df2["Actual_comp"] == 1
    res_dict[appraisal] = ranked_result_df2.index[is_actual_comp].tolist()

table(res_dict)

# Visualise the result

In [None]:
import matplotlib.pyplot as plt

# Only appraisals for which at least three actual comps were found contribute;
# each of their first three ranks goes into its own per-position series.
pos_0 = [value[0] for value in res_dict.values() if len(value) >= 3]
pos_1 = [value[1] for value in res_dict.values() if len(value) >= 3]
pos_2 = [value[2] for value in res_dict.values() if len(value) >= 3]

# One semi-transparent histogram per comp position, overlaid on the same axes.
for ranks, label in ((pos_0, 'Comp 1'), (pos_1, 'Comp 2'), (pos_2, 'Comp 3')):
    plt.hist(ranks, bins=25, alpha=0.5, label=label)

# NOTE(review): res_dict is filled from train_a in the evaluation cell, while
# the title below says "test set" -- confirm which sample is intended.
plt.legend(loc='upper right')
plt.xlabel('Predicted index of all 3 comps')
plt.ylabel('Frequency')
plt.title('Final model predicted indices on test set')

# Render the figure.
plt.show()
