In [None]:
# Source: https://www.vibhuagrawal.com/blog/geospatial-nearest-neighbor-search

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import pickle

import random

# Sphere radius used by haversine_distance. The distance cell divides the
# haversine result by 1000 and labels it km, so this value is in metres.
R = 6371008.7714
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def haversine_distance(lat_1, lon_1, lat_2, lon_2, radius=None):
    """Great-circle (haversine) distance between two points on a sphere.

    Parameters
    ----------
    lat_1, lon_1, lat_2, lon_2 : float or array-like
        Latitudes and longitudes in radians. Scalars, NumPy arrays and
        pandas Series are accepted and broadcast element-wise.
    radius : float, optional
        Sphere radius. Defaults to the module-level constant ``R``.

    Returns
    -------
    float or array-like
        Distance expressed in the same unit as ``radius``.
    """
    if radius is None:
        radius = R

    hav = (
        np.sin((lat_2 - lat_1) / 2) ** 2
        + np.cos(lat_1) * np.cos(lat_2) * np.sin((lon_2 - lon_1) / 2) ** 2
    )
    # Floating-point rounding can push the term marginally outside [0, 1]
    # for (near-)antipodal points, which would make arcsin return NaN.
    hav = np.clip(hav, 0.0, 1.0)
    return 2 * radius * np.arcsin(np.sqrt(hav))

In [None]:

# Load the appraisal frame and keep a working copy of it.
df_appr_nona = pd.read_pickle("./OUT_dfs/df_appr_full_processed_nona.pkl")
df_a = df_appr_nona.copy()

# NOTE: unpickling can execute arbitrary code; only load files this project wrote itself.
# The context manager makes sure the file handle is closed right after loading.
with open("./OUT_dfs/dict_apprid_to_uniquecompidnew", "rb") as fh:
    dict_apprid_to_uniquecompidnew = pickle.load(fh)
uniqueComps = pd.read_pickle("./OUT_dfs/uniqueComps.pkl")

d_list = list(dict_apprid_to_uniquecompidnew.items())
uniqueComps.columns

In [None]:
df_aa = df_appr_nona.copy()

# Look up the comp ids of every appraisal; an id missing from the mapping
# still raises KeyError. Collecting the values first and building a single
# DataFrame avoids constructing one pandas Series per row.
comp_id_cols = ['_COMP_ID1', '_COMP_ID2', '_COMP_ID3']
comp_ids = pd.DataFrame(
    [dict_apprid_to_uniquecompidnew[appr_id] for appr_id in df_aa['SUBJ_APPR_ID']],
    index=df_aa.index,
)
df_aa[comp_id_cols] = comp_ids


# Calculate distances from appraisal to each of its comps

In [None]:
# Start the distance work from a fresh deep copy of the comp-id frame.
df_a = df_aa.copy(deep=True)
df_a.shape

In [None]:
# Attach the latitude/longitude of comp 1, 2 and 3 to every appraisal row.
for comp_no in (1, 2, 3):
    id_col = f'_COMP_ID{comp_no}'
    df_temp_11 = uniqueComps[["UNIQUECOMPIDNEW", 'APPRLATITUDE', 'APPRLONGITUDE']].rename(
        columns={
            'UNIQUECOMPIDNEW': id_col,
            'APPRLATITUDE': f'_COMP{comp_no}_LAT',
            'APPRLONGITUDE': f'_COMP{comp_no}_LON',
        }
    )
    df_a = df_a.merge(df_temp_11, on=id_col, how='left')

In [None]:
# haversine_distance works on radians, so convert all coordinate columns first.
coord_cols = [
    'APPRLATITUDE', 'APPRLONGITUDE',
    "_COMP1_LAT", "_COMP1_LON",
    "_COMP2_LAT", "_COMP2_LON",
    "_COMP3_LAT", "_COMP3_LON",
]
df_z = df_a[coord_cols].apply(np.radians, axis=0)

# Calculate the appraisal-to-comp distances; dividing by 1000 turns them into km.
for comp_no in (1, 2, 3):
    df_z[f"_COMP_{comp_no}_DISTKM"] = haversine_distance(
        df_z['APPRLATITUDE'], df_z['APPRLONGITUDE'],
        df_z[f'_COMP{comp_no}_LAT'], df_z[f'_COMP{comp_no}_LON'],
    ) / 1000

# Row-wise mean of the three distances.
df_z["_COMP_DISTKM_AVG"] = df_z[['_COMP_1_DISTKM', '_COMP_2_DISTKM', '_COMP_3_DISTKM']].mean(axis=1)

In [None]:
# Bring the per-comp and average distances back onto the main appraisal frame,
# matching rows by index.
dist_km_cols = ["_COMP_1_DISTKM", "_COMP_2_DISTKM", "_COMP_3_DISTKM", "_COMP_DISTKM_AVG"]
df_a = df_a.merge(
    df_z[dist_km_cols],
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:


# Keep only rows whose average appraisal-to-comp distance is at most 100 km.
data1 = df_a[df_a["_COMP_DISTKM_AVG"] <= 100]

x1 = list(np.array(data1["_COMP_DISTKM_AVG"]))
y1 = list(np.array(data1["STATE"]))
hue1 = list(np.array(data1["CITY"]))  # not used by the plots below

sns.set_theme(style="whitegrid")

# Draw both layers on one explicit Axes instead of relying on the current-axes state.
f, ax = plt.subplots()
sns.despine(ax=ax, bottom=True, left=True)

# Show each observation with a scatterplot
sns.stripplot(
    data=data1, x=x1, y=y1,
    dodge=True, alpha=.25, zorder=1, legend=False,
    ax=ax,
)

# Overlay the per-state mean as a diamond marker
sns.pointplot(
    data=data1, x=x1, y=y1,
    join=False, dodge=.8 - .8 / 3, palette="dark",
    markers="d", scale=.75, errorbar=None,
    ax=ax,
)

ax.set(
    title='Average Appr-Comp distance by state (km)',
    xlabel='Average appraisal-to-comp distance (km)',
    ylabel='State',
);



In [None]:
# Series.max() skips NaN; the builtin max() gives order-dependent results when NaN is present.
maximum = df_a["_COMP_DISTKM_AVG"].max()
print(maximum)

# The bin edges run from 10 km to 100 km, so values outside that range are not drawn.
plt.hist(df_a["_COMP_DISTKM_AVG"], edgecolor='black', bins=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
plt.xlabel('Average appraisal-to-comp distance (km)')
plt.ylabel('Number of appraisals')
plt.title('Distribution of average Appr-Comp distance (10-100 km)');


In [None]:
# Number of rows that carry an average comp distance at this point.
df_a["_COMP_DISTKM_AVG"].shape

In [None]:
# Print the average-distance distribution in 0.01-percentile steps to see where the outliers sit.
pct_grid = np.arange(0, 100, 0.01)
dist_percentiles = np.round(np.percentile(df_a["_COMP_DISTKM_AVG"], pct_grid), 4)
for step, value in enumerate(dist_percentiles):
    print(step / 100, "%:", value)

In [None]:
# Proposal: treat an average comp distance of exactly 0 km, or of 50 km and more,
# as an outlier (the top and bottom 0.03% of comps). The cut-offs could be tightened further.
avg_dist_km = df_a["_COMP_DISTKM_AVG"]
is_dist_outlier = avg_dist_km.eq(0) | avg_dist_km.ge(50)
df_a = df_a.loc[~is_dist_outlier]



# Look at distance in time

In [None]:
# Attach the sale date of comp 1, 2 and 3 to every appraisal row.
for comp_no in (1, 2, 3):
    id_col = f'_COMP_ID{comp_no}'
    df_temp_11 = uniqueComps[["UNIQUECOMPIDNEW", 'SALEDATE']].rename(
        columns={
            'UNIQUECOMPIDNEW': id_col,
            'SALEDATE': f'_COMP{comp_no}_SALEDATE',
        }
    )
    df_a = df_a.merge(df_temp_11, on=id_col, how='left')

In [None]:
# Display the frame after the comp sale dates were merged in.
df_a

In [None]:
# Work on an explicit copy so the column assignments below do not raise
# SettingWithCopyWarning.
df_z = df_a[['SALEDATE', "_COMP1_SALEDATE", "_COMP2_SALEDATE", "_COMP3_SALEDATE"]].copy()

# Gap between the appraisal's SALEDATE and each comp's sale date
# (positive when the comp's date is the earlier one).
for comp_no in (1, 2, 3):
    df_z[f"_COMP_{comp_no}_DISTDAYS"] = df_z['SALEDATE'] - df_z[f"_COMP{comp_no}_SALEDATE"]

# calculate mean of all 3 gaps
df_z["_COMP_DISTDAYS_AVG"] = np.mean(df_z[['_COMP_1_DISTDAYS', '_COMP_2_DISTDAYS', '_COMP_3_DISTDAYS']], axis=1)
df_z

In [None]:
# Print the average day-gap distribution in 0.01-percentile steps to see where the outliers sit.
pct_grid = np.arange(0, 100, 0.01)
avg_gap_days = (df_z["_COMP_DISTDAYS_AVG"]).dt.days
gap_percentiles = np.round(np.percentile(avg_gap_days, pct_grid), 4)
for step, value in enumerate(gap_percentiles):
    print(step / 100, "%:", value)

In [None]:
# Remove every row whose average gap is 14 days or less (negative gaps included)
# or 730 days (two years) or more. The 14-day cut-off could be moved either way.
avg_gap_days = df_z["_COMP_DISTDAYS_AVG"].dt.days
is_gap_outlier = avg_gap_days.le(14) | avg_gap_days.ge(730)
df_z = df_z.loc[~is_gap_outlier]

In [None]:
# df_a already carries the _COMP*_SALEDATE columns, so bring over only the new
# gap columns; merging the sale dates again would create *_x / *_y duplicates.
# how='right' keeps just the rows that are still present in df_z.
df_a = df_a.merge(
    df_z[["_COMP_1_DISTDAYS", "_COMP_2_DISTDAYS", "_COMP_3_DISTDAYS", "_COMP_DISTDAYS_AVG"]],
    left_index=True,
    right_index=True,
    how='right',
)
df_a

In [None]:
# Persist the filtered appraisal frame for later use.
df_a.to_pickle("./OUT_dfs/df_appr_full_processed_nona_no_outliers.pkl")

In [None]:

data1 = df_a

x1 = list(np.array(data1["_COMP_DISTDAYS_AVG"].dt.days))
y1 = list(np.array(data1["STATE"]))
hue1 = list(np.array(data1["CITY"]))  # not used by the plots below

sns.set_theme(style="whitegrid")

# Draw both layers on one explicit Axes instead of relying on the current-axes state.
f, ax = plt.subplots()
sns.despine(ax=ax, bottom=True, left=True)

# Show each observation with a scatterplot
sns.stripplot(
    data=data1, x=x1, y=y1,
    dodge=True, alpha=.25, zorder=1, legend=False,
    ax=ax,
)

# Overlay the per-state mean as a diamond marker
sns.pointplot(
    data=data1, x=x1, y=y1,
    join=False, dodge=.8 - .8 / 3, palette="dark",
    markers="d", scale=.75, errorbar=None,
    ax=ax,
)

ax.set(
    title='Average Appr-Comp distance by state (in days from appraisal)',
    xlabel='Average appraisal-to-comp gap (days)',
    ylabel='State',
);