In [None]:
# Libraries
import pickle
import pandas as pd
import numpy as np

from math import sqrt, cos, sin, asin, radians

from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

from datetime import datetime
from datetime import date

import matplotlib.pyplot as plt

# Load datasets:

In [None]:
# Load the datasets:
df_appr_nona = pd.read_pickle("./OUT_dfs/df_appr_full_processed_nona.pkl")
df_comp_unique = pd.read_pickle("./OUT_dfs/uniqueComps.pkl")

# Dictionary of appraisal: maps an appraisal id to the UNIQUECOMPIDNEW values
# recorded for it (it is indexed by SUBJ_APPR_ID values further down).
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project's own pipeline.
# The context manager closes the file handle, which the bare open() call leaked.
with open("./OUT_dfs/dict_apprid_to_uniquecompidnew", "rb") as dict_file:
    dict_apprid_to_uniquecompidnew = pickle.load(dict_file)

In [None]:
# Spot-check: show the comparable ids stored in the lookup for one appraisal id.
test_appr = 856775
dict_apprid_to_uniquecompidnew[test_appr]

In [None]:
# Work on copies so the frames loaded from disk stay untouched:
appr, comp = df_appr_nona.copy(), df_comp_unique.copy()

# Drop rows that have null values:

In [None]:
# Helper function to drop any NaN in the defined columns:

def remove_nan(a_df, c_df):
    """Drop rows that lack coordinates or a sale date from both tables.

    Args:
        a_df: appraisals DataFrame.
        c_df: comparables DataFrame.

    Returns:
        Tuple ``(a_full, c_full)`` of new DataFrames in which every row has
        non-null APPRLONGITUDE, APPRLATITUDE and SALEDATE. The inputs are not
        modified.
    """
    # Both tables are filtered on the same three columns.
    required_cols = ['APPRLONGITUDE', 'APPRLATITUDE', 'SALEDATE']
    return tuple(frame.dropna(subset=required_cols) for frame in (a_df, c_df))

In [None]:
# Keep only rows that have coordinates and a sale date in both tables.
appr_no_nan, comp_no_nan = remove_nan(appr, comp)

In [None]:
# Show the comparables recorded for the spot-check appraisal that survived the
# NaN filter. Uses `test_appr` rather than repeating the literal id so the two
# inspection cells cannot drift apart.
comp_no_nan.loc[comp_no_nan['UNIQUECOMPIDNEW'].isin(dict_apprid_to_uniquecompidnew[test_appr])]

# Dataset for training:

In [None]:
# def full_or_sample_dataset(a_df, c_df, sub_sample=None, state=None):
#     if sub_sample == None and state == None:
#         a_train = a_df.copy() # computationally expensive
#         c_train = c_df[c_df['SUBJ_APPR_ID'].isin(a_train['SUBJ_APPR_ID'])]
#         print('Full dataset loaded successfully!')
#     elif sub_sample == None and state!=None:
#         a_train = a_df[a_df['STATE'] == county]
#         c_train = c_df[c_df['SUBJ_APPR_ID'].isin(a_train['SUBJ_APPR_ID'])]
#         print('Subset from state ' + state + ' loaded successfully!')
#     elif sub_sample != None and state == None:
#         a_train = a_df.sample(n=sub_sample)
#         c_train = c_df[c_df['SUBJ_APPR_ID'].isin(a_train['SUBJ_APPR_ID'])]
#         print('Subset of n = ' + str(sub_sample) + ' loaded successfully!')
#     else:
#         a_temp =  a_df[a_df['STATE']==state]
#         a_train = a_temp.sample(n=sub_sample)
#         c_train = c_df[c_df['SUBJ_APPR_ID'].isin(a_train['SUBJ_APPR_ID'])]
#         print('Subset of state' + state + ' with n = ' + str(sub_sample) +' loaded successfully!')
    
#     if a_train.shape[0] *3 != c_train.shape[0]:
#         raise RuntimeError(f'Appraisals: {a_train.shape[0]}. Comparables: {c_train.shape[0]}')
#     print('')
#     print('Number of rows in appraisals dataset: ', a_train.shape[0])
#     print('Number of rows in comparables dataset: ', c_train.shape[0])
    
#     return a_train, c_train  

In [None]:
def full_or_sample_dataset(a_df, c_df, sub_sample=None, state=None,
                           comp_lookup=None, n_extra_comps=1000, random_state=None):
    """Select the appraisals to train on and gather their comparables.

    Args:
        a_df: appraisals DataFrame; needs 'SUBJ_APPR_ID' (and 'STATE' when
            ``state`` is given).
        c_df: comparables DataFrame; needs 'UNIQUECOMPIDNEW'.
        sub_sample: number of appraisals to draw at random, or None for all.
        state: keep only appraisals whose 'STATE' equals this value, or None
            for no state filter.
        comp_lookup: mapping from appraisal id to the UNIQUECOMPIDNEW values of
            its comparables. Defaults to the notebook-level
            ``dict_apprid_to_uniquecompidnew``.
        n_extra_comps: how many additional randomly drawn comparables to
            include on top of the ones listed in ``comp_lookup``.
        random_state: forwarded to ``DataFrame.sample`` for reproducible draws;
            None keeps the unseeded behaviour.

    Returns:
        Tuple ``(a_train, c_train)``: the chosen appraisals and the matching
        comparables, de-duplicated on 'UNIQUECOMPIDNEW'.

    Raises:
        KeyError: if a chosen appraisal id is missing from ``comp_lookup``.
        ValueError: if more rows are requested from ``sample`` than exist.
    """
    if comp_lookup is None:
        # Fall back to the mapping loaded at the top of the notebook.
        comp_lookup = dict_apprid_to_uniquecompidnew

    if sub_sample is None and state is None:
        a_train = a_df.copy() # computationally expensive
        print('Full dataset loaded successfully!')
    elif sub_sample is None:
        # Fixed: this branch compared against the undefined name `county`.
        a_train = a_df[a_df['STATE'] == state]
        print('Subset from state ' + state + ' loaded successfully!')
    elif state is None:
        a_train = a_df.sample(n=sub_sample, random_state=random_state)
        print('Subset of n = ' + str(sub_sample) + ' loaded successfully!')
    else:
        a_temp = a_df[a_df['STATE'] == state]
        a_train = a_temp.sample(n=sub_sample, random_state=random_state)
        print('Subset of state' + state + ' with n = ' + str(sub_sample) +' loaded successfully!')

    # Collect the per-appraisal slices and concatenate once; growing a frame
    # with concat inside the loop copies all accumulated rows on every pass.
    comp_frames = [
        c_df[c_df['UNIQUECOMPIDNEW'].isin(comp_lookup[appr_id])]
        for appr_id in a_train['SUBJ_APPR_ID']
    ]
    # Extra comparables that were not necessarily chosen for any sampled appraisal.
    comp_frames.append(c_df.sample(n=n_extra_comps, random_state=random_state))

    # keep='first' retains the row order produced by the per-appraisal slices.
    c_train = pd.concat(comp_frames).drop_duplicates(subset=['UNIQUECOMPIDNEW'], keep='first')

    print('Full comparables dataset loaded successfully!')
    print('')
    print('Number of rows in appraisals dataset: ', a_train.shape[0])
    print('Number of rows in unique comparables dataset: ', c_train.shape[0])

    return a_train, c_train

In [None]:
%%time

# Draw 2,500 random appraisals (no state filter) together with their recorded
# comparables and the extra random comparables added by the helper.
appr_df, comp_df  = full_or_sample_dataset(appr_no_nan, comp_no_nan, sub_sample=2500, state=None)

In [None]:
# List the columns available in the sampled comparables table.
comp_df.columns

# Feature Importance:

In [None]:
# Columns categorization:

# Columns dropped from both tables further down in the notebook.
_shared_excl_cols = ['COMPNUM', 'ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY', 'COMPSALEDATE']
appr_excl_cols = list(_shared_excl_cols)
# The comparables table additionally drops the appraisal id.
comp_excl_cols = ['SUBJ_APPR_ID', *_shared_excl_cols]

# Columns used to derive the distance / time-difference features.
calc_cols = ['APPRLATITUDE', 'APPRLONGITUDE', 'SALEDATE']

# LOC*, VIEW*, QUALITYOFCONST* and CONDITION* columns shared by both tables.
_shared_cat_cols = [
    'LOCRTGNEUTRAL', 'LOCRTGBENEFICIAL', 'LOCRTGADVERSE', 'LOCRESIDENTIAL',
    'LOCINDUSTRIAL', 'LOCCOMMERCIAL', 'LOCBUSYROAD', 'LOCWATERFRONT', 'LOCGOLFCOURSE',
    'LOCADJPARK', 'LOCADJPOWERLINE', 'LOCLANDFILL', 'LOCPUBLICTRAN',
    'VIEWRTGNEUTRAL', 'VIEWRTGBENEFICIAL', 'VIEWRTGADVERSE', 'VIEWTYPEWATER',
    'VIEWTYPEPASTORAL', 'VIEWTYPEWOOD', 'VIEWTYPEPARK', 'VIEWTYPEGOLFCOURSE',
    'VIEWTYPECITYSKYLINE', 'VIEWTYPEMOUNTAIN', 'VIEWTYPERESIDENTIAL', 'VIEWTYPECITYSTREET',
    'VIEWTYPEINDUSTRIAL', 'VIEWTYPEPOWERLINE', 'VIEWTYPELIMITED',
    'QUALITYOFCONSTQ1', 'QUALITYOFCONSTQ2', 'QUALITYOFCONSTQ3', 'QUALITYOFCONSTQ4',
    'QUALITYOFCONSTQ5', 'QUALITYOFCONSTQ6',
    'CONDITIONC1', 'CONDITIONC2', 'CONDITIONC3', 'CONDITIONC4', 'CONDITIONC5', 'CONDITIONC6',
]

# Each table's list starts with its own id column, followed by the shared columns.
appr_cat_cols = ['SUBJ_APPR_ID', *_shared_cat_cols]
comp_cat_cols = ['UNIQUECOMPIDNEW', *_shared_cat_cols]

numerical_cols = [
    'TOTALRM', 'BDRM', 'BLGRDTOTALSQFT', 'BLGRDFINISHSQFT', 'BLGRDRECRM',
    'BLGRDBEDRM', 'BLGRDOTHERRM', 'GROSSLIVINGAREA', 'ACTUALAGE', 'FULL_BATH',
    'FULL_BLGRDBATHRM', 'HALF_BATH', 'HALF_BLGRDBATHRM', 'SITEAREASQFT',
]

## *Prepare data (scaling skipped — not needed for random forest):*

In [None]:
# RF does not need scaling:
# scaler = StandardScaler()

In [None]:
# Comparables without the excluded columns. No scaling is applied (the scaler
# is disabled above), so the "_scaled" suffix is only a leftover name.
comp_rf_scaled = comp_df.drop(columns=comp_excl_cols)

In [None]:
# Appraisals without the excluded columns. No scaling is applied (the scaler
# is disabled above), so the "_scaled" suffix is only a leftover name.
appr_rf_scaled = appr_df.drop(columns=appr_excl_cols)

In [None]:
# (rows, columns) of the appraisals table after dropping the excluded columns.
appr_rf_scaled.shape

## *Helper functions:*

In [None]:
def haversine_distance(long1, lat1, long2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified with latitude and longitude, in degrees).

    Note the argument order: longitude first, then latitude.

    Args:
        long1:  longitude from appraisal
        lat1:   latitude from appraisal
        long2:  longitude from comparable
        lat2:   latitude from comparable

    Returns:
        Distance in kilometers as a float (NaN if any input is NaN).
    """
    R = 6371 # radius km of the earth
    long1, lat1, long2, lat2 = map(radians, [long1, lat1, long2, lat2])
    dlong = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlong/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise "math domain error". The comparison
    # is False for NaN, so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0
    c =  2 * asin(sqrt(a))
    dist_km = R * c

    return dist_km

In [None]:
# def haversine_distance_quick(long1, lat1, long2, lat2):
#     R = 6371
#     x = (radians(long2) - radians(long1)) * cos(0.5* (radians(lat2) + radians(lat2)))
#     y = radians(lat2) - radians(lat1)
#     dist_km = R * sqrt(x*x + y*y)
    
#     return dist_km

In [None]:
# def haversine_distance_superquick(long1, lat1, long2, lat2):
#     x = lat2 - lat1
#     y = (long2 - long1) * cos((lat2 + lat1)* 0.00872664626)
#     dist_km = 111.319 * sqrt(x*x + y*y)
    
#     return dist_km

In [None]:
# long1 = float(appr_df.iloc[0]['APPRLONGITUDE'])
# lat1 = float(appr_df.iloc[0]['APPRLATITUDE'])

# comp_df['DISTTOAPPR'] = comp_df.apply(lambda x: haversine_distance(long1, lat1, float(x['APPRLONGITUDE']), float(x['APPRLATITUDE'])), axis=1)

In [None]:
# comp_df['DISTTOAPPR']

In [None]:
# Drop the test created columns:
# comp_df.drop(['DISTTOAPPR'], axis=1, inplace=True)

## *Random forest classifier:*

In [None]:
# Feature names, in the order the importance table lists them: the LOC*, VIEW*,
# QUALITYOFCONST* and CONDITION* columns, the numeric attributes, and finally
# the two features derived per appraisal in the loop below.
cols = (
    [
        'LOCRTGNEUTRAL', 'LOCRTGBENEFICIAL', 'LOCRTGADVERSE', 'LOCRESIDENTIAL',
        'LOCINDUSTRIAL', 'LOCCOMMERCIAL', 'LOCBUSYROAD', 'LOCWATERFRONT',
        'LOCGOLFCOURSE', 'LOCADJPARK', 'LOCADJPOWERLINE', 'LOCLANDFILL',
        'LOCPUBLICTRAN',
    ]
    + [
        'VIEWRTGNEUTRAL', 'VIEWRTGBENEFICIAL', 'VIEWRTGADVERSE', 'VIEWTYPEWATER',
        'VIEWTYPEPASTORAL', 'VIEWTYPEWOOD', 'VIEWTYPEPARK', 'VIEWTYPEGOLFCOURSE',
        'VIEWTYPECITYSKYLINE', 'VIEWTYPEMOUNTAIN', 'VIEWTYPERESIDENTIAL',
        'VIEWTYPECITYSTREET', 'VIEWTYPEINDUSTRIAL', 'VIEWTYPEPOWERLINE',
        'VIEWTYPELIMITED',
    ]
    + [
        'QUALITYOFCONSTQ1', 'QUALITYOFCONSTQ2', 'QUALITYOFCONSTQ3',
        'QUALITYOFCONSTQ4', 'QUALITYOFCONSTQ5', 'QUALITYOFCONSTQ6',
    ]
    + [
        'CONDITIONC1', 'CONDITIONC2', 'CONDITIONC3',
        'CONDITIONC4', 'CONDITIONC5', 'CONDITIONC6',
    ]
    + [
        'TOTALRM', 'BDRM', 'BLGRDTOTALSQFT', 'BLGRDFINISHSQFT', 'BLGRDRECRM',
        'BLGRDBEDRM', 'BLGRDOTHERRM', 'GROSSLIVINGAREA', 'ACTUALAGE',
        'FULL_BATH', 'FULL_BLGRDBATHRM', 'HALF_BATH', 'HALF_BLGRDBATHRM',
        'SITEAREASQFT',
    ]
    + ['DISTTOAPPR', 'TIMEDIFF']
)

In [None]:
%%time

rf_importances = pd.DataFrame(cols, columns=['Features'])
count = 0

for appr_id in appr_rf_scaled['SUBJ_APPR_ID']:
    # Filter the appraisals dataset:
    appr_df_temp = appr_rf_scaled.loc[appr_rf_scaled['SUBJ_APPR_ID'] == appr_id]
    long1 = appr_df_temp['APPRLONGITUDE']
    lat1 = appr_df_temp['APPRLATITUDE']
    appr_sale_date = appr_df_temp.loc[appr_df_temp['SUBJ_APPR_ID'] == appr_id,'SALEDATE'].values[0]
    
    # Work the comparables dataset:
    comp_df_temp = comp_rf_scaled.copy()
    comp_df_temp['DISTTOAPPR'] = comp_df_temp.apply(lambda x: haversine_distance(long1, lat1, float(x['APPRLONGITUDE']), float(x['APPRLATITUDE'])), axis=1)
    comp_df_temp['APPRSALEDATE'] = appr_sale_date
    comp_df_temp['TIMEDIFF'] = (appr_sale_date - comp_df_temp['SALEDATE']).dt.days
    comp_df_temp['SELECTED'] = 0
    comp_df_temp.loc[comp_df_temp['UNIQUECOMPIDNEW'].isin(dict_apprid_to_uniquecompidnew[appr_id]), 'SELECTED'] = 1
    
    # Filter the comparables dataset:
    comp_df_temp2 = comp_df_temp.loc[(comp_df_temp['TIMEDIFF'] >= 0) & (comp_df_temp['DISTTOAPPR'] <= 50)]
    
    # Prepare training dataset:
    y_train = comp_df_temp2['SELECTED']
    X_train = comp_df_temp2.drop(['UNIQUECOMPIDNEW', 'SALEDATE', 'APPRSALEDATE', 'SELECTED', 'APPRLATITUDE', 'APPRLONGITUDE'], axis=1)
    
    # Fit the random forest:
    rf = RandomForestClassifier(n_estimators=100, random_state=1, max_features=0.33)
    rf.fit(X_train, y_train)
    
    data = list(zip(rf.feature_names_in_, rf.feature_importances_))
    df_importances = pd.DataFrame(data, columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)
    df_importances = df_importances.assign(Ranking=range(len(df_importances)))
    df_importances_relevant = df_importances.copy()
    col_name = 'Ranking_' + str(count)
    df_importances_relevant.rename(columns={"Ranking": col_name}, inplace = True)
    df_importances_relevant.drop(['Feature','Importance'], axis=1, inplace=True)
    
    rf_importances = pd.concat([rf_importances, df_importances_relevant], axis=1)
    
    count = count+1

In [None]:
# Preview the per-appraisal rank columns collected by the loop above.
rf_importances.head()

In [None]:
# Sum only the per-appraisal rank columns. A bare sum(axis=1) would also try to
# add the string 'Features' column (a TypeError on pandas >= 2.0) and, when the
# cell is re-run, the previous 'TotalRanking' value.
ranking_cols = [name for name in rf_importances.columns if str(name).startswith('Ranking_')]
rf_importances["TotalRanking"] = rf_importances[ranking_cols].sum(axis=1)
rf_importances.head()

In [None]:
# Rank 0 is the most important feature for an appraisal, so the smallest total
# comes first.
final_rf_df = rf_importances.sort_values('TotalRanking')
final_rf_df.loc[:, ['Features', 'TotalRanking']].head(30)

In [None]:
# Keep the 30 features with the lowest total rank as the final result.
result = final_rf_df.loc[:, ['Features', 'TotalRanking']].iloc[:30]
result