In [None]:
# Libraries
import pickle
import pandas as pd
import numpy as np

from math import sqrt, cos, sin, asin, radians

from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from datetime import datetime
from datetime import date

import matplotlib.pyplot as plt

# Import the processed data pickles:

In [None]:
# appr = pd.read_pickle("./OUT_dfs/df_appr_full_processed.pkl")
# comp = pd.read_pickle("./OUT_dfs/df_comp_full_processed.pkl")

In [None]:
# Load the datasets:
df_appr_nona = pd.read_pickle("./OUT_dfs/df_appr_full_processed_nona.pkl")
df_comp_unique = pd.read_pickle("./OUT_dfs/uniqueComps.pkl")

# Dictionary of appraisal: keyed by appraisal id (SUBJ_APPR_ID); each value holds the
# UNIQUECOMPIDNEW ids of the comparables linked to that appraisal.
# NOTE(security): unpickling can execute arbitrary code -- only load files produced by this project.
# The `with` block guarantees the file handle is closed once the dictionary is loaded.
with open("./OUT_dfs/dict_apprid_to_uniquecompidnew", "rb") as dict_file:
    dict_apprid_to_uniquecompidnew = pickle.load(dict_file)

In [None]:
# Spot-check the mapping: display the comparable ids stored for one appraisal id.
# NOTE(review): the literal id is assumed to exist in the dictionary -- a missing key raises KeyError.
test_appr = 856775
dict_apprid_to_uniquecompidnew[test_appr]

In [None]:
# Create copies of loadings:
# Work on copies so the frames loaded from disk stay unmodified.
appr = df_appr_nona.copy()
comp = df_comp_unique.copy()

# Drop rows that have null values:

In [None]:
# Helper function to drop any NaN in the defined columns:

def remove_nan(a_df, c_df):
    """Drop rows lacking a longitude, latitude or sale date from both frames.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Appraisals frame.
    c_df : pandas.DataFrame
        Comparables frame.

    Returns
    -------
    tuple
        ``(a_full, c_full)``: new frames holding only the rows where all three
        required columns are present. The inputs are not modified.
    """
    # Both frames are filtered on the same three columns.
    required = ['APPRLONGITUDE', 'APPRLATITUDE', 'SALEDATE']
    return tuple(frame.dropna(subset=required) for frame in (a_df, c_df))

In [None]:
# Keep only rows that have coordinates and a sale date in both frames.
appr_no_nan, comp_no_nan = remove_nan(appr, comp)

# Dataset for training:

In [None]:
def full_or_sample_dataset(a_df, c_df, sub_sample=None, state=None, random_state=None):
    """Select the appraisals to work with and the comparables linked to them.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Appraisals. Must contain 'SUBJ_APPR_ID' and, when ``state`` is given, 'STATE'.
    c_df : pandas.DataFrame
        Comparables. Must contain 'UNIQUECOMPIDNEW'.
    sub_sample : int, optional
        Number of appraisals to draw at random. ``None`` keeps every appraisal
        (of the requested state, if any).
    state : str, optional
        Keep only appraisals whose 'STATE' equals this value. ``None`` keeps all states.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a draw can be reproduced. The default
        (``None``) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(a_train, c_train)``: the selected appraisals, and the rows of ``c_df`` whose
        'UNIQUECOMPIDNEW' is linked to at least one selected appraisal (one row per id,
        in ``c_df`` order).

    Raises
    ------
    KeyError
        If a selected appraisal id is missing from ``dict_apprid_to_uniquecompidnew``.

    Notes
    -----
    Reads the notebook-level ``dict_apprid_to_uniquecompidnew`` mapping.
    """
    if sub_sample is None and state is None:
        a_train = a_df.copy() # computationally expensive
        print('Full dataset loaded successfully!')
    elif sub_sample is None:
        # Filter on the `state` argument (an undefined name was referenced here before).
        a_train = a_df[a_df['STATE'] == state]
        print('Subset from state ' + state + ' loaded successfully!')
    elif state is None:
        a_train = a_df.sample(n=sub_sample, random_state=random_state)
        print('Subset of n = ' + str(sub_sample) + ' loaded successfully!')
    else:
        a_temp = a_df[a_df['STATE'] == state]
        a_train = a_temp.sample(n=sub_sample, random_state=random_state)
        print('Subset of state' + state + ' with n = ' + str(sub_sample) +' loaded successfully!')

    # Gather every linked comparable id first, then filter c_df once. This replaces a
    # per-appraisal concat that re-copied the growing frame on every iteration.
    wanted_comp_ids = set()
    for appr_id in a_train['SUBJ_APPR_ID']:
        wanted_comp_ids.update(dict_apprid_to_uniquecompidnew[appr_id])

    # Optional experiment kept from the original: also mix in unrelated comparables, e.g.
    #     c_train = pd.concat([c_train, c_df.sample(n=2000)])
    c_train = (
        c_df[c_df['UNIQUECOMPIDNEW'].isin(wanted_comp_ids)]
        .drop_duplicates(subset=['UNIQUECOMPIDNEW'], keep='first')
    )

    print('Full comparables dataset loaded successfully!')
    print('')
    print('Number of rows in appraisals dataset: ', a_train.shape[0])
    print('Number of rows in unique comparables dataset: ', c_train.shape[0])

    return a_train, c_train

In [None]:
%%time

# Draw 10,000 appraisals across all states plus their linked comparables.
# No seed is passed, so each run draws a different random sample.
appr_df, comp_df  = full_or_sample_dataset(appr_no_nan, comp_no_nan, sub_sample=10000, state=None)

# **MODELS FOR FEATURE IMPORTANCE**

In [None]:
# Columns categorization:
# Identifier / address columns of the appraisal frame; the same labels are dropped before PCA.
appr_excl_cols = ['SUBJ_APPR_ID', 'COMPNUM', 
                  'ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY', 'COMPSALEDATE']

# Same list for the comparables frame, plus its unique comparable id.
comp_excl_cols = ['SUBJ_APPR_ID', 'COMPNUM', 'UNIQUECOMPIDNEW', 
                  'ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY', 'COMPSALEDATE']

# The three columns remove_nan requires to be non-null.
# NOTE(review): presumably inputs to distance / recency calculations -- confirm.
calc_cols = ['APPRLATITUDE', 'APPRLONGITUDE', 'SALEDATE']

# Location, view, construction-quality and condition columns.
# NOTE(review): presumably 0/1 indicator flags -- confirm against the processed data.
categorical_cols = ['LOCRTGNEUTRAL', 'LOCRTGBENEFICIAL', 'LOCRTGADVERSE', 'LOCRESIDENTIAL', 
                    'LOCINDUSTRIAL', 'LOCCOMMERCIAL', 'LOCBUSYROAD', 'LOCWATERFRONT', 'LOCGOLFCOURSE', 
                    'LOCADJPARK', 'LOCADJPOWERLINE', 'LOCLANDFILL', 'LOCPUBLICTRAN', 
                    'VIEWRTGNEUTRAL', 'VIEWRTGBENEFICIAL', 'VIEWRTGADVERSE', 'VIEWTYPEWATER', 
                    'VIEWTYPEPASTORAL', 'VIEWTYPEWOOD', 'VIEWTYPEPARK', 'VIEWTYPEGOLFCOURSE', 
                    'VIEWTYPECITYSKYLINE', 'VIEWTYPEMOUNTAIN', 'VIEWTYPERESIDENTIAL', 'VIEWTYPECITYSTREET', 
                    'VIEWTYPEINDUSTRIAL', 'VIEWTYPEPOWERLINE', 'VIEWTYPELIMITED', 
                    'QUALITYOFCONSTQ1', 'QUALITYOFCONSTQ2', 'QUALITYOFCONSTQ3', 'QUALITYOFCONSTQ4', 
                    'QUALITYOFCONSTQ5', 'QUALITYOFCONSTQ6', 
                    'CONDITIONC1', 'CONDITIONC2', 'CONDITIONC3', 'CONDITIONC4', 'CONDITIONC5', 'CONDITIONC6']

# Numeric features: the same columns that are standardized before PCA.
numerical_cols = ['TOTALRM', 'BDRM', 'BLGRDTOTALSQFT', 'BLGRDFINISHSQFT', 'BLGRDRECRM', 
                  'BLGRDBEDRM', 'BLGRDOTHERRM', 'GROSSLIVINGAREA', 'ACTUALAGE', 'FULL_BATH', 
                  'FULL_BLGRDBATHRM', 'HALF_BATH', 'HALF_BLGRDBATHRM', 'SITEAREASQFT']

## *Scale data:*

In [None]:
# Standardizer that is fitted on the numeric feature columns before PCA.
scaler = StandardScaler()

## *Principal Component Analysis:*

In [None]:
# Stack appraisals and comparables into one frame with a fresh 0..n-1 index;
# sort=False leaves the column order unsorted when the two frames' columns differ.
full_df = pd.concat([appr_df, comp_df], ignore_index=True, sort=False)

In [None]:
# Standardize the numeric feature columns on a copy, leaving full_df untouched.
# numerical_cols (defined with the column categorization) is exactly the list that was
# previously written out twice here, so the scaled columns cannot drift from it.
full_scaled = full_df.copy()
full_scaled[numerical_cols] = scaler.fit_transform(full_scaled[numerical_cols])

In [None]:
# Inspect the available columns before the non-feature ones are dropped.
full_scaled.columns

In [None]:
# Remove identifier, address and coordinate columns, which are not PCA features.
# 'COMPSALEDATE' used to be listed twice; each label now appears once. Rebinding the
# name instead of mutating in place gives the same resulting frame.
full_scaled = full_scaled.drop(
    columns=['SUBJ_APPR_ID', 'COMPNUM', 'ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY',
             'COMPSALEDATE', 'APPRLATITUDE', 'APPRLONGITUDE']
)

In [None]:
# The sale date and the comparable id are not PCA features either.
full_scaled = full_scaled.drop(columns=['SALEDATE', 'UNIQUECOMPIDNEW'])

In [None]:
# Confirm which columns remain as PCA inputs.
full_scaled.columns

In [None]:
# Fit PCA on the remaining columns; no n_components is given, so scikit-learn's default applies.
pca = PCA().fit(full_scaled)

In [None]:
# Cumulative share of variance explained, plotted against the NUMBER of components kept
# (1..n). Plotting against the 0-based index made the marker at x=10 correspond to
# 11 components, and the tick range was hard-coded to 55.
cum_explained = pca.explained_variance_ratio_.cumsum()
component_counts = np.arange(1, len(cum_explained) + 1)

plt.plot(component_counts, cum_explained, lw=2, color='darkgreen')
plt.title('Cumulative explained variance by number of principal components', size=10)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio')
plt.xticks(component_counts, size=5, rotation='vertical')
plt.axvline(10, 0, 1)  # marks the first 10 components
plt.show()

In [None]:
# Unscaled counterpart of the PCA input: full_df minus identifier, address,
# coordinate and sale-date columns. drop() returns a new frame, so full_df is untouched.
full_not_scaled = full_df.drop(
    columns=['SUBJ_APPR_ID', 'COMPNUM','ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY', 'COMPSALEDATE',
             'APPRLATITUDE', 'APPRLONGITUDE', 'SALEDATE']
)

In [None]:
# Drop the comparable id as well, leaving only feature columns.
full_not_scaled = full_not_scaled.drop(columns=['UNIQUECOMPIDNEW'])

In [None]:
# Loading matrix: each component's direction scaled by the square root of its explained
# variance. Rows are the input features, columns are the principal components.
# Row labels come from full_scaled, the frame the PCA was fitted on, so they cannot
# drift from the fitted feature order. The column count comes from the fitted model
# rather than assuming one component per feature.
pca_loadings = pd.DataFrame(
    data=pca.components_.T * np.sqrt(pca.explained_variance_),
    columns=[f'PC{i}' for i in range(1, pca.n_components_ + 1)],
    index=full_scaled.columns
)

In [None]:
# Rank the attributes by their loading on the first principal component.
pc1_loadings = (
    pca_loadings
    .sort_values(by='PC1', ascending=False)[['PC1']]
    .reset_index()
)
pc1_loadings.columns = ['Attribute', 'CorrWithPC1']

# One bar per attribute, highest loading first.
plt.bar(
    x=pc1_loadings['Attribute'],
    height=pc1_loadings['CorrWithPC1'],
    color='green',
)
plt.title('PCA loading scores (first principal component)', size=10)
plt.xticks(rotation='vertical', size=5)
plt.show()

In [None]:
# Rank the attributes by their loading on the second principal component.
pc2_loadings = (
    pca_loadings
    .sort_values(by='PC2', ascending=False)[['PC2']]
    .reset_index()
)
pc2_loadings.columns = ['Attribute', 'CorrWithPC2']

# One bar per attribute, highest loading first.
plt.bar(
    x=pc2_loadings['Attribute'],
    height=pc2_loadings['CorrWithPC2'],
    color='green',
)
plt.title('PCA loading scores (second principal component)', size=10)
plt.xticks(rotation='vertical', size=5)
plt.show()

In [None]:
# Take top 10 principal components that explain approx. 90% of the variance in the dataset:
# For each, build a one-column frame of loadings sorted from highest to lowest.
pc_df_list = []
for i in range(10):
    pc, corr = f'PC{i + 1}', f'CorrWithPC{i + 1}'
    pc_i_loadings = (
        pca_loadings
        .sort_values(by=pc, ascending=False)[[pc]]
        .rename(columns={pc: corr})
    )
    pc_df_list.append(pc_i_loadings)

In [None]:
# Join the per-component frames side by side; rows are aligned on the attribute index.
pc_df = pd.concat(pc_df_list, axis="columns")
pc_df.tail()

In [None]:
# Convert correlations into absolute values (does not matter if positive or negative)
pc_df = pc_df.abs()
pc_df.tail()

In [None]:
# Rebuild the per-attribute total from scratch: any existing 'TotalCorrelation' column
# is removed first so that re-running this cell does not add the old total into the new one.
pc_df = pc_df.drop(columns=['TotalCorrelation'], errors='ignore')
pc_df = pc_df.assign(TotalCorrelation=pc_df.sum(axis=1))
pc_df.tail()

In [None]:
# Rank attributes by the summed absolute loading, largest first.
final_pc_df = pc_df.sort_values(by=['TotalCorrelation'], ascending=False)
final_pc_df.head()

In [None]:
# Top 20 features:
n_var = 20
result = final_pc_df['TotalCorrelation'][:n_var]
result

In [None]:
# pca_features = list(result.index)
# pca_features.append('APPRLONGITUDE')
# pca_features.append('APPRLATITUDE')
# pca_features.append('SALEDATE')
# pca_features

In [None]:
# Create copies of loadings:
# pca_feat_appr = df_appr_nona.copy()
# pca_feat_appr = pca_feat_appr[pca_features]
# pca_feat_comp = df_comp_unique.copy()
# pca_feat_comp = pca_feat_comp[pca_features]

In [None]:
# pca_feat_appr.to_pickle("./OUT_dfs/pca_feat_appr.pkl")
# pca_feat_comp.to_pickle("./OUT_dfs/pca_feat_comp.pkl")