In [None]:
import numpy as np  
import pandas as pd  

import matplotlib.pyplot as plt # For plots
from sklearn.model_selection import train_test_split
import seaborn as sns
import pickle
pd.set_option('display.max_columns', None)

## Read in dataframes

In [None]:
%%time
# Load the processed appraisal and comparables frames from local pickle files.
# NOTE(review): unpickling executes arbitrary code, so these files must come
# from a trusted source.
df_appraisal_in = pd.read_pickle("./OUT_dfs/df_appr_full_processed.pkl")
df_comparable_in = pd.read_pickle("./OUT_dfs/df_comp_full_processed.pkl")

# Quick sanity check on the size of each frame.
# NOTE(review): the first label below is misspelled ("Appriasal").
print("Appriasal Shape", df_appraisal_in.shape)
print("Comp Shape", df_comparable_in.shape)


### columns not migrated yet are:

special_columns = [ 'DESIGNSTYLE_SUBJ', 'HEATINGCOOLING_SUBJ', 'ENERGYEFF_SUBJ',
       'GARAGECARPORT_SUBJ', 'PORCHPATIODECK_SUBJ' ]


In [None]:
# Report every position at which the two frames' column names disagree.
# zip() stops at the shorter column list, so surplus trailing columns in the
# longer frame are not reported here.
column_pairs = zip(df_comparable_in.columns, df_appraisal_in.columns)
for comp_col, appr_col in column_pairs:
    if comp_col == appr_col:
        continue
    print(comp_col + "///" + appr_col)

# Work on copies so the frames loaded from disk stay untouched.
df_appr = df_appraisal_in.copy()
df_comp = df_comparable_in.copy()

In [None]:
# Quick fix for strings in Lat and Long: coerce both coordinate columns of
# both frames to numeric; values that cannot be parsed become NaN.
for frame in (df_comp, df_appr):
    for coord_col in ('APPRLATITUDE', 'APPRLONGITUDE'):
        frame[coord_col] = pd.to_numeric(frame[coord_col], errors='coerce')

In [None]:
# Print the column-by-column summary of the comparables frame.
# NOTE(review): per the pandas options docs, `max_info_rows` limits the frame
# size for which info() computes per-column null counts. If the goal was to
# list more columns, `display.max_info_columns` may be the option intended -
# confirm.
pd.set_option('max_info_rows', 200)
df_comp.info()


In [None]:
# Print how many columns of each dtype the two frames contain.
labelled_frames = (
    ("Comp data types\n", df_comp),
    ("\nAppr data types\n", df_appr),
)
for heading, frame in labelled_frames:
    print(heading)
    print(frame.dtypes.value_counts())


## Checking NA percentages

13-15% null and 0 values were identified in latitude and longitude. This is potentially an issue; need to double-check whether this is right. The remaining NA values are negligible in occurrence. We could remove all rows with any NaNs in them.

In [None]:

# Show every row of the summary (one row per column of the source frames).
pd.set_option('display.max_rows', None)

# NaN count per column, plus the same figure as a percentage string.
counts_a = df_appr.isna().sum()
percs_a = (counts_a / df_appr.shape[0]).mul(100).round(3).astype(str) + '%'

counts_c = df_comp.isna().sum()
percs_c = (counts_c / df_comp.shape[0]).mul(100).round(3).astype(str) + '%'

na_summary = pd.concat(
    [counts_a, percs_a, counts_c, percs_c],
    axis=1,
    keys=['na_count_a', 'na_perc_a', 'na_count_c', 'na_perc_c'],
)

# Sort on the numeric counts, not on the '%' strings: string comparison is
# lexicographic, so '9.0%' would rank above '13.0%'. Within one frame the
# count and the percentage give the same ordering.
na_summary.sort_values(['na_count_a', 'na_count_c'], ascending=False)

# Drop rows with NA values


In [None]:
def _drop_incomplete_rows(dataframe, required_columns):
    """Return the rows with no NaN in `required_columns` and no zero coordinate."""
    complete = dataframe.dropna(axis=0, subset=required_columns)
    # A coordinate of exactly 0 is treated as missing, so BOTH longitude and
    # latitude must be non-zero. (`.any` would keep a row in which only one of
    # the two coordinates is zero.)
    has_coordinates = (complete[['APPRLONGITUDE', 'APPRLATITUDE']] != 0).all(axis=1)
    return complete.loc[has_coordinates]


def remove_na_rows_from_both_dataframes(appaisal_dataframe, comparables_dataframe, required_comp_count=3):
    """Drop incomplete rows from both frames and keep only fully matched appraisals.

    A row is dropped when any required column is NaN or when its longitude or
    latitude is 0. After that, an appraisal is kept only if exactly
    `required_comp_count` comparables sharing its SUBJ_APPR_ID survived the
    cleaning, and a comparable is kept only if its appraisal was kept.

    Parameters
    ----------
    appaisal_dataframe : pandas.DataFrame
        Appraisal rows; must contain SUBJ_APPR_ID, SALEDATE and the shared
        required columns listed below.
    comparables_dataframe : pandas.DataFrame
        Comparable rows; must contain SUBJ_APPR_ID, COMPSALEDATE and the
        shared required columns listed below.
    required_comp_count : int, default 3
        Number of surviving comparables an appraisal needs in order to be kept.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Cleaned appraisal frame and cleaned comparables frame. Both are
        independent copies; the input frames are not modified.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    shared_required_columns = ['APPRLONGITUDE', 'APPRLATITUDE', "ACTUALAGE", "BDRM", "TOTALRM",
                               "SITEAREASQFT", "FULL_BATH", "HALF_BATH", "COUNTY", "GROSSLIVINGAREA"]

    # The two frames differ only in the name of their sale-date column.
    dfc = _drop_incomplete_rows(comparables_dataframe, shared_required_columns + ['COMPSALEDATE'])
    dfa = _drop_incomplete_rows(appaisal_dataframe, shared_required_columns + ['SALEDATE'])

    # Number of surviving comparables per appraisal (0 when there are none).
    # Kept as a local Series so no temporary column is written into a slice.
    comp_counts = dfa['SUBJ_APPR_ID'].map(dfc['SUBJ_APPR_ID'].value_counts()).fillna(0)

    # Keep only appraisals with the full set of comparables ...
    dfaa = dfa.loc[comp_counts == required_comp_count].copy()
    # ... and only the comparables whose appraisal is still present.
    dfcc = dfc.loc[dfc['SUBJ_APPR_ID'].isin(dfaa['SUBJ_APPR_ID'])].copy()

    print(dfaa.shape)
    print(dfcc.shape)
    return dfaa, dfcc

In [None]:
# Clean both frames together; the function prints the shape of each result.
df_appr_nona, df_comp_nona =  remove_na_rows_from_both_dataframes(df_appr, df_comp)

In [None]:
# Save the dataframes with no NA values to pickle files.
# First give the comparables frame a SALEDATE column matching the appraisal
# frame's column name. `assign` builds a new frame rather than writing a column
# into a frame that was produced by filtering, which avoids pandas'
# SettingWithCopyWarning.
df_comp_nona = df_comp_nona.assign(SALEDATE=df_comp_nona["COMPSALEDATE"])
df_appr_nona.to_pickle("./OUT_dfs/df_appr_full_processed_nona.pkl")
df_comp_nona.to_pickle("./OUT_dfs/df_comp_full_processed_nona.pkl")

## Quick comparison table of one record - specify appraisal ID

In [None]:
def get_comparables(aID, dfs=None, all_columns=False):
    """Build a side-by-side table of one appraisal and its comparables.

    Parameters
    ----------
    aID : scalar
        Value of SUBJ_APPR_ID to look up.
    dfs : sequence of two pandas.DataFrame, optional
        ``[appraisal_frame, comparables_frame]``. When omitted, the notebook's
        current ``df_appr_nona`` and ``df_comp_nona`` are used. They are looked
        up at call time, so the function never holds on to stale frames that
        were bound when the ``def`` cell ran.
    all_columns : bool, default False
        If False, fields whose value is 0 in every matching record are hidden.

    Returns
    -------
    pandas.DataFrame
        Transposed table: one row per original column, one column per matching
        record (appraisal rows first, then comparables).
    """
    if dfs is None:
        dfs = [df_appr_nona, df_comp_nona]
    appraisals, comparables = dfs[0], dfs[1]

    subject_rows = appraisals.loc[appraisals['SUBJ_APPR_ID'] == aID]
    comparable_rows = comparables.loc[comparables['SUBJ_APPR_ID'] == aID]
    side_by_side = pd.concat([subject_rows, comparable_rows], axis=0).transpose()

    if all_columns:
        return side_by_side
    # Keep only fields with at least one non-zero value (NaN counts as non-zero).
    return side_by_side.loc[(side_by_side != 0).any(axis=1)]

# Show appraisal 857135 next to its comparables, hiding fields that are 0 in every record.
get_comparables(857135,dfs= [df_appr_nona, df_comp_nona],all_columns=False)


## Quick comparison table of one record - specify appraisal ID

In [None]:
# NOTE(review): this cell repeats the `get_comparables` definition from the
# previous section, and this later definition is the one that stays in effect.
# One of the two cells can be removed.
def get_comparables(aID, dfs=None, all_columns=False):
    """Build a side-by-side table of one appraisal and its comparables.

    Parameters
    ----------
    aID : scalar
        Value of SUBJ_APPR_ID to look up.
    dfs : sequence of two pandas.DataFrame, optional
        ``[appraisal_frame, comparables_frame]``. When omitted, the notebook's
        current ``df_appr_nona`` and ``df_comp_nona`` are used. They are looked
        up at call time, so the function never holds on to stale frames that
        were bound when the ``def`` cell ran.
    all_columns : bool, default False
        If False, fields whose value is 0 in every matching record are hidden.

    Returns
    -------
    pandas.DataFrame
        Transposed table: one row per original column, one column per matching
        record (appraisal rows first, then comparables).
    """
    if dfs is None:
        dfs = [df_appr_nona, df_comp_nona]
    appraisals, comparables = dfs[0], dfs[1]

    subject_rows = appraisals.loc[appraisals['SUBJ_APPR_ID'] == aID]
    comparable_rows = comparables.loc[comparables['SUBJ_APPR_ID'] == aID]
    side_by_side = pd.concat([subject_rows, comparable_rows], axis=0).transpose()

    if all_columns:
        return side_by_side
    # Keep only fields with at least one non-zero value (NaN counts as non-zero).
    return side_by_side.loc[(side_by_side != 0).any(axis=1)]

# Show appraisal 857135 next to its comparables, hiding fields that are 0 in every record.
# NOTE(review): same call as in the previous section; this cell is a duplicate.
get_comparables(857135,dfs= [df_appr_nona, df_comp_nona],all_columns=False)


## Basic plotting

In [None]:
def my_countplot(col, df_list):
    """Draw one count plot of `col` for each dataframe in `df_list`.

    Bars follow the order of `value_counts()` for that frame, and each plot is
    titled with the column name and shown before the next one is drawn.
    """
    for frame in df_list:
        bar_order = frame[col].value_counts().index
        axes = sns.countplot(x=col, data=frame, order=bar_order)
        axes.set(title=col)
        plt.show()
    

In [None]:
# One STATE count plot per frame: appraisals first, then comparables.
my_countplot("STATE", [df_appr_nona, df_comp_nona])
    