In [None]:
import numpy as np  
import pandas as pd  

import matplotlib.pyplot as plt # For plots
from sklearn.model_selection import train_test_split
import seaborn as sns
import pickle
import datetime
import re
# Display settings: never truncate columns, and show at most 100 rows per frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [None]:
# Load the two input frames written by an earlier step (df_a is later cleaned
# as "appr", df_c as "comp").
# NOTE(review): unpickling can execute arbitrary code - only load pickles that
# were produced by a trusted source.
df_a, df_c = (
    pd.read_pickle("./OUT_dfs/df_a1.pkl"),
    pd.read_pickle("./OUT_dfs/df_c1.pkl"),
)

In [None]:
# Inspect every column name available on the df_a frame.
df_a.columns.to_numpy()

In [None]:
# ---------------------------------------------------------------------------
# Column groups used by clean_dataframe. They are defined once here so that the
# "comp" and "appr" outputs cannot drift apart.
# ---------------------------------------------------------------------------

# Address / sale / free-text descriptor columns copied straight to the output.
_LOCATION_AND_DESCRIPTION_COLS = [
    'ADDRESS1', 'CITY', 'STATE', 'ZIPCODE', 'COUNTY', 'SALEDATE',
    'APPRLATITUDE', 'APPRLONGITUDE',
    'DESIGNSTYLE', 'HEATINGCOOLING', 'ENERGYEFF', 'GARAGECARPORT', 'PORCHPATIODECK',
]

# Columns that exist only in the "appr" output (per-comparable link columns).
_APPRAISAL_COMP_COLS = [
    '_COMP1_LAT', '_COMP1_LON', '_COMP1_SALEDATE_x', '_COMP1_SALEDATE_y',
    '_COMP2_LAT', '_COMP2_LON', '_COMP2_SALEDATE_x', '_COMP2_SALEDATE_y',
    '_COMP3_LAT', '_COMP3_LON', '_COMP3_SALEDATE_x', '_COMP3_SALEDATE_y',
    '_COMP_1_DISTDAYS', '_COMP_1_DISTKM', '_COMP_2_DISTDAYS', '_COMP_2_DISTKM',
    '_COMP_3_DISTDAYS', '_COMP_3_DISTKM', '_COMP_DISTDAYS_AVG', '_COMP_DISTKM_AVG',
    '_COMP_ID1', '_COMP_ID2', '_COMP_ID3',
]

_SIZE_AND_ROOM_COLS = [
    'TOTALRM', 'BDRM', 'BLGRDTOTALSQFT', 'BLGRDFINISHSQFT', 'BLGRDRECRM',
    'BLGRDBEDRM', 'BLGRDOTHERRM',
    'GROSSLIVINGAREA', 'SITEAREASQFT', 'ACTUALAGE', 'FULL_BATH', 'FULL_BLGRDBATHRM',
    'HALF_BATH', 'HALF_BLGRDBATHRM',
    'Number_of_stories_no_imputation', 'AT', 'DT', 'SD',
]

_DESIGN_STYLE_COLS = [
    'DS_Bungalow', 'DS_Cabin', 'DS_Classical',
    'DS_Colonial', 'DS_Cottage', 'DS_Contemp', 'DS_Conventional',
    'DS_CapeCod', 'DS_Craftsman', 'DS_Duplex', 'DS_English',
    'DS_French', 'DS_Georgian', 'DS_Medit', 'DS_MidCentury',
    'DS_Mountain', 'DS_NeoEclect', 'DS_NewAmerican', 'DS_Patio',
    'DS_Ranch', 'DS_Rambler', 'DS_SantaBarb', 'DS_Spanish',
    'DS_SplitLevel', 'DS_SWest', 'DS_Territorial', 'DS_Townhouse',
    'DS_Trad', 'DS_Tudor', 'DS_Tuscan', 'DS_TwoStory', 'DS_Victorian',
]

# Location-type flags that are copied to the output.
_LOC_TYPE_COLS = [
    'LOCADJPARK', 'LOCADJPOWERLINE', 'LOCBUSYROAD', 'LOCCOMMERCIAL', 'LOCGOLFCOURSE',
    'LOCINDUSTRIAL', 'LOCLANDFILL', 'LOCPUBLICTRAN', 'LOCRESIDENTIAL',
]
# LOCWATERFRONT takes part in the completeness check but is not an output column.
_LOC_SUM_COLS = _LOC_TYPE_COLS + ['LOCWATERFRONT']

_VIEW_TYPE_COLS = [
    'VIEWTYPECITYSKYLINE',
    'VIEWTYPECITYSTREET', 'VIEWTYPEGOLFCOURSE', 'VIEWTYPEINDUSTRIAL',
    'VIEWTYPELIMITED', 'VIEWTYPEMOUNTAIN', 'VIEWTYPEPARK',
    'VIEWTYPEPASTORAL', 'VIEWTYPEPOWERLINE', 'VIEWTYPERESIDENTIAL',
    'VIEWTYPEWATER', 'VIEWTYPEWOOD',
]

_PROPERTY_FEATURE_COLS = (
    _SIZE_AND_ROOM_COLS + _DESIGN_STYLE_COLS + _LOC_TYPE_COLS + _VIEW_TYPE_COLS
)

_CONDITION_COLS = ['CONDITIONC1', 'CONDITIONC2', 'CONDITIONC3',
                   'CONDITIONC4', 'CONDITIONC5', 'CONDITIONC6']
_QUALITY_COLS = ['QUALITYOFCONSTQ1', 'QUALITYOFCONSTQ2', 'QUALITYOFCONSTQ3',
                 'QUALITYOFCONSTQ4', 'QUALITYOFCONSTQ5', 'QUALITYOFCONSTQ6']

# Rating flag -> ordinal score (adverse=1, neutral=2, beneficial=3).
_VIEW_RATING_SCORES = {'VIEWRTGNEUTRAL': 2, 'VIEWRTGBENEFICIAL': 3, 'VIEWRTGADVERSE': 1}
_LOC_RATING_SCORES = {'LOCRTGNEUTRAL': 2, 'LOCRTGBENEFICIAL': 3, 'LOCRTGADVERSE': 1}

# Columns that must be neither 0 nor NaN for a row to be kept.
# "_DS_SUM" is NOT computed in this function; it has to exist on the input.
_REQUIRED_NONZERO_COLS = ["_DS_SUM", "Number_of_stories_no_imputation", 'TOTALRM',
                          'BDRM', 'GROSSLIVINGAREA', 'FULL_BATH']

# Raw string: the same pattern as before, without invalid "\d" escapes.
_LEVEL_PATTERN = r"([-+]?\d*\.\d+|[-+]?\d+)"


def _level_from_flags(frame, flag_cols):
    """Return, per row, the number embedded in the name of the largest flag column."""
    # idxmax returns the first column on ties, so a row with no flag set resolves
    # to the first listed column (level 1). NOTE(review): confirm that is intended.
    winning_col = frame[flag_cols].idxmax(axis=1)
    return winning_col.str.extract(_LEVEL_PATTERN, expand=False).astype(float)


def _rating_from_flags(frame, scores):
    """Return, per row, the score mapped from the largest rating flag column."""
    return frame[list(scores)].idxmax(axis=1).map(scores)


def clean_dataframe(df, comp_or_appr):
    '''
    Select the modelling columns from ``df``, collapse the flag groups into single
    ordinal columns and drop rows with incomplete data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean (df_a or df_c). It is not modified. It must already contain
        every column selected below, including a precomputed "_DS_SUM".
    comp_or_appr : str
        "comp" selects the comparable layout (id column 'UNIQUECOMPIDNEW').
        Any other value (conventionally "appr") selects the appraisal layout
        (id column 'SUBJ_APPR_ID' plus the '_COMP*' link columns).

    Returns
    -------
    pandas.DataFrame
        The selected columns followed by 'CONDITION_ALL', 'QUALITYOFCONST_ALL',
        'VIEWRTG_ALL' and 'LOCRTG_ALL', restricted to rows where none of the
        required columns or flag-group sums is 0 or NaN. The input index is kept.

    Raises
    ------
    KeyError
        If ``df`` lacks any of the required columns.
    '''
    # Use the frame that was passed in. The previous version ignored ``df`` and
    # read the notebook globals df_c / df_a instead.
    if comp_or_appr == "comp":
        id_col, link_cols = 'UNIQUECOMPIDNEW', []
    else:
        id_col, link_cols = 'SUBJ_APPR_ID', _APPRAISAL_COMP_COLS

    selected = [id_col] + _LOCATION_AND_DESCRIPTION_COLS + link_cols + _PROPERTY_FEATURE_COLS
    # Explicit copy so the column assignments below never write into a view of
    # ``df`` (avoids SettingWithCopyWarning).
    df_out = df[selected].copy()

    # Merge condition / quality flags into one column each (1-6).
    df_out['CONDITION_ALL'] = _level_from_flags(df, _CONDITION_COLS)
    df_out['QUALITYOFCONST_ALL'] = _level_from_flags(df, _QUALITY_COLS)

    # Merge view / location rating flags into one column each (1-3).
    df_out['VIEWRTG_ALL'] = _rating_from_flags(df, _VIEW_RATING_SCORES)
    df_out['LOCRTG_ALL'] = _rating_from_flags(df, _LOC_RATING_SCORES)

    # Remove rows with incomplete data: a required column that is 0/NaN, or a
    # flag group in which no flag is set (row sum of 0).
    missing_markers = [0, 0.0, np.nan]
    incomplete = df[_REQUIRED_NONZERO_COLS].isin(missing_markers).any(axis=1).to_numpy()
    flag_groups = (
        list(_LOC_RATING_SCORES),   # "_LOC_TYPE_SUM"
        _LOC_SUM_COLS,              # "_LOC_SUM"
        list(_VIEW_RATING_SCORES),  # "_VIEW_TYPE_SUM"
        _VIEW_TYPE_COLS,            # "_VIEW_SUM"
    )
    for group_cols in flag_groups:
        group_empty = df[group_cols].sum(axis=1).isin(missing_markers).to_numpy()
        incomplete = incomplete | group_empty

    # Positional mask: df_out has exactly the rows of df in the same order, so
    # this stays correct even if the index contains duplicate labels.
    return df_out.loc[~incomplete]

In [None]:
# Run the shared cleaning routine once per frame: comparable layout for df_c,
# appraisal layout for df_a.
df_c2 = clean_dataframe(df=df_c, comp_or_appr="comp")
df_a2 = clean_dataframe(df=df_a, comp_or_appr="appr")

In [None]:
# Keep only the cleaned comparables whose id is referenced by one of the three
# comp-id columns of the cleaned appraisals.
print(df_c.shape, df_c2.shape)

appraisal_comp_ids = df_a2[['_COMP_ID1','_COMP_ID2', '_COMP_ID3']].values
unique_comps_from_appraisals = np.unique(appraisal_comp_ids)

is_referenced = df_c2["UNIQUECOMPIDNEW"].isin(unique_comps_from_appraisals)
df_c2a = df_c2[is_referenced]
df_c2a.shape

In [None]:
# Drop appraisals for which any of the three comparables is missing from the
# filtered comparables frame (df_c2a).
print(df_a2.shape)

comp_id_cols = ['_COMP_ID1','_COMP_ID2', '_COMP_ID3']

# Helper frame: the three comp ids plus one "is present in df_c2a" flag per id.
dfga = pd.DataFrame()
dfga[comp_id_cols] = df_a2[comp_id_cols]

my_list = np.unique(df_c2a[['UNIQUECOMPIDNEW']].values)

dfga["_ID_1_INC"] = dfga["_COMP_ID1"].isin(my_list)
dfga["_ID_2_INC"] = dfga["_COMP_ID2"].isin(my_list)
dfga["_ID_3_INC"] = dfga["_COMP_ID3"].isin(my_list)

all_three_present = dfga._ID_1_INC & dfga._ID_2_INC & dfga._ID_3_INC
df_a2 = df_a2.loc[all_three_present]
df_a2.shape

In [None]:
# Persist the filtered comparables and appraisals for the next stage.
df_c2a.to_pickle(path="./OUT_dfs/df_c2.pkl")
df_a2.to_pickle(path="./OUT_dfs/df_a2.pkl")