In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os 
import sys
from datetime import datetime, timedelta
from modeling.utils import process_address
import warnings
import json 
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')
# Let DataFrame reprs show up to 50 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)

In [None]:
# Column-name configuration. Every name listed in `target`, `dense_features`,
# `cate`, `time_col` and `cate_multi` appears as a column of the reference frame
# `df` previewed at the end of this cell.

# Column holding the value stored as the target (presumably the label to model).
target = "price"
# Numeric columns.
dense_features = ["LON", 
                "LAT", 
                "building_sqft", 
                "Lot Size", 
                "Year Built", 
                "Garage Number", 
                "Bedrooms", 
                "Baths", 
                "Maintenance Fee", 
                "Tax Rate", 
                "Recent Market Value", 
                "Recent Tax Value"]
# Single-valued categorical columns.
cate = ["status", "Property Type", "County", "Private Pool", "Area Pool"]
# Date column; the preview below shows values formatted like "2023_11_05".
time_col = ["date"]
# Multi-valued categorical columns; the preview below shows list-like cells.
cate_multi = ["Foundation_multiclass", "Garage Types_multiclass", 
            "Roof Type_multiclass", "Pool_feature_multiclass", "floor_type_multiclass", 
            "finance_option_multiclass", "Exterior Type_multiclass", "Style_multiclass"]

# Additional school-related column names; these are not among the columns of
# the `df` preview below. (The identifier spelling "numerial" is kept as-is
# because other code may reference it.)
add_numerial_features = ["elementary_school_star", "middle_school_star", "high_school_star"]
add_cate_multi = ["school_org"] # ["school_names", "school_grades", "school_org"]

# Load the previously structured property table and preview two rows.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df = pd.read_pickle("data/property_structured.pkl")
df.head(2)

Unnamed: 0,address,LON,LAT,building_sqft,Lot Size,Year Built,Garage Number,Bedrooms,Baths,Maintenance Fee,Tax Rate,Recent Market Value,Recent Tax Value,status,Property Type,County,Private Pool,Area Pool,date,Foundation_multiclass,Garage Types_multiclass,Roof Type_multiclass,Pool_feature_multiclass,floor_type_multiclass,finance_option_multiclass,Exterior Type_multiclass,Style_multiclass,price
57,"4513 Refugio Dr, Plano, TX 75024",-96.782005,33.102204,3395.0,429.0,2023.0,2.0,4.0,3.5,65.083333,1.864,157533.0,157533.0,Under Contract - P,single family,Collin County,No,,2023_11_05,[Slab],[Attached],[composition],,"[carpet, ceramic tile, wood]",,"[brick, rock/stone]",[traditional],789990.0
8,"1516 Bay Area Blvd P12, Houston, TX 77058",-95.114098,29.562197,684.0,31348.0,1977.0,,1.0,1.0,272.0,2.444,69024.0,47052.0,Under Contract - PS,townhouse/condo,Harris County,No,Yes,2023_11_05,[Slab],,[composition],,"[tile, vinyl]","[cash, conventional, fha, investor, va]","[brick, stucco]",[traditional],79900.0


In [6]:
# Load the raw contracted-house listings and preview three rows. Each row
# carries a `json_path` column that later cells open with `read_json`.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
raw_df = pd.read_pickle("data/contracted_houses_11052023_11062024.pkl")
raw_df.head(3)

Unnamed: 0,address,harlink,mp_features,agent,price,posted_days,status,zipcode,num_beds,num_bath,num_half_bath,building_sqft,has_loft,num_stories,num_parking_space,address_key,date,json_path,time_epoch,image_path,json_size,image_cnt
154,"1120 Skyline Dr, Wimberley, TX 78676",https://www.har.com//homedetail/1120-skyline-d...,"4 bedrooms 2,454 Sqft. ($326/Sqft.) 2 full & 1...",Jeff Coffman Coffman Real Estate,799900.0,118.0,Under Contract - OP,78676,4.0,0.0,0.0,2454.0,0.0,0.0,0.0,"1120 Skyline Dr, Wimberley, TX 78676",2024_01_19,/home/user/DataCenter/HAR_data/address_fullinf...,1718768786,/home/user/DataCenter/HAR_data/house_images/00...,156,0
397,"100 Cedar Hill St, Georgetown, TX 78628",https://www.har.com//homedetail/100-cedar-hill...,"4 bedrooms 2,813 Sqft. ($208/Sqft.) 2 full & 1...",Kim Whitlock Keller Williams Realty-RR WC,585000.0,96.0,Under Contract - P,78628,4.0,0.0,0.0,2813.0,0.0,0.0,0.0,"100 Cedar Hill St, Georgetown, TX 78628",2024_01_31,/home/user/DataCenter/HAR_data/address_fullinf...,1720237951,/home/user/DataCenter/HAR_data/house_images/00...,156,0
45,"14610 Becurtesy Ct, Cypress, TX 77429",https://www.har.com//homedetail/14610-becurtes...,"3 bedrooms 1,568 Sqft. ($188/Sqft.) 2 full bat...",Michael Glenny Roofline Realtors,294900.0,19.0,Under Contract - OP,77429,3.0,0.0,0.0,1568.0,0.0,0.0,0.0,"14610 Becurtesy Ct, Cypress, TX 77429",2024_02_20,/home/user/DataCenter/HAR_data/address_fullinf...,1722136513,/home/user/DataCenter/HAR_data/house_images/00...,156,0


In [254]:
def read_json(json_path):
    """Load and return the parsed JSON document stored at ``json_path``.

    The file is opened explicitly as UTF-8 instead of with the platform's
    locale encoding: the parsers in this notebook look for non-ASCII text
    such as "(m²)", which would be mis-decoded under e.g. cp1252.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)

# Work on a random subset of the listings. The draw is seeded so that
# Restart & Run All reproduces the same rows, and the sample size is capped at
# the frame length so a smaller input frame does not raise.
sample_df = raw_df.sample(n=min(5000, len(raw_df)), random_state=42)
# Attach the parsed JSON document of every sampled listing.
sample_df["content"] = sample_df["json_path"].apply(read_json)

In [277]:
def get_lon_lat(lon_str, lat_str):
    """Convert a pair of coordinate values to floats.

    The two results are returned in the same order as the arguments.

    Args:
        lon_str: First coordinate value (anything ``float()`` accepts).
        lat_str: Second coordinate value (anything ``float()`` accepts).

    Returns:
        ``(float(lon_str), float(lat_str))``, or ``(nan, nan)`` when either
        value cannot be converted.
    """
    try:
        return float(lon_str), float(lat_str)
    # Only conversion failures mean "missing coordinate"; interrupts and
    # other unrelated errors are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return np.nan, np.nan

def parse_built_Sqft(content):
    """Extract the building area in square feet from a listing document.

    The text of ``content["house_features"]["Building Sqft.:"]`` before the
    first "(m²)" marker is treated as a square-foot figure immediately
    followed by its square-metre equivalent. Since the boundary between the
    two numbers is not marked, the trailing 3 and then the trailing 2
    characters are tried as the square-metre part; a split is accepted when
    converting the square-foot part to square metres lands within 5% of the
    square-metre part.

    Args:
        content: Parsed listing document (mapping with a "house_features"
            mapping inside).

    Returns:
        The square-foot figure as a float, or ``nan`` when the field is
        missing, is ``None``, or no split passes the consistency check.
    """
    try:
        raw_value = content["house_features"]["Building Sqft.:"]
    except (KeyError, TypeError):
        return np.nan
    if raw_value is None:
        return np.nan

    sqm_per_sqft = 0.092903
    tolerance = 0.05
    digits = str(raw_value).split("(m²)")[0]

    # Try the wider square-metre suffix first, as the original code did. A
    # width that cannot be parsed is skipped rather than aborting the search.
    for width in (3, 2):
        try:
            sqft = float(digits[:-width].replace(",", ""))
            sqm = float(digits[-width:])
            if abs(sqft * sqm_per_sqft - sqm) / sqm < tolerance:
                return sqft
        except (ValueError, ZeroDivisionError):
            continue
    return np.nan
    

def parse_ls1(ls1):
    """Parse the number that follows the unit label in a lot-size fragment.

    The fragment is expected to contain "Sqft." or "Acres"; the text after
    that label, with "(m²)" and thousands separators removed, is converted to
    a float (presumably the area in square metres, given the stripped "(m²)"
    marker). When both labels occur, the "Acres" split wins, as before.

    Args:
        ls1: Lot-size fragment; any value is stringified first.

    Returns:
        The parsed float, or ``nan`` when the value is NaN or contains
        neither unit label (previously this case raised UnboundLocalError).

    Raises:
        ValueError: If the text after the unit label is not a number.
    """
    text = str(ls1)
    if text == "nan":
        return np.nan

    area = np.nan
    # Later labels override earlier ones, matching the original if-sequence.
    for unit in ("Sqft.", "Acres"):
        if unit in text:
            area = float(text.split(unit)[1].replace("(m²)", "").replace(",", ""))
    return area


def parse_lot_size(content):
    """Split a listing's "Lot Size:" field into a parsed area and a remainder.

    The field is split at the first "/": the left part is handed to
    ``parse_ls1`` and the right part is returned untouched. Values with no
    slash or with several slashes no longer raise on unpacking; the remainder
    is then "" or everything after the first slash.

    Args:
        content: Parsed listing document (mapping with a "house_features"
            mapping inside).

    Returns:
        ``(area, remainder)``; ``(nan, "")`` when the field is missing or NaN.
    """
    try:
        lot_size = content["house_features"]["Lot Size:"]
    except (KeyError, TypeError):
        return np.nan, ""

    lot_size = str(lot_size)
    if lot_size == "nan":
        return np.nan, ""

    ls1, _, ls2 = lot_size.partition("/")
    return parse_ls1(ls1), ls2


def parse_built_year(content):
    """Parse a listing's "Year Built:" field into ``(year, source)``.

    The field is split at the first "/": the left part is converted to an
    int and the right part, stripped of surrounding whitespace, is returned as
    the source. The value 20119 is mapped to 2019 (a hard-coded correction
    carried over from the original code).

    Args:
        content: Parsed listing document (mapping with a "house_features"
            mapping inside).

    Returns:
        ``(year, source)``. ``(nan, nan)`` when the field is missing or NaN;
        ``(nan, source)`` when the year part is not an integer. A value
        without "/" yields an empty source instead of raising.
    """
    try:
        built_year = str(content["house_features"]["Year Built:"])
    except (KeyError, TypeError):
        return np.nan, np.nan

    if built_year == "nan":
        return np.nan, np.nan

    year_text, _, source = built_year.partition("/")
    try:
        year = int(year_text)
    except ValueError:
        return np.nan, source.strip()

    if year == 20119:
        year = 2019
    return year, source.strip()

def get_garage_num(content):
    """Read the garage count from a listing's "Garage(s):" field.

    Args:
        content: Parsed listing document (mapping with a "house_features"
            mapping inside).

    Returns:
        ``nan`` when the field is missing or ``None``; ``0`` when the value
        contains no "/"; otherwise the text before the first "/" as a float,
        or ``nan`` when that text is not a number.
    """
    try:
        garage_string = content["house_features"]["Garage(s):"]
    except (KeyError, TypeError):
        return np.nan
    if garage_string is None:
        return np.nan

    try:
        if "/" not in garage_string:
            return 0
        return float(garage_string.split("/")[0])
    # Non-string or non-numeric values count as missing; unrelated errors
    # such as KeyboardInterrupt now propagate.
    except (TypeError, ValueError, AttributeError):
        return np.nan

def parse_bedroom(content):
    """Return the bedroom count from a listing's "Bedrooms:" field.

    A plain value such as "3 Bedroom(s)" becomes ``3.0``. A dash-separated
    value is treated as a range: its mean is returned, unless the second
    number exceeds 100, in which case only the first number is returned.
    A missing field (or any lookup failure) yields ``nan``.
    """
    try:
        raw_value = content["house_features"]["Bedrooms:"]
    except:  # deliberately broad: any lookup failure means "missing"
        raw_value = np.nan

    text = str(raw_value)
    if text == "nan":
        return np.nan

    counts_text = text.replace("Bedroom(s)", "")
    if "-" not in text:
        return float(counts_text)

    counts = [float(piece) for piece in counts_text.split("-")]
    # An upper bound above 100 is not a plausible bedroom count; fall back to
    # the lower bound.
    return counts[0] if counts[1] > 100 else np.mean(counts)

def parse_bath(content):
    """Return the bath count from a listing's "Baths:" field.

    Full baths count as 1 and half baths as 0.5. The value, with its
    " Bath(s)" suffix removed, is split on "&"; a part containing "Half" is
    counted as half baths and any other part as full baths. This also handles
    a half-bath-only value (which previously raised on unpacking) and a
    "Half & Full" ordering (which previously swapped the two counts).

    Args:
        content: Parsed listing document (mapping with a "house_features"
            mapping inside).

    Returns:
        ``full + half / 2`` as a float, or ``nan`` when the field is missing
        or NaN.

    Raises:
        ValueError: If a part does not contain a number.
    """
    try:
        baths = content["house_features"]["Baths:"]
    except (KeyError, TypeError):
        return np.nan

    baths = str(baths)
    if baths == "nan":
        return np.nan

    num_full, num_half = 0.0, 0.0
    for part in baths.replace(" Bath(s)", "").split("&"):
        if "Half" in part:
            num_half += float(part.replace("Half", ""))
        else:
            num_full += float(part.replace("Full", ""))
    return num_full + num_half / 2.0

def parse_maintenance_fee(content):
    """Return a listing's maintenance fee expressed per month.

    The "Maintenance Fee:" text is matched case-insensitively:

    * any text containing "no" gives 0;
    * "yes" without a "/" gives ``nan``;
    * text containing "/", "$" and a period keyword takes the second-to-last
      "/"-separated piece as the amount and divides it by 12 ("annually"),
      1 ("month") or 3 ("quarter"); when several keywords match, the last one
      in that order wins.

    Results above 5000 are discarded as ``nan``. A missing field (or any
    lookup failure) yields ``nan``.
    """
    try:
        raw_fee = content["house_features"]["Maintenance Fee:"]
    except:  # deliberately broad: any lookup failure means "missing"
        raw_fee = np.nan

    fee_text = str(raw_fee)
    lowered = fee_text.lower()
    has_slash = "/" in fee_text
    has_dollar = "$" in fee_text

    fee = np.nan
    if "no" in lowered:
        fee = 0
    if "yes" in lowered and not has_slash:
        fee = np.nan

    for keyword, months in (("annually", 12), ("month", 1), ("quarter", 3)):
        if has_slash and has_dollar and keyword in lowered:
            amount = float(fee_text.split("/")[-2].replace("$", "").replace(",", ""))
            fee = amount if months == 1 else amount / months

    return np.nan if fee > 5000 else fee


def extract_tax_rate_tax_table(tax_table):
    """Find the "Total Tax Rate" entry in a listing's tax tables.

    ``tax_table`` is a sequence of tables, each a sequence of rows. The first
    row whose first cell contains "total tax rate" (case-insensitive) supplies
    the rate from its second cell: the "%" sign is dropped and the number is
    rounded to three decimals. Returns ``nan`` for an empty input or when no
    such row exists.
    """
    if len(tax_table) == 0:
        return np.nan

    needle = "total tax rate"
    all_rows = (row for table in tax_table for row in table)
    for row in all_rows:
        if needle in row[0].lower():
            rate_text = row[1].replace("%", "").strip()
            return round(float(rate_text), 3)
    return np.nan

def get_tax_rate(content):
    """Return the listing's "Tax Rate:" feature as a float, or ``nan`` if absent."""
    features = content["house_features"]
    if "Tax Rate:" not in features:
        return np.nan
    return float(features["Tax Rate:"])

def _parse_dollar_amount(text):
    """Convert text such as "$157,533" to an int (or a float if it has decimals)."""
    cleaned = text.replace("$", "").replace(",", "")
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def _latest_tax_table_value(content, column):
    """Return ``column`` from the highest-"Tax Year" row of the first tax table.

    The first table in ``content["house_tax_table"]`` is read with its first
    row as the header. Any failure (missing table, missing column, unparseable
    amount) yields ``nan``.
    """
    try:
        table1 = content["house_tax_table"][0]
        table_df = pd.DataFrame(table1[1:], columns=table1[0])
        # NOTE(review): "Tax Year" is sorted as stored; if the cells are
        # strings the ordering is lexicographic - confirm years are uniform.
        latest_row = table_df.sort_values("Tax Year", ascending=False).iloc[0]
        # The amount is scraped text, so it is parsed as a number rather than
        # passed to eval(), which would execute arbitrary expressions.
        return _parse_dollar_amount(latest_row[column])
    except Exception:
        return np.nan


def get_recent_market_value(content):
    """Return the most recent "Market Value" from the listing's tax table.

    Args:
        content: Parsed listing document with a "house_tax_table" entry.

    Returns:
        The amount as a number, or ``nan`` when it cannot be determined.
    """
    return _latest_tax_table_value(content, "Market Value")


def get_recent_tax_value(content):
    """Return the most recent "Tax Assessment" from the listing's tax table.

    Args:
        content: Parsed listing document with a "house_tax_table" entry.

    Returns:
        The amount as a number, or ``nan`` when it cannot be determined.
    """
    return _latest_tax_table_value(content, "Tax Assessment")


def parse_property_type(content):
    """Normalise a listing's "Property Type:" field to a lowercase label.

    The raw value is stringified and passed through a sequence of rewrite
    rules. The rules run in order and each one tests the *current* value, so
    a later rule can see (and override) the result of an earlier one. The
    final value is lowercased.

    Returns ``None`` when the field cannot be looked up.

    NOTE(review): a comparison output recorded elsewhere in this notebook
    shows labels like 'townhouse', 'residential/condo' and
    'residential/mobile home' next to 'townhouse/condo' and
    'residential/mobile' in the reference frame - confirm which label set is
    intended before relying on this mapping.
    """
    try: 
        property_type = content["house_features"]["Property Type:"]
    except:
        return None 

    property_type = str(property_type)
    # Exact match on one specific raw value, apparently truncated at the source.
    if property_type == "Country Homes/Acreage - Free Standi":
        property_type = "Country Homes/Acreage"
    # Case-sensitive substring rules.
    if "Multi-Family" in property_type:
        property_type = "Multi-Family"
    if "Single Family" in property_type:
        property_type = "Single Family"
    if "Single-Family" in property_type:
        property_type = "Single Family"
    # Case-insensitive rules requiring two keywords to be present together.
    if "townhouse" in property_type.lower() and "condo" in property_type.lower():
        property_type = "townhouse/condo"
    if "residential" in property_type.lower() and "mobile" in property_type.lower():
        property_type = "residential/mobile home"
    if "residential" in property_type.lower() and "condo" in property_type.lower():
        property_type = "residential/condo"
    if "residential" in property_type.lower() and "manufactured" in property_type.lower():
        property_type = "residential/manufactured"
    if "residential" in property_type.lower() and "townhouse" in property_type.lower():
        property_type = "townhouse"
    if "residential" in property_type.lower() and "lot" in property_type.lower():
        property_type = "residential/lot"
    # Collapse any remaining variant that mentions country homes/acreage.
    if "country homes/acreage" in property_type.lower():
        property_type = "country homes/acreage"
    return property_type.lower()

def private_pool_feature(content):
    """Return the listing's "Private Pool:" feature, or ``None`` if absent."""
    features = content["house_features"]
    key = "Private Pool:"
    if key not in features:
        return None
    return features[key]
    
def area_pool_feature(content):
    """Return the listing's "Area Pool:" feature, or ``None`` if absent."""
    features = content["house_features"]
    key = "Area Pool:"
    if key not in features:
        return None
    return features[key]

# Derive the structured feature columns from each listing's parsed JSON.

# get_lon_lat returns its two arguments in order, so LAT receives coords[0]
# and LON receives coords[1].
# NOTE(review): confirm that "coords" is stored as [lat, lon].
sample_df["LAT"], sample_df["LON"] = zip(*sample_df["content"].apply(lambda x: get_lon_lat(x["coords"][0], x["coords"][1])))

# Prefer the square footage parsed from the JSON and fall back to the value
# already on the row. The previous test `np.isnan(...) is False` was never
# true, because np.isnan returns a NumPy bool rather than the Python `False`
# singleton, so the parsed value was silently ignored.
sample_df["building_sqft2"] = sample_df["content"].apply(parse_built_Sqft)
sample_df["building_sqft"] = sample_df["building_sqft2"].fillna(sample_df["building_sqft"])

sample_df["Lot Size"] = sample_df["content"].apply(lambda x: parse_lot_size(x)[0])
sample_df["Year Built"] = sample_df["content"].apply(lambda x: parse_built_year(x)[0])
sample_df["Garage Number"] = sample_df["content"].apply(get_garage_num)
sample_df["Bedrooms"] = sample_df["content"].apply(parse_bedroom)
sample_df["Baths"] = sample_df["content"].apply(parse_bath)
sample_df["Maintenance Fee"] = sample_df["content"].apply(parse_maintenance_fee)

# Tax rate: take it from the tax table first, then fill the gaps from the
# "Tax Rate:" house feature.
sample_df["Tax Rate"] = sample_df["content"].apply(lambda x: extract_tax_rate_tax_table(x["house_tax_table"]))
sample_df["tax_rate2"] = sample_df["content"].apply(get_tax_rate)
sample_df["Tax Rate"] = sample_df["Tax Rate"].fillna(sample_df["tax_rate2"])

sample_df["Recent Market Value"] = sample_df["content"].apply(get_recent_market_value)
sample_df["Recent Tax Value"] = sample_df["content"].apply(get_recent_tax_value)
sample_df["Property Type"] = sample_df["content"].apply(parse_property_type)
sample_df["County"] = sample_df["content"].apply(lambda x: x["house_features"]["County:"] if "County:" in x["house_features"] else None)
sample_df["Private Pool"] = sample_df["content"].apply(private_pool_feature)
sample_df["Area Pool"] = sample_df["content"].apply(area_pool_feature)

In [276]:
# NOTE(review): this repeats the `area_pool_feature` definition from the
# feature-extraction cell; whichever cell runs last wins.
def area_pool_feature(content):
    """Return the listing's "Area Pool:" feature, or ``None`` if absent."""
    features = content["house_features"]
    key = "Area Pool:"
    if key not in features:
        return None
    return features[key]
# List the distinct "Area Pool:" values present in the sampled listings.
sample_df["content"].apply(lambda x: area_pool_feature(x)).unique()

array(['No', None, 'Yes'], dtype=object)

In [270]:
# Inner-join the freshly parsed sample with the reference frame `df` on
# "address". Columns coming from `df` that clash with sample columns get the
# "_r" suffix, so each parsed column can be compared with its reference.
test_df = sample_df.join(df.set_index("address"), on="address", how="inner", rsuffix="_r")

In [271]:
# Column to validate against its "_r" reference counterpart.
col = "County"
# Row mask of mismatches. Both sides are formatted to text before comparing,
# so two NaN values compare equal (a direct NaN != NaN comparison would flag
# them).
# NOTE(review): `filter` shadows the Python builtin; a later cell reads this
# name, so it is left unchanged here.
filter = test_df.apply(lambda x: "%s"%(x[col])  != "%s"%(x[col + "_r"]), axis=1)
# (number of mismatching rows, number of joined rows)
filter.sum(), len(test_df)

(0, 4344)

In [264]:
# test_df["tem"] = test_df["content"].apply(lambda x: x["house_features"]["Year Built:"] if "Year Built:" in x["house_features"] else None)
# Show five random mismatching rows for the column selected in `col`.
# NOTE(review): the output recorded below lists "Property Type" columns, so
# `col` was presumably "Property Type" when this cell last ran; `.sample(5)`
# raises when fewer than five rows mismatch.
test_df[filter][[col, col + "_r"]].sample(5)


Unnamed: 0,Property Type,Property Type_r
262,townhouse,townhouse/condo
178,residential/condo,townhouse/condo
247,residential/condo,townhouse/condo
61,townhouse,townhouse/condo
141,residential/mobile home,residential/mobile


In [266]:
# Gather every distinct key used in any sampled listing's "house_features"
# mapping, then report how many there are.
all_house_features = set().union(
    *(feat.keys() for feat in sample_df["content"].apply(lambda x: x["house_features"]))
)
len(all_house_features)

163

In [274]:
# Feature keys that mention "pool", matched case-insensitively.
[feature_name for feature_name in all_house_features if "pool" in feature_name.lower()]

['Area Pool:', 'Private Pool Desc:', 'Private Pool:']