In [43]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os 
import sys
from datetime import datetime, timedelta
from modeling.utils import process_address
import warnings
# NOTE: this silences every warning category for the whole kernel session,
# including pandas' own diagnostics.
warnings.filterwarnings(action="ignore")

# Let wide/long frames render without truncation in cell outputs.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 500

## Select the time window to extract house information

In [51]:
daily_scrpe_folder = '/home/user/DataCenter/HAR_data/combined_pickles/'
start_date = datetime(2023, 11, 5)
end_date = datetime(2024, 11, 6)


def load_contracted_listings(pickle_path, date_str):
    """Load one daily scrape and keep only the listings that are under contract.

    Parameters
    ----------
    pickle_path : str
        Path of the daily ``zipcode_search__<date>_for_sale.pkl`` file.
    date_str : str
        Scrape date (``%Y_%m_%d``) stored in the ``date`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``status`` contains "contract" (case-insensitive), with an
        ``address_key`` column (``process_address`` applied to ``address``) and
        a ``date`` column appended.

    Raises
    ------
    FileNotFoundError
        If ``pickle_path`` does not exist.
    """
    # NOTE(review): unpickling executes arbitrary code; only read trusted files.
    listings = pd.read_pickle(pickle_path)
    # na=False: rows with a missing status are simply not "under contract".
    # (Calling .lower() on a NaN used to raise and discard the entire day.)
    is_contracted = listings["status"].str.contains(
        "contract", case=False, regex=False, na=False
    )
    # .assign returns a new frame, so nothing is written into a filtered view.
    return listings.loc[is_contracted].assign(
        address_key=lambda frame: frame["address"].apply(process_address),
        date=date_str,
    )


daily_df = None
skipped_days = {}  # date string -> reason the day could not be used

for day in tqdm(range((end_date - start_date).days + 1)):
    cur_date = start_date + timedelta(days=day)
    date_str = cur_date.strftime("%Y_%m_%d")
    daily_pickle = os.path.join(
        daily_scrpe_folder, "zipcode_search__" + date_str + "_for_sale.pkl"
    )
    try:
        tem_df = load_contracted_listings(daily_pickle, date_str)
    except FileNotFoundError:
        skipped_days[date_str] = "pickle not found"
        continue
    except Exception as exc:
        # Stay best-effort across the date range, but keep a record of what
        # went wrong instead of swallowing it.
        skipped_days[date_str] = f"{type(exc).__name__}: {exc}"
        continue

    # De-duplicate after every day (latest scrape of an address wins) so the
    # accumulated frame stays bounded by the number of distinct addresses.
    frames = [tem_df] if daily_df is None else [daily_df, tem_df]
    daily_df = pd.concat(frames, axis=0).drop_duplicates("address", keep="last")

if skipped_days:
    print(f"Skipped {len(skipped_days)} day(s):")
    for skipped_date, reason in skipped_days.items():
        print(f"  {skipped_date}: {reason}")

if daily_df is None:
    raise FileNotFoundError(
        f"No usable daily pickle found in {daily_scrpe_folder!r} between "
        f"{start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}"
    )

  0%|                                                                                                                                                    | 0/368 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 368/368 [04:02<00:00,  1.52it/s]


In [52]:
# Row count of the accumulated under-contract listings, then a two-row preview.
print(daily_df.shape[0])
daily_df.head(n=2)

260434


Unnamed: 0,address,harlink,mp_features,agent,price,posted_days,status,zipcode,num_beds,num_bath,num_half_bath,building_sqft,has_loft,num_stories,num_parking_space,address_key,date
57,"4513 Refugio Dr, Plano, TX 75024",https://www.har.com//homedetail/4513-refugio-d...,"4 bedrooms 3,395 Sqft. ($233/Sqft.) 3 full & 1...",Karla Davis Fathom Realty,789990.0,66.0,Under Contract - P,75024,4.0,0.0,0.0,3395.0,0.0,0.0,0.0,"4513 Refugio Dr, Plano, TX 75024",2023_11_05
8,"1516 Bay Area Blvd P12, Houston, TX 77058",https://www.har.com//homedetail/1516-bay-area-...,1 bedrooms 684 Sqft. ($117/Sqft.) 1 full baths...,Laurie Long RE/MAX Space Center,79900.0,79.0,Under Contract - PS,77058,1.0,0.0,0.0,684.0,0.0,0.0,0.0,"1516 Bay Area Blvd P12, Houston, TX 77058",2023_11_05


## Append full house json file 

In [62]:
# Index every JSON file in the full-info folder and keep one path per address.
json_folder = "/home/user/DataCenter/HAR_data/address_fullinfo_for_sale/"
all_files = [
    os.path.join(json_folder, file_name)
    for file_name in os.listdir(json_folder)
    if file_name.endswith(".json")
]
json_path_df = pd.DataFrame(all_files, columns=["json_path"])

# The slices below treat the last 16 characters of the file name as
# <1 separator char><10-char epoch>.json; everything before that is the address.
json_file_names = json_path_df["json_path"].str.rsplit("/", n=1).str[-1]
json_path_df["address"] = json_file_names.str[:-16]
json_path_df["time_epoch"] = json_file_names.str[-15:-5]
print(json_path_df.shape[0])

# time_epoch is a string, so this sort is lexicographic; it matches numeric
# order only while every epoch has the same number of digits.
json_path_df = (
    json_path_df
    .sort_values(by="time_epoch")
    .drop_duplicates(subset="address", keep="last")
)
print(json_path_df.shape[0])
json_path_df.sample(n=3)

1687741
1322213


Unnamed: 0,json_path,address,time_epoch
1420123,/home/user/DataCenter/HAR_data/address_fullinf...,"1649 Pinot Noir St, Leander, TX 78641",1628900603
686945,/home/user/DataCenter/HAR_data/address_fullinf...,"1441 Isabella Ln, Aubrey, TX 76227",1710390001
83461,/home/user/DataCenter/HAR_data/address_fullinf...,"8611 Datapoint Dr #47, San Antonio, TX 78229",1656798632


In [65]:
# Attach the JSON path and its epoch to each listing. The inner join drops
# listings whose address has no JSON file; DataFrame.join keeps daily_df's index.
json_by_address = json_path_df.set_index("address")
join_df = daily_df.join(json_by_address, on="address", how="inner")
join_df.head(n=2)

Unnamed: 0,address,harlink,mp_features,agent,price,posted_days,status,zipcode,num_beds,num_bath,num_half_bath,building_sqft,has_loft,num_stories,num_parking_space,address_key,date,json_path,time_epoch
57,"4513 Refugio Dr, Plano, TX 75024",https://www.har.com//homedetail/4513-refugio-d...,"4 bedrooms 3,395 Sqft. ($233/Sqft.) 3 full & 1...",Karla Davis Fathom Realty,789990.0,66.0,Under Contract - P,75024,4.0,0.0,0.0,3395.0,0.0,0.0,0.0,"4513 Refugio Dr, Plano, TX 75024",2023_11_05,/home/user/DataCenter/HAR_data/address_fullinf...,1690411643
8,"1516 Bay Area Blvd P12, Houston, TX 77058",https://www.har.com//homedetail/1516-bay-area-...,1 bedrooms 684 Sqft. ($117/Sqft.) 1 full baths...,Laurie Long RE/MAX Space Center,79900.0,79.0,Under Contract - PS,77058,1.0,0.0,0.0,684.0,0.0,0.0,0.0,"1516 Bay Area Blvd P12, Houston, TX 77058",2023_11_05,/home/user/DataCenter/HAR_data/address_fullinf...,1689375459


## Append Image Folder 

In [73]:
# Collect every entry under the two-level shard layout
# <image_folder>/0000..0099/0000..0099/.
image_folder = "/home/user/DataCenter/HAR_data/house_images/"
all_folders = []
for outer_shard in tqdm(range(100)):
    for inner_shard in range(100):
        shard_dir = os.path.join(image_folder, f"{outer_shard:04d}/{inner_shard:04d}")
        for entry_name in os.listdir(shard_dir):
            all_folders.append(os.path.join(shard_dir, entry_name))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 19.54it/s]


In [96]:
# One row per image folder; keep a single folder per address.
image_df = pd.DataFrame(all_folders, columns=["image_path"])

# The slices below treat the last 11 characters of the folder name as
# <1 separator char><10-char epoch>; everything before that is the address.
image_dir_names = image_df["image_path"].str.rsplit("/", n=1).str[-1]
image_df["address"] = image_dir_names.str[:-11]
image_df["time_epoch"] = image_dir_names.str[-10:]
print(image_df.shape[0])

# time_epoch is a string, so the sort is lexicographic; the row sorted last
# for each address is the one that is kept.
image_df = (
    image_df
    .sort_values(by="time_epoch")
    .drop_duplicates(subset="address", keep="last")
)
print(image_df.shape[0])
image_df.sample(n=2)

608920
606158


Unnamed: 0,image_path,address,time_epoch
378087,/home/user/DataCenter/HAR_data/house_images/00...,"5635 Fountainwood Dr, San Antonio, TX 78233",1733507515
382474,/home/user/DataCenter/HAR_data/house_images/00...,"12720 Apollo Creek, San Antonio, TX 78245",1730929530


In [99]:
# Attach the image folder to each listing and print the row count before and
# after; the inner join drops listings that have no image folder.
print(join_df.shape[0])
image_by_address = image_df.set_index("address")[["image_path"]]
join_df2 = join_df.join(image_by_address, on="address", how="inner")
print(join_df2.shape[0])
join_df2.head(n=2)

260368
258960


Unnamed: 0,address,harlink,mp_features,agent,price,posted_days,status,zipcode,num_beds,num_bath,num_half_bath,building_sqft,has_loft,num_stories,num_parking_space,address_key,date,json_path,time_epoch,image_path
57,"4513 Refugio Dr, Plano, TX 75024",https://www.har.com//homedetail/4513-refugio-d...,"4 bedrooms 3,395 Sqft. ($233/Sqft.) 3 full & 1...",Karla Davis Fathom Realty,789990.0,66.0,Under Contract - P,75024,4.0,0.0,0.0,3395.0,0.0,0.0,0.0,"4513 Refugio Dr, Plano, TX 75024",2023_11_05,/home/user/DataCenter/HAR_data/address_fullinf...,1690411643,/home/user/DataCenter/HAR_data/house_images/00...
8,"1516 Bay Area Blvd P12, Houston, TX 77058",https://www.har.com//homedetail/1516-bay-area-...,1 bedrooms 684 Sqft. ($117/Sqft.) 1 full baths...,Laurie Long RE/MAX Space Center,79900.0,79.0,Under Contract - PS,77058,1.0,0.0,0.0,684.0,0.0,0.0,0.0,"1516 Bay Area Blvd P12, Houston, TX 77058",2023_11_05,/home/user/DataCenter/HAR_data/address_fullinf...,1689375459,/home/user/DataCenter/HAR_data/house_images/00...


In [104]:
## Strategy to drop the redundant addresses: choose the one with the largest JSON size and the most images in the folder.
# Record the on-disk size (in bytes) of each listing's JSON file.
join_df2["json_size"] = join_df2["json_path"].map(os.path.getsize)

In [105]:
# Count the entries ending in ".jpeg" inside each listing's image folder.
join_df2["image_cnt"] = join_df2["image_path"].map(
    lambda folder_path: sum(
        entry_name.endswith(".jpeg") for entry_name in os.listdir(folder_path)
    )
)

In [124]:
## Defer the drop duplicates to the end
# final_df = join_df2.sort_values(["json_size", "image_cnt"]).drop_duplicates("address_key", keep="last")
# print(len(final_df), len(join_df2))

In [125]:
# final_df.head(2)

In [126]:
# Persist the joined listings (with json_size / image_cnt). No address_key
# de-duplication has been applied to this frame.
# The file name is derived from the extraction window so it cannot drift from
# start_date / end_date; for the current window this resolves to
# data/contracted_houses_11052023_11062024.pkl.
output_path = f"data/contracted_houses_{start_date:%m%d%Y}_{end_date:%m%d%Y}.pkl"
# to_pickle does not create missing parent directories; create the folder
# first so the expensive steps above are not lost to a missing "data/" folder.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
join_df2.to_pickle(output_path)

In [123]:
# join_df2["json_size"].min(), join_df2["json_size"].max(), join_df2["json_size"].mean()

In [122]:
# join_df2[join_df2["json_size"] == join_df2["json_size"].min()]