In [1]:
from typing import Any, Dict, List
import pandas as pd
from torch import Tensor
from torch.nn import Linear, Module, ModuleList
import numpy as np
import random
from tqdm import tqdm

import torch
import torch_frame
from torch_frame.data import Dataset
from torch_frame.data import DataLoader
from torch_frame import TensorFrame, stype
from torch_frame.data.stats import StatType
from torch_frame.nn.conv import TabTransformerConv, FTTransformerConvs
from torch_frame.nn.encoder import (
    EmbeddingEncoder,
    LinearEncoder,
    StypeWiseFeatureEncoder,
    MultiCategoricalEmbeddingEncoder,
    TimestampEncoder,
)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [4]:
target = "price"
dense_features = ["LON", 
                  "LAT", 
                  "building_sqft", 
                  "Lot Size", 
                  "Year Built", 
                  "Garage Number", 
                  "Bedrooms", 
                  "Baths", 
                  "Maintenance Fee", 
                  "Tax Rate", 
                  "Recent Market Value", "Recent Tax Value"]
cate = ["status", "Property Type", "County", "Private Pool", "Area Pool", "address"]
time_col = ["date"]
cate_multi = ["Foundation_multiclass", "Garage Types_multiclass", 
              "Roof Type_multiclass", "Pool_feature_multiclass", "floor_type_multiclass", 
              "finance_option_multiclass", "Exterior Type_multiclass", "Style_multiclass"]

raw_df = pd.read_pickle("../data/property_structured_v2.pkl").reset_index(drop=True)
raw_df["date"] = raw_df["date"].apply(lambda x: x.replace("_", "-"))
raw_df["date"] = pd.to_datetime(raw_df["date"])
raw_df.head(3)

Unnamed: 0,address,LON,LAT,building_sqft,Lot Size,Year Built,Garage Number,Bedrooms,Baths,Maintenance Fee,Tax Rate,Recent Market Value,Recent Tax Value,status,Property Type,County,Private Pool,Area Pool,date,Foundation_multiclass,Garage Types_multiclass,Roof Type_multiclass,Pool_feature_multiclass,floor_type_multiclass,finance_option_multiclass,Exterior Type_multiclass,Style_multiclass,price,elementary_school_star,middle_school_star,high_school_star,school_names,school_grades,school_org
0,"4513 Refugio Dr, Plano, TX 75024",-96.782005,33.102204,3395.0,429.0,2023.0,2.0,4.0,3.5,65.083333,1.864,157533.0,157533.0,Under Contract - P,single family,Collin County,No,,2023-11-05,[Slab],[Attached],[composition],,"[carpet, ceramic tile, wood]",,"[brick, rock/stone]",[traditional],789990.0,5.0,5.0,,"[Borchardt Elementary School Elementary, Fowle...","[KG - 05, 06 - 08]",[FRISCO ISD]
1,"1516 Bay Area Blvd P12, Houston, TX 77058",-95.114098,29.562197,684.0,31348.0,1977.0,,1.0,1.0,272.0,2.444,69024.0,47052.0,Under Contract - PS,townhouse/condo,Harris County,No,Yes,2023-11-05,[Slab],,[composition],,"[tile, vinyl]","[cash, conventional, fha, investor, va]","[brick, stucco]",[traditional],79900.0,4.0,,5.0,"[Falcon Pass Elementary School Elementary, Spa...","[EE - 05, 06 - 08, 09 - 12]",[CLEAR CREEK ISD]
2,"1717 San Sebastian Ln, Houston, TX 77058",-95.090845,29.544959,1280.0,12058.0,1975.0,,2.0,2.5,530.0,2.299,168461.0,168461.0,Under Contract - P,townhouse/condo,Harris County,No,Yes,2023-11-05,[Slab],,[composition],,"[carpet, laminate]","[cash, conventional, fha, va]","[brick, cement board, wood]","[other, traditional]",189000.0,,,4.0,"[G W Robinson Elementary, Space Center Interme...","[KG - 05, 06 - 08, 09 - 12]",[CLEAR CREEK ISD]


In [5]:
len(raw_df)

219217

In [6]:
raw_df = raw_df[dense_features + cate + cate_multi + time_col + [target]]

for col in cate_multi:
    raw_df[col] = raw_df[col].apply(lambda d: d if isinstance(d, list) else [])

col_to_stype = {}
col_to_stype = {d: stype.numerical for d in dense_features}
col_to_stype.update({target: stype.numerical})
col_to_stype.update({d: stype.timestamp for d in time_col})
col_to_stype.update({d: stype.categorical for d in cate})
col_to_stype.update({d: stype.multicategorical for d in cate_multi})

In [7]:
len(raw_df.columns), len(col_to_stype), col_to_stype

(28,
 28,
 {'LON': <stype.numerical: 'numerical'>,
  'LAT': <stype.numerical: 'numerical'>,
  'building_sqft': <stype.numerical: 'numerical'>,
  'Lot Size': <stype.numerical: 'numerical'>,
  'Year Built': <stype.numerical: 'numerical'>,
  'Garage Number': <stype.numerical: 'numerical'>,
  'Bedrooms': <stype.numerical: 'numerical'>,
  'Baths': <stype.numerical: 'numerical'>,
  'Maintenance Fee': <stype.numerical: 'numerical'>,
  'Tax Rate': <stype.numerical: 'numerical'>,
  'Recent Market Value': <stype.numerical: 'numerical'>,
  'Recent Tax Value': <stype.numerical: 'numerical'>,
  'price': <stype.numerical: 'numerical'>,
  'date': <stype.timestamp: 'timestamp'>,
  'status': <stype.categorical: 'categorical'>,
  'Property Type': <stype.categorical: 'categorical'>,
  'County': <stype.categorical: 'categorical'>,
  'Private Pool': <stype.categorical: 'categorical'>,
  'Area Pool': <stype.categorical: 'categorical'>,
  'address': <stype.categorical: 'categorical'>,
  'Foundation_multiclas

In [8]:
dataset = Dataset(
    raw_df, 
    col_to_stype=col_to_stype,
    target_col="price"
)
dataset.materialize()

Dataset()

In [9]:
torch.manual_seed(10)
dataset.shuffle()

Dataset()

In [10]:
train_dataset, test_dataset = dataset[:0.8], dataset[0.80:]
train_tf = train_dataset.tensor_frame
train_tf.device

device(type='cpu')

In [11]:
test_dataset.tensor_frame

TensorFrame(
  num_cols=27,
  num_rows=43843,
  numerical (12): ['Baths', 'Bedrooms', 'Garage Number', 'LAT', 'LON', 'Lot Size', 'Maintenance Fee', 'Recent Market Value', 'Recent Tax Value', 'Tax Rate', 'Year Built', 'building_sqft'],
  timestamp (1): ['date'],
  categorical (6): ['Area Pool', 'County', 'Private Pool', 'Property Type', 'address', 'status'],
  multicategorical (8): ['Exterior Type_multiclass', 'Foundation_multiclass', 'Garage Types_multiclass', 'Pool_feature_multiclass', 'Roof Type_multiclass', 'Style_multiclass', 'finance_option_multiclass', 'floor_type_multiclass'],
  has_target=True,
  device='cpu',
)

In [14]:
address_tensor = test_dataset.tensor_frame.feat_dict[stype.categorical][:, 4]

In [15]:
len(test_dataset)

43843

In [16]:
test_addresses = test_dataset._to_tensor_frame_converter._get_mapper("address").backward(address_tensor)
np.save("../data/val_address.npy", test_addresses.values)

In [164]:
# test_dataset.tensor_frame.feat_dict[stype.numerical][0:10]

In [198]:
raw_df = pd.read_pickle("../data/property_structured.pkl").reset_index(drop=True)
raw_df["date"] = raw_df["date"].apply(lambda x: x.replace("_", "-"))
raw_df["date"] = pd.to_datetime(raw_df["date"])

In [199]:
def generate_key1(x):
    return ",".join(["%0.4f"%(_) for _ in x])
raw_df["key1"] = raw_df[dense_features].apply(lambda x: generate_key1(x.values), axis=1)

In [200]:
def generate_key2(x):
    return ",".join([_ if _ is not None else "None" for _ in x])
raw_df["key2"] = raw_df[cate].apply(lambda x: generate_key2(x.values), axis=1)

In [201]:
def generate_key3(x):
    return ",".join([",".join(_) if _ is not None else "None" for _ in x])
raw_df["key3"] = raw_df[cate_multi].apply(lambda x: generate_key3(x.values), axis=1)

In [202]:
raw_df["key"] = raw_df.apply(lambda x: x["key1"] + x["key2"] + x["key3"], axis=1)

In [204]:
tem = raw_df.groupby("key").count()["LAT"]
print(np.sum(tem[tem > 1].sort_values().values))
tem[tem > 1].sort_values()

# key = "-84.2700,37.8393,5076.0000,1267.0000,2024.0000,3.0000,5.0000,5.5000,102.5000,3.5200,nan,nanUnder Contract - P,single family,Harris County,No,YesSlab,Attached,Oversized,Tandem,composition,None,carpet,tile,cash,conventional,brick,other,traditional"
# raw_df[raw_df["key"] == key]["address"].values

0


Series([], Name: LAT, dtype: int64)

In [192]:
def process_address(b):
    return ", ".join([_.strip(" ") for _ in b.split("   ") if len(_.strip(" ")) > 0])
raw_df["address3"] = raw_df["address"].apply(lambda x: process_address(x))

In [193]:
raw_df["address3"].nunique(), len(raw_df)

(219217, 220306)

In [191]:
a = ' 22206 Kentucky Blue Grass Ln, Cypress, TX 77433,  '
b = ' 22206 Kentucky Blue Grass Ln                            Cypress, TX 77433 '
a.strip(" ,"), ", ".join([_.strip(" ") for _ in b.split("   ") if len(_.strip(" ")) > 0])

('22206 Kentucky Blue Grass Ln, Cypress, TX 77433',
 '22206 Kentucky Blue Grass Ln, Cypress, TX 77433')

In [139]:
' 22206 Kentucky Blue Grass Ln, Cypress, TX 77433,  '.strip(" ,")

'22206 Kentucky Blue Grass Ln, Cypress, TX 77433'

In [31]:
train_tf = train_dataset.tensor_frame
train_tf[0].feat_dict

{<stype.numerical: 'numerical'>: tensor([[ 3.5000e+00,  4.0000e+00,  2.0000e+00,  3.3102e+01, -9.6782e+01,
           4.2900e+02,  6.5083e+01,  1.5753e+05,  1.5753e+05,  1.8640e+00,
           2.0230e+03,  3.3950e+03]]),
 <stype.categorical: 'categorical'>: tensor([[-1,  3,  0,  0,  0]]),
 <stype.multicategorical: 'multicategorical'>: MultiNestedTensor(num_rows=1, num_cols=8, device='cpu')}

In [32]:
len(train_dataset), len(test_dataset)

(193522, 48381)

In [33]:
stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: LinearEncoder(),
    stype.multicategorical: MultiCategoricalEmbeddingEncoder(),
}

In [57]:
train_loader = DataLoader(train_dataset.tensor_frame, batch_size=1024,
                        shuffle=True)
test_loader = DataLoader(test_dataset.tensor_frame, batch_size=2048)

In [61]:
tf = next(iter(train_loader))
train_dataset.tensor_frame.device, tf.device

(device(type='cuda', index=0), device(type='cuda', index=0))

In [35]:
class ExampleTransformer2(Module):
    def __init__(
        self,
        channels: int,
        out_channels: int,
        num_layers: int,
        num_heads: int,
        col_stats: Dict[str, Dict[StatType, Any]],
        col_names_dict: Dict[torch_frame.stype, List[str]],
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=channels,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder(), 
                stype.multicategorical: MultiCategoricalEmbeddingEncoder(),
            },
        )
        
        self.ftt_transformer_conv = FTTransformerConvs(channels=channels,
                                                       num_layers=num_layers,
                                                       nhead=num_heads)
        self.decoder = Linear(channels, out_channels)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        _, x = self.ftt_transformer_conv(x)
        out = self.decoder(x).squeeze()
        return out

In [36]:
class VanillaTransformer(Module):
    def __init__(
        self,
        channels: int,
        out_channels: int,
        num_layers: int,
        num_heads: int,
        col_stats: Dict[str, Dict[StatType, Any]],
        col_names_dict: Dict[torch_frame.stype, List[str]],
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=channels,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder(), 
                stype.multicategorical: MultiCategoricalEmbeddingEncoder(),
            },
        )
        self.tab_transformer_convs = ModuleList([
            TabTransformerConv(
                channels=channels,
                num_heads=num_heads,
            ) for _ in range(num_layers)
        ])
        self.decoder = Linear(channels, out_channels)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        for tab_transformer_conv in self.tab_transformer_convs:
            x = tab_transformer_conv(x)
        out = self.decoder(x.mean(dim=1))
        return out

In [78]:
import torch
import torch.nn.functional as F
from torch.nn import MSELoss, L1Loss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = VanillaTransformer(
    channels=256,
    out_channels=1,
    num_layers=4,
    num_heads=4,
    col_stats=train_dataset.col_stats,
    col_names_dict=train_dataset.tensor_frame.col_names_dict,
).to(device)

lr = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
mse_loss = MSELoss()
mae_loss = L1Loss()

for epoch in range(200):
    error = []
    for tf in iter(train_loader):
        tf = tf.to(device)
        pred = model(tf)
        loss = mse_loss(pred, tf.y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        error.append(loss.item())
        print("loss %0.2e"%(np.sqrt(loss.item())), end="\r")
    if epoch % 4 == 0:
        print("Epoch %i %0.2e"%(epoch, np.sqrt(np.mean(error))))

In [81]:
len(train_dataset)

193522

In [77]:
# model(tf)

In [46]:
tf = tf.to(device)
tf.device

device(type='cuda', index=0)

In [None]:
model.eval()
error = []
for tf in test_loader:
    tf = tf.to(device)
    pred = model(tf)
    loss = mae_loss(pred, tf.y)
    error.append(loss.item())
print("Test MAE %0.2e"%(np.mean(error)))
    
# print(f'Accuracy: {acc:.4f}')

In [8]:
time = pd.date_range(start='2023-01-01', periods=100, freq='D')

In [13]:
raw_df["date"] = raw_df["date"].apply(lambda x: x.replace("_", "-"))

In [14]:
pd.to_datetime(raw_df["date"])

0        2023-11-05
1        2023-11-05
2        2023-11-05
3        2023-11-05
4        2023-11-05
            ...    
241898   2024-11-16
241899   2024-11-16
241900   2024-11-16
241901   2024-11-16
241902   2024-11-16
Name: date, Length: 241903, dtype: datetime64[ns]