In [None]:
# Install the notebook's dependencies listed in requirements.txt.
%pip install -r requirements.txt

In [None]:
import torch
from torch import nn

# Scratch check: concatenating two (8, 64) tensors along dim 1 gives (8, 128).
a, b = torch.randn(8, 64), torch.randn(8, 64)

torch.cat([a, b], dim=1).shape

In [None]:
import os

# Supabase credentials are read from the environment so that no secret is ever
# stored in the notebook. Export SUPABASE_URL and SUPABASE_KEY before starting
# the kernel; a variable that is not set leaves the corresponding value as None.
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

In [None]:
import os
from supabase import create_client, Client
from datetime import date
import datetime

# Build the Supabase client from the `url` and `key` values defined earlier.
supabase: Client = create_client(url, key)

In [None]:
# Query all columns of the 'entries' table (no filters applied).
response = supabase.table('entries').select("*").execute()

In [None]:
# Display the raw query response object.
response

In [None]:
# Display the returned records; later cells iterate over them and read fields by key.
response.data

In [None]:
import location

# Report the size of each location vocabulary: streets, wards, then districts.
print(
    len(location.get_all_streets()),
    len(location.get_all_wards()),
    len(location.get_all_districts()),
    sep="\n",
)

In [None]:
# Spot-check district-name standardization on a sample input.
location.standardize_district_name("Hà Nội")

In [None]:
def _index_or_unknown(name, vocabulary):
    """Return the position of ``name`` in ``vocabulary``, or -1 when it is absent."""
    try:
        # A single scan both tests membership and yields the index.
        return vocabulary.index(name)
    except ValueError:
        return -1


def encode_street(street):
    """Encode a street name as its index in ``location.get_all_streets()``.

    The name is first passed through ``location.standardize_street_name``.

    Returns:
        int: index of the standardized name, or -1 if it is not listed.
    """
    return _index_or_unknown(
        location.standardize_street_name(street), location.get_all_streets()
    )


def encode_ward(ward):
    """Encode a ward name as its index in ``location.get_all_wards()``.

    The name is first passed through ``location.standardize_ward_name``.

    Returns:
        int: index of the standardized name, or -1 if it is not listed.
    """
    return _index_or_unknown(
        location.standardize_ward_name(ward), location.get_all_wards()
    )


def encode_district(district):
    """Encode a district name as its index in ``location.get_all_districts()``.

    The name is first passed through ``location.standardize_district_name``.

    Returns:
        int: index of the standardized name, or -1 if it is not listed.
    """
    return _index_or_unknown(
        location.standardize_district_name(district), location.get_all_districts()
    )

# Smoke-test the three encoders; the cell displays only the value of the last call.
encode_street("Đại La")
encode_ward("Trương Định")
encode_district("Hai Ba Trung")

In [None]:
# Spot-check the ward -> district lookup on a sample ward.
location.get_district_from_ward("Trương Định")

In [None]:
# Keep only the records whose street, ward and district can all be encoded.
# When the district is unknown, try to recover it from the ward; the record is
# dropped if that fails as well.
datas = []
for data in response.data:
    if encode_street(data["street"]) == -1:
        continue
    if encode_ward(data["ward"]) == -1:
        continue
    if encode_district(data["district"]) == -1:
        recovered = location.get_district_from_ward(data["ward"])
        # Without this guard the record would fall through to the append below
        # and reach the embedding layer with a district index of -1.
        if not recovered or encode_district(recovered) == -1:
            continue
        print(recovered)
        # Work on a copy so the raw response is left untouched when re-running.
        data = {**data, "district": recovered}
    datas.append(data)

datas

In [None]:
# Compare the number of records kept after filtering with the number fetched.
len(datas), len(response.data)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the filtered records for validation, with a fixed random_state for the split.
train_data, valid_data = train_test_split(datas, test_size=0.3, random_state=42)

In [None]:
# Number of records in the validation split.
len(valid_data)

In [None]:
import numpy as np

# Mean and (population) standard deviation of the training areas, computed on
# the int-cast values.
# NOTE(review): RentDataset standardizes the raw "area" value, not the int-cast
# one used here -- confirm the two agree for non-integer areas.
list_area = [int(record["area"]) for record in train_data]
areas = np.asarray(list_area)

area_mean, area_std = areas.mean(), areas.std()

area_mean, area_std

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class RentDataset(Dataset):
    """Dataset of rental records yielding model-ready features.

    Args:
        supabase_response: indexable collection of records. Each record is read
            through the keys "price", "area", "street", "ward", "district",
            "num_bedroom", "num_diningroom", "num_kitchen" and "num_toilet".
        area_mean: mean used to standardize "area"; ``None`` disables
            standardization.
        area_std: standard deviation used together with ``area_mean``; it must
            be provided (and non-zero) whenever ``area_mean`` is given.
    """

    def __init__(self, supabase_response, area_mean=None, area_std=None):
        self.data = supabase_response
        self.area_mean = area_mean
        self.area_std = area_std

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(attr, street, ward, district, price)`` for record ``idx``.

        ``attr`` is a tensor ``[area, num_bedroom, num_diningroom, num_kitchen,
        num_toilet]``; ``street``, ``ward`` and ``district`` are the values
        returned by the ``encode_*`` helpers; ``price`` is a one-element tensor.
        """
        record = self.data[idx]
        price = torch.Tensor([record["price"]])

        area = record["area"]
        # Compare against None explicitly: a mean of exactly 0.0 is falsy and
        # must not switch standardization off.
        if self.area_mean is not None:
            area = (area - self.area_mean) / self.area_std

        street = encode_street(record["street"])
        ward = encode_ward(record["ward"])
        district = encode_district(record["district"])

        attr = torch.Tensor([
            area,
            record["num_bedroom"],
            record["num_diningroom"],
            record["num_kitchen"],
            record["num_toilet"],
        ])

        return attr, street, ward, district, price

# Both splits are standardized with the same area_mean / area_std values.
train_dataset = RentDataset(train_data, area_mean=area_mean, area_std=area_std)
valid_dataset = RentDataset(valid_data, area_mean=area_mean, area_std=area_std)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Validation is evaluated as one full-size batch, so shuffling cannot change
# its loss; keep the order fixed.
valid_dataloader = DataLoader(valid_dataset, batch_size=len(valid_dataset), shuffle=False)

In [None]:
class RentModel(nn.Module):
    """Regressor combining location embeddings with numeric attributes.

    Street, ward and district indices are embedded, the numeric attributes are
    projected to the same width, and the four vectors are concatenated and fed
    through three linear layers that output one value per sample.

    Args:
        num_streets: number of rows of the street embedding table.
        num_wards: number of rows of the ward embedding table.
        num_districts: number of rows of the district embedding table.
        num_attrs: number of numeric attributes per sample.
        embedding_dims: width of each embedding and of the attribute projection.
        out_feature2: output width of the first hidden layer.
        out_feature3: output width of the second hidden layer.
        activation: one of "relu", "leaky_relu", "tanh", "gelu".
        dropout: dropout probability applied after the first hidden layer.

    Raises:
        ValueError: if ``activation`` is not a supported name.
    """

    _ACTIVATIONS = {
        "relu": nn.ReLU,
        "leaky_relu": nn.LeakyReLU,
        "tanh": nn.Tanh,
        "gelu": nn.GELU,
    }

    def __init__(
        self,
        num_streets=1572,
        num_wards=430,
        num_districts=25,
        num_attrs=5,
        embedding_dims=128,
        out_feature2=729,
        out_feature3=81,
        activation="relu",
        dropout=0.1,
    ):
        super().__init__()
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )

        self.embedding_street = nn.Embedding(num_streets, embedding_dims)
        self.embedding_ward = nn.Embedding(num_wards, embedding_dims)
        self.embedding_district = nn.Embedding(num_districts, embedding_dims)

        self.linear_attr = nn.Linear(num_attrs, embedding_dims)

        # Four vectors of width embedding_dims are concatenated in forward().
        self.linear2 = nn.Linear(4 * embedding_dims, out_feature2)
        self.linear3 = nn.Linear(out_feature2, out_feature3)
        self.linear4 = nn.Linear(out_feature3, 1)

        self.act = self._ACTIVATIONS[activation]()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, attr, street, ward, district):
        """Return one prediction per sample.

        Args:
            attr: float tensor whose last dimension is ``num_attrs``.
            street, ward, district: integer index tensors for the embeddings.
        """
        street_embeded = self.embedding_street(street)
        ward_embeded = self.embedding_ward(ward)
        district_embeded = self.embedding_district(district)
        attr_embeded = self.linear_attr(attr)

        x = torch.cat((street_embeded, ward_embeded, district_embeded, attr_embeded), 1)
        x = self.dropout(self.act(self.linear2(x)))
        x = self.act(self.linear3(x))
        x = self.linear4(x)

        return x
        

In [None]:
# CUDA auto-detection is left commented out; everything runs on the CPU.
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = "cpu"
device

In [None]:
class RMSELoss(torch.nn.Module):
    """Root-mean-square error, computed as ``sqrt(mse(x, y) + eps)``."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x, y):
        """Return the scalar RMSE between ``x`` and ``y``; ``eps`` is added under the root."""
        mean_squared_error = nn.functional.mse_loss(x, y)
        return torch.sqrt(mean_squared_error + self.eps)

# Instantiate the model on `device`, the RMSE criterion and an Adam optimizer (lr = 1e-4).
model = RentModel().to(device)
# loss_fn = nn.MSELoss()
loss_fn = RMSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Display the module structure.
model

In [None]:
num_epochs = 20
print_per_batch = 1  # log the training loss every N batches

# train_loss_values receives one entry per training batch, valid_loss_values
# one entry per validation batch, epoch_count one entry per completed epoch.
epoch_count, train_loss_values, valid_loss_values = [], [], []
for epoch in range(1, num_epochs+1):
    model.train()

    for batch_idx, (attr, street, ward, district, price) in enumerate(train_dataloader):
        # Inputs must live on the same device as the model.
        attr, street, ward, district, price = (
            t.to(device) for t in (attr, street, ward, district, price)
        )

        logits = model(attr, street, ward, district)
        loss = loss_fn(logits, price)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % print_per_batch == 0:
            print(
                f"Epoch: {epoch:03d}/{num_epochs:03d}"
                f" | Batch {batch_idx:03d}/{len(train_dataloader):03d}"
                f" | Train Loss: {loss}"
            )

        train_loss_values.append(loss.item())

    model.eval()
    with torch.inference_mode():
        for batch_idx, (attr, street, ward, district, price) in enumerate(valid_dataloader):
            attr, street, ward, district, price = (
                t.to(device) for t in (attr, street, ward, district, price)
            )

            logits = model(attr, street, ward, district)
            loss = loss_fn(logits, price)
            print(
                f"Epoch: {epoch:03d}/{num_epochs:03d}"
                f" | Batch {batch_idx:03d}/{len(valid_dataloader):03d}"
                f" | Val Loss: {loss}"
            )

            valid_loss_values.append(loss.item())

    epoch_count.append(epoch)

    # Early stopping: halt once the validation loss has failed to improve for
    # two consecutive recorded values.
    if len(valid_loss_values) >= 3:
        if valid_loss_values[-1] >= valid_loss_values[-2] and valid_loss_values[-2] >= valid_loss_values[-3]:
            print("Maybe Overfitting... Stop!")
            break
        

In [None]:
import matplotlib.pyplot as plt

# Training loss, one point per training batch.
fig, ax = plt.subplots()
ax.plot(train_loss_values)
ax.set_ylabel('Training Loss')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Validation loss, one point per recorded validation batch.
fig, ax = plt.subplots()
ax.plot(valid_loss_values)
ax.set_ylabel('Val Loss')
plt.show()

In [None]:
# Hyperparameters stored alongside the weights; the values match the sizes the
# model above is built with (128 / 729 / 81, ReLU).
config = {
            "embedding_dims": 128,
            "out_feature2": 729,
            "out_feature3": 81,
            "activation": "relu",
        }

# Save the weights, the config and the most recent validation loss to "checkpoint.pt".
torch.save({'model_state_dict': model.state_dict(), 'config': config, "loss": valid_loss_values[-1]}, "checkpoint.pt")

In [None]:
# Scratch check: a Linear(5, 128) layer maps an (8, 5) batch to (8, 128).
a = torch.randn(8, 5)
b = nn.Linear(in_features=5, out_features=128)

b(a).shape

In [None]:
# Load and display a checkpoint stored under a ray_results directory.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that directory exists; consider a configurable, relative location.
path = "/home/dinhhuy/ray_results/model_tuning_2024-05-10_12-30-39/model_tuning_6fa3c_00000_0_activation=relu,embedding_dims=32,lr=0.0000,out_feature2=729,out_feature3=64_2024-05-10_12-30-39/checkpoint_000000/checkpoint.pt"

checkpoint = torch.load(path)
checkpoint

In [None]:
import pickle

# Record the checkpoint location in a small pickle file.
# NOTE(review): the save cell writes "checkpoint.pt", while the path recorded
# here is "model/checkpoint.pt" -- confirm the file is moved there.
with open("ckpt_path.pkl", "wb") as f:
    pickle.dump({"ckpt_path": "model/checkpoint.pt"}, f)