In [20]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score


In [21]:
# Load the pre-computed splits from the working directory: tabular
# features (X_*), regression targets (y_*) and the per-row identifiers
# that are later used to locate each sample's image on disk.
X_train, y_train = np.load("X_train.npy"), np.load("y_train.npy")
X_val, y_val = np.load("X_val.npy"), np.load("y_val.npy")
train_ids, val_ids = np.load("train_ids.npy"), np.load("val_ids.npy")

# Quick sanity check: features and targets of each split should agree
# on the number of rows.
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)


(2432, 15) (2432,)
(608, 15) (608,)


In [22]:
class PropertyDataset(Dataset):
    """Dataset yielding ``(image, tabular_features, target)`` triples.

    The tabular features and targets are held in memory as float32
    tensors; the image for sample ``i`` is read on demand from
    ``{image_root}/{split}/{ids[i]}.0.png``.

    Args:
        X: Tabular features, one row per sample.
        y: Targets, one value per sample.
        ids: Per-sample identifiers used to build the image file name.
        split: Name of the sub-directory of ``image_root`` to read from.
        image_root: Directory containing the split folders. Defaults to
            ``"images"``, the location the notebook originally used.

    Raises:
        ValueError: If ``X``, ``y`` and ``ids`` do not all have the same
            length.
    """

    def __init__(self, X, y, ids, split, image_root="images"):
        # Fail fast: __len__ is driven by X alone, so a shorter y / ids
        # would otherwise only surface as an IndexError mid-epoch.
        if not (len(X) == len(y) == len(ids)):
            raise ValueError(
                "X, y and ids must have the same length, got "
                f"{len(X)}, {len(y)} and {len(ids)}"
            )

        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.ids = ids
        self.split = split
        self.image_root = image_root

        # Resize to 224x224, convert to a tensor, then normalise per
        # channel. NOTE(review): the mean/std values are presumably the
        # ImageNet statistics matching the pretrained backbone - confirm.
        self.tf = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image_tensor, features, target)`` for sample ``idx``."""
        path = f"{self.image_root}/{self.split}/{self.ids[idx]}.0.png"

        # Open under a context manager so the file handle is released
        # deterministically; convert() yields an independent RGB image.
        with Image.open(path) as raw:
            img = raw.convert("RGB")

        img = self.tf(img)

        return img, self.X[idx], self.y[idx]


In [23]:
# Wrap each split in a PropertyDataset.
# NOTE(review): the validation set also reads from the "train" image
# folder - presumably because validation rows were split off the training
# set and share its images; confirm this is intentional.
train_ds = PropertyDataset(X=X_train, y=y_train, ids=train_ids, split="train")
val_ds = PropertyDataset(X=X_val, y=y_val, ids=val_ids, split="train")

# Mini-batches of 16; only the training loader reshuffles every epoch.
train_loader = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=16, shuffle=False)


In [24]:
class MultiModalModel(nn.Module):
    """Regressor that fuses ResNet-18 image features with tabular features.

    The image branch is a pretrained ResNet-18 whose classification head
    is replaced by an identity, so it emits a 512-dimensional feature
    vector. Only parameters whose name contains ``"layer4"`` stay
    trainable. The tabular branch is a two-layer MLP producing 128
    features. Both are concatenated and mapped to one output by a final
    linear layer.

    Args:
        tab_dim: Number of tabular input features per sample.
    """

    def __init__(self, tab_dim):
        super().__init__()

        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True`; kept as-is for compatibility
        # with whichever version this notebook was written against.
        self.cnn = models.resnet18(pretrained=True)

        # Freeze everything except the last residual stage.
        # NOTE(review): the training loop calls model.train(), which
        # presumably also puts the frozen BatchNorm layers in training
        # mode so their running statistics keep updating - confirm
        # whether that is intended.
        for name, param in self.cnn.named_parameters():
            param.requires_grad = "layer4" in name

        # Drop the classifier head so the backbone returns pooled features.
        self.cnn.fc = nn.Identity()

        self.tabular = nn.Sequential(
            nn.Linear(tab_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU()
        )

        self.fc = nn.Linear(512 + 128, 1)

    def forward(self, img, tab):
        """Return one prediction per sample as a 1-D tensor of shape ``(batch,)``.

        Args:
            img: Batch of images fed to the CNN branch.
            tab: Batch of tabular feature rows fed to the MLP branch.
        """
        img_feat = self.cnn(img)
        tab_feat = self.tabular(tab)
        fused = torch.cat([img_feat, tab_feat], dim=1)
        # Squeeze only the trailing output dimension: a bare .squeeze()
        # would also remove the batch dimension for a batch of size 1,
        # yielding a 0-d tensor that breaks loss shapes and iteration.
        return self.fc(fused).squeeze(1)


In [25]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The tabular branch width is taken from the number of feature columns.
model = MultiModalModel(tab_dim=X_train.shape[1])
model = model.to(device)

# Mean-squared error as the regression objective.
loss_fn = nn.MSELoss()

# Hand Adam only the parameters that are still trainable; frozen
# backbone weights are excluded from the optimizer.
optimizer = torch.optim.Adam(
    params=[p for p in model.parameters() if p.requires_grad],
    lr=3e-4,
)




In [None]:
# Number of passes over the training set.
N_EPOCHS = 20

for epoch in range(N_EPOCHS):
    # ---- training pass ----
    model.train()
    for img, tab, y in train_loader:
        img, tab, y = img.to(device), tab.to(device), y.to(device)

        optimizer.zero_grad()
        # reshape(-1) keeps the prediction 1-D even if the model returns
        # a 0-d tensor for a batch of size 1, so it always lines up with
        # y instead of relying on broadcasting inside the loss.
        loss = loss_fn(model(img, tab).reshape(-1), y)
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for img, tab, y in val_loader:
            # Flatten before converting: extending a list with a 0-d
            # array raises TypeError.
            p = model(img.to(device), tab.to(device)).reshape(-1).cpu().numpy()
            preds.extend(p)
            gts.extend(y.numpy())

    rmse = mean_squared_error(gts, preds) ** 0.5
    r2 = r2_score(gts, preds)

    print(f"Epoch {epoch}: RMSE={rmse:.2f}, R2={r2:.3f}")


KeyboardInterrupt: 