<a href="https://colab.research.google.com/github/Neilus03/Restb.ai_challenge-HackUPC2023/blob/main/MLP_HackUPC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import numpy as np
import pickle
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
import matplotlib.pyplot as plt

In [None]:
# This notebook expects a GPU runtime. When the assertion below fails in Colab,
# open Runtime -> Change Runtime Type, select GPU and run the cell again.
assert torch.cuda.is_available(), "GPU is not enabled"

# Device that models and batches are moved to: the first CUDA GPU when one is
# visible, the CPU otherwise.
# NOTE(review): the CPU branch cannot be reached once the assertion above has passed.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Mount Google Drive under /content/drive; the dataset pickle is read from that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the full dataset from the mounted Drive.
# NOTE(review): read_pickle unpickles the file, and unpickling can execute arbitrary
# code -- only point this at a file whose origin you trust.
Big_Data = pd.read_pickle("/content/drive/MyDrive/hackupc2023_restbai.pkl")
# Alternative source (disabled): the JSON sample of the dataset.
#Big_Data = pd.read_json("/content/hackupc2023_restbai__dataset_sample.json")

In [None]:
# The loaded frame carries the field names ("city", "price", ...) on its index, so it is
# transposed to get one column per field. The guard keeps this cell idempotent: running
# it a second time used to flip the frame back and break every later column lookup.
if "city" not in Big_Data.columns:
    Big_Data = Big_Data.T

In [None]:
# Label -> integer code lookups for the three categorical columns. Codes follow the
# order in which each distinct label first appears in its column.
city2number = {label: code for code, label in enumerate(Big_Data["city"].unique())}
neighborhood2number = {label: code for code, label in enumerate(Big_Data["neighborhood"].unique())}
region2number = {label: code for code, label in enumerate(Big_Data["region"].unique())}

In [None]:
# Replace every categorical label by its integer code, one column at a time.
# NOTE(review): this overwrites the columns in place, so running the cell twice would
# presumably look the codes up in the label dictionaries and turn them into NaN.
for _column, _codes in (
    ("city", city2number),
    ("neighborhood", neighborhood2number),
    ("region", region2number),
):
    Big_Data[_column] = Big_Data[_column].map(_codes)

In [None]:
class DatasetFromPD(Dataset):
    """Torch dataset that serves standardised numeric columns of a DataFrame.

    The columns listed in ``_DROPPED_COLUMNS`` are removed, the remaining columns are
    converted to floats, missing values are replaced by 0 and every column is
    standardised as ``(value - mean) / std``. The ``price`` column is the target; all
    other remaining columns are the features.

    Args:
        df: DataFrame containing the dropped columns, ``price`` and the feature columns.
        mean: Optional per-column means (e.g. ``train_dataset.get_mean()``). When
            omitted, the means of ``df`` itself are used.
        std: Optional per-column standard deviations (e.g. ``train_dataset.get_std()``).
            When omitted, the standard deviations of ``df`` itself are used.

    Passing the training statistics when building a validation/test dataset keeps
    both sets on the same scale.
    """

    # Columns that are removed before the tensors are built.
    _DROPPED_COLUMNS = ["summary", "images", "image_data", "property_type", "num_images"]

    def __init__(self, df, mean=None, std=None):
        numeric = df.drop(columns=self._DROPPED_COLUMNS)
        # Explicit float conversion: a transposed frame holds object columns, and relying
        # on fillna to downcast them silently is deprecated pandas behaviour.
        self.Dataset = numeric.astype("float64").fillna(0)

        self._mean = self.Dataset.mean() if mean is None else pd.Series(mean)
        raw_std = self.Dataset.std() if std is None else pd.Series(std)
        # A constant column has std 0 (and a single row has std NaN); dividing by it
        # would fill the tensors with NaN/inf, so such columns are divided by 1 instead.
        self._std = raw_std.where(raw_std > 0, 1.0)

        self.Dataset_Normalize = (self.Dataset - self._mean) / self._std

        self.features = torch.tensor(self.Dataset_Normalize.drop(columns = ["price"]).values, dtype=torch.float32)
        self.target   = torch.tensor(self.Dataset_Normalize['price'].values, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.features)

    def get_mean(self):
        """Return the per-column means used for standardisation."""
        return self._mean

    def get_std(self):
        """Return the per-column standard deviations used for standardisation."""
        return self._std

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.features[idx], self.target[idx]

In [None]:
class MLP(nn.Module):
    """Fully connected network with ReLU activations between its six linear layers.

    Layer widths are ``input -> h -> 2h -> 2h -> h -> h // 2 -> output`` where ``h`` is
    ``hidden_size``. No activation follows the last linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size

        widths = [
            input_size,
            hidden_size,
            hidden_size * 2,
            hidden_size * 2,
            hidden_size,
            hidden_size // 2,
            output_size,
        ]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # the output layer is not followed by an activation
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` to rows of ``input_size`` values and run them through the network."""
        rows = x.view(-1, self.input_size)
        return self.network(rows)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split
# reproducible across runs.
train_data, valid_data = train_test_split(
    Big_Data,
    test_size=0.2,
    random_state=42,
)

# Wrap each subset in its own torch dataset.
# NOTE(review): DatasetFromPD is called without shared statistics here, so the
# validation subset is scaled independently of the training subset.
train_dataset, val_dataset = DatasetFromPD(train_data), DatasetFromPD(valid_data)

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 264

# Training batches are shuffled, validation batches keep the dataset order.
# Both loaders discard the final batch when it is smaller than batch_size.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [None]:
# Input width is read from the training tensors instead of being hard-coded: MLP.forward
# reshapes its input with view(-1, input_size), so a stale constant would silently mix
# values from different rows whenever the number of feature columns changes.
input_dim = train_dataset.features.shape[1]
hidden_size = 512
output_dim = 1  # a single regression output (the standardised price)

MLP_model = MLP(input_dim, hidden_size, output_dim).to(device)

In [None]:
import math
def initialize_parameters(model):
    """Re-initialise the parameters of ``model`` in place.

    Parameters whose name contains "weight" and that have at least two dimensions get
    Xavier (Glorot) uniform initialisation with gain sqrt(2); parameters whose name
    contains "bias" are set to zero. One-dimensional weights (for example those of
    normalisation layers) are left untouched.

    Args:
        model: Any ``nn.Module``.
    """
    for name, w in model.named_parameters():
        # xavier_uniform_ needs a fan-in and a fan-out and raises ValueError on tensors
        # with fewer than two dimensions, hence the dimensionality check.
        if "weight" in name and w.dim() >= 2:
            nn.init.xavier_uniform_(w, gain = math.sqrt(2)) #Using xavier also known as Glorot initialization.

        if "bias" in name:
            nn.init.zeros_(w)

In [None]:
# Optimisation hyper-parameters: SGD step size and L2 penalty coefficient.
learning_rate = 1e-3
lambda_l2 = 1e-5

# Re-initialise the network weights in place before training starts.
initialize_parameters(MLP_model)

# Mean squared error objective, minimised with momentum SGD.
criterion = nn.MSELoss()
optimizer = optim.SGD(
    MLP_model.parameters(),
    lr=learning_rate,
    momentum=0.9,
    weight_decay=lambda_l2,  # built-in L2
)

In [None]:
def train(epoch, criterion, model, optimizer, loader):
    """Run one optimisation pass over ``loader`` and return the mean loss per sample.

    Args:
        epoch: Epoch number, only used in the progress message.
        criterion: Loss module called as ``criterion(output, target)``.
        model: Network to optimise; it is switched to training mode.
        optimizer: Optimizer bound to the parameters of ``model``.
        loader: Iterable of ``(data, target)`` batches; ``loader.dataset`` is used in
            the progress message.

    Returns:
        The loss averaged over the samples that were actually processed, or NaN when
        the loader produced no batch.

    Raises:
        RuntimeError: if a target batch cannot be reshaped to the model output's shape.
    """
    # Batches follow the model onto whatever device its parameters live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    # A criterion with reduction="sum" already returns a per-batch total.
    sums_over_batch = getattr(criterion, "reduction", "mean") == "sum"

    model.train()

    loss_total = 0.0
    samples_seen = 0

    for batch_idx, (data, target) in enumerate(loader):

        optimizer.zero_grad()

        data, target = data.to(model_device), target.to(model_device)

        output = model(data)
        # Give the target the output's shape. Comparing a (batch, 1) output with a
        # (batch,) target would broadcast to (batch, batch) inside the loss and train
        # the network towards the mean of the batch.
        loss = criterion(output, target.reshape(output.shape))
        loss.backward()
        optimizer.step()

        # print loss every N iterations
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(loader.dataset),
                100. * batch_idx / len(loader), loss.item()))

        # Weight mean-reduced batch losses by batch size so that the final division
        # yields a true per-sample average.
        batch_len = len(data)
        loss_total += loss.item() if sums_over_batch else loss.item() * batch_len
        samples_seen += batch_len

    # Divide by the samples seen, not len(loader.dataset): a loader created with
    # drop_last=True skips part of the dataset.
    return loss_total / samples_seen if samples_seen else float("nan")


In [None]:
@torch.no_grad()  # prevent this function from computing gradients see https://pytorch.org/docs/stable/generated/torch.no_grad.html
def validate(criterion, model, loader):
    """Evaluate ``model`` on ``loader`` and return the mean loss per sample.

    Args:
        criterion: Loss module called as ``criterion(output, target)``.
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable of ``(data, target)`` batches.

    Returns:
        The loss averaged over the samples that were actually processed, or NaN when
        the loader produced no batch. The value is also printed.

    Raises:
        RuntimeError: if a target batch cannot be reshaped to the model output's shape.
    """
    # Batches follow the model onto whatever device its parameters live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    # A criterion with reduction="sum" already returns a per-batch total.
    sums_over_batch = getattr(criterion, "reduction", "mean") == "sum"

    model.eval()

    loss_total = 0.0
    samples_seen = 0

    for data, target in loader:

        data, target = data.to(model_device), target.to(model_device)

        output = model(data)
        # Give the target the output's shape; a (batch, 1) output against a (batch,)
        # target would otherwise broadcast to (batch, batch) inside the loss.
        loss = criterion(output, target.reshape(output.shape))

        batch_len = len(data)
        loss_total += loss.item() if sums_over_batch else loss.item() * batch_len
        samples_seen += batch_len

    # Average over the samples seen rather than len(loader.dataset), which overcounts
    # when the loader drops its last batch.
    val_loss = loss_total / samples_seen if samples_seen else float("nan")
    print('\nValidation set: Average loss: {:.4f}\n'.format(val_loss))

    return val_loss


In [None]:
# Per-epoch loss history, one list for the training set and one for the validation set.
losses = {"train": [], "val": []}
# Train for a fixed number of epochs (30), evaluating after every epoch.
for epoch in range(30):

    # One optimisation pass followed by an evaluation pass; both values are recorded.
    train_loss = train(epoch, criterion, MLP_model, optimizer, train_loader)
    val_loss = validate(criterion, MLP_model, val_loader)
    losses["train"].append(train_loss)
    losses["val"].append(val_loss)

    # Redraw both loss curves with the history collected so far.
    plt.plot(losses["train"], label="training loss")
    plt.plot(losses["val"], label="validation loss")

    plt.legend()
    # NOTE(review): the very short pause presumably forces the figure to refresh
    # between epochs -- confirm it is still needed with the backend in use.
    plt.pause(0.000001)
    plt.show()

In [None]:
def check_images_prices(df, model, n_samples=264, mean=None, std=None):
    """Predict prices for a random sample of rows and return them with the true prices.

    Args:
        df: DataFrame holding ``price``, the non-numeric columns ``summary``,
            ``images``, ``image_data``, ``property_type``, ``num_images`` and the
            numeric feature columns the model was trained on.
        model: Network mapping standardised features to a standardised price.
        n_samples: Number of rows to draw; capped at ``len(df)``.
        mean: Optional per-column means used for standardisation, e.g.
            ``train_dataset.get_mean()``. Defaults to the means of the sample.
        std: Optional per-column standard deviations, e.g. ``train_dataset.get_std()``.
            Defaults to the standard deviations of the sample.

    Returns:
        A tuple ``(predicted_prices, true_prices, images)``: two float32 tensors with
        one entry per sampled row and the ``images`` column of the sampled rows.

    Note:
        With the default ``mean``/``std`` the sample is scaled by its own statistics,
        which differ from the ones the model saw during training; pass the training
        statistics for predictions on the training scale.
    """
    non_numeric = ["summary", "images", "image_data", "property_type", "num_images"]

    # DataFrame.sample raises when asked for more rows than the frame holds.
    samples = df.sample(min(n_samples, len(df)))
    # Same preparation as the training dataset: floats with missing values set to 0.
    numeric = samples.drop(columns=non_numeric).astype("float64").fillna(0)

    col_mean = numeric.mean() if mean is None else pd.Series(mean)
    col_std = numeric.std() if std is None else pd.Series(std)
    # Constant columns have std 0 (NaN for a single row); divide those by 1 instead of
    # producing NaN features.
    col_std = col_std.where(col_std > 0, 1.0)

    normalized = ((numeric - col_mean) / col_std).drop(columns=["price"])

    # Run on the device the model's parameters live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    features = torch.tensor(normalized.values.astype(np.float32), dtype=torch.float32).to(model_device)

    # Inference only: evaluation mode, no autograd graph, and the caller's
    # train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            predicted_normalized = model(features).cpu()
    finally:
        model.train(was_training)

    # Undo the standardisation of the price column.
    predicted_denormalized = predicted_normalized * float(col_std["price"]) + float(col_mean["price"])
    truth_values = torch.tensor(samples["price"].values.astype(np.float32), dtype=torch.float32)

    # squeeze(-1) keeps a 1-D result even when a single row was sampled.
    return predicted_denormalized.squeeze(-1), truth_values, samples["images"]

# Draw a random sample from the full dataset and collect predicted prices, true prices
# and the image lists of the sampled rows.
predicted_denormalized, truth_values, images = check_images_prices(Big_Data, MLP_model)

In [None]:
# Truncate both price tensors to integers and hand them over to NumPy.
predicted_denormalized = predicted_denormalized.int().detach().numpy()
truth_values = truth_values.int().detach().numpy()

# Signed prediction error for every sampled row.
difference_predicted_ground_truth = predicted_denormalized - truth_values

In [None]:
# Comparison table: predicted price, true price, image list and absolute error for
# every sampled row.
df = pd.DataFrame(
    {
        'Predicted Value': predicted_denormalized,
        'Correct   Value': truth_values,
        'image': images,
        'difference': np.abs(difference_predicted_ground_truth),
    }
)

In [None]:
# Show the comparison table as the cell output.
df

Unnamed: 0,Predicted Value,Correct Value,image,difference
34244,248281,43000,[https://restb-hackathon.s3.amazonaws.com/real...,205281
132981,245173,340000,[https://restb-hackathon.s3.amazonaws.com/real...,94827
127720,241726,69900,[https://restb-hackathon.s3.amazonaws.com/real...,171826
628286,245304,195000,[https://restb-hackathon.s3.amazonaws.com/real...,50304
515141,245043,135000,[https://restb-hackathon.s3.amazonaws.com/real...,110043
...,...,...,...,...
268124,245436,62000,[https://restb-hackathon.s3.amazonaws.com/real...,183436
661781,244938,375000,[https://restb-hackathon.s3.amazonaws.com/real...,130062
487453,246168,169000,[https://restb-hackathon.s3.amazonaws.com/real...,77168
799135,242465,99000,[https://restb-hackathon.s3.amazonaws.com/real...,143465
