# List of project's dependencies
The imports listed below are required for the program to work properly; install the corresponding packages before running the notebook (see also `requirements.txt`).

In [None]:
#List of imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error
from functions import fill_na, normalize_numerical_columns

# Loading data
To load the data properly, download the dataset from [this Kaggle page](https://www.kaggle.com/datasets/krzysztofjamroz/apartment-prices-in-poland/data)
and place its CSV files in the `./data` directory.

In [None]:
# Apartment price snapshots (the non-rental CSV files) that are loaded below.
price_data_array : list = [
    './data/apartments_pl_2024_04.csv' 
]

# Rental price snapshots. This list is defined here but is not read by this cell.
price_data_array_rent : list = [
    './data/apartments_rent_pl_2023_11.csv', 
    './data/apartments_rent_pl_2023_12.csv',
    './data/apartments_rent_pl_2024_01.csv', 
    './data/apartments_rent_pl_2024_02.csv',
    './data/apartments_rent_pl_2024_03.csv', 
    './data/apartments_rent_pl_2024_04.csv' 
]

# ignore_index=True gives the combined frame one continuous 0..n-1 index, so rows
# stay uniquely labelled when more than one snapshot file is listed above.
# With a single file the result is the same as before.
data = pd.concat(
    [pd.read_csv(data_set) for data_set in price_data_array],
    ignore_index=True,
)
data.head()

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
0,312f2788e54076b94f950baaeced3e54,szczecin,blockOfFlats,91.0,4.0,2.0,4.0,1932.0,53.465463,14.593251,...,0.182,condominium,brick,premium,no,no,no,no,yes,512000
1,23d558163bb1c7863c73485048e50858,szczecin,,53.8,2.0,5.0,5.0,,53.429429,14.554642,...,0.073,cooperative,,,no,no,yes,no,no,400000
2,cb5a5f936e4486afd41637860100b033,szczecin,tenement,130.0,4.0,1.0,3.0,1930.0,53.429368,14.552341,...,0.114,condominium,brick,low,no,yes,no,no,yes,975000
3,7d0c31d5409caab173571cce3dcdf702,szczecin,blockOfFlats,68.61,3.0,4.0,4.0,1997.0,53.456213,14.583222,...,0.304,condominium,brick,,no,yes,no,no,yes,599000
4,6d947e9a6521b8e608d3fcd6ff2f089a,szczecin,blockOfFlats,80.0,3.0,1.0,2.0,2017.0,53.495272,14.590178,...,0.34,condominium,brick,premium,yes,yes,no,yes,no,999999


# Preparing data
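Prepare the data: drop unused columns, fill missing values, one-hot encode the categorical and boolean columns, and normalize the numerical columns (using the helpers imported from `functions`).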

In [None]:
# Prepare the feature table: drop the identifier, fill gaps, one-hot encode the
# categorical/boolean columns and normalize the numerical columns.
data=data.drop('id', axis=1)

numerical_columns = ['squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance']
categorical_columns = ['city', 'type', 'ownership', 'buildingMaterial', 'condition']
boolean_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
drop_columns = []
output_column = 'price'


data = data.drop(drop_columns, axis=1)

# The helpers' return values are not used, so they are presumably modifying
# `data` in place -- TODO confirm against functions.fill_na.
fill_na(data, numerical_columns, 'mean')
fill_na(data, boolean_columns, 'false')

# Ask get_dummies for integer indicator columns directly. Casting the whole
# frame with .astype(int) (as was done before) also truncated every float
# feature -- e.g. latitude 53.465 -> 53 and pharmacyDistance 0.182 -> 0 --
# before normalization, throwing away most of the information in those columns.
data = pd.get_dummies(data, columns=categorical_columns, dtype=int)
data = pd.get_dummies(data, columns=boolean_columns, drop_first=True, dtype=int)

normalize_numerical_columns(data, numerical_columns)

data

Unnamed: 0,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,centreDistance,poiCount,schoolDistance,...,buildingMaterial_brick,buildingMaterial_concreteSlab,condition_low,condition_premium,hasParkingSpace_yes,hasBalcony_yes,hasElevator_no,hasElevator_yes,hasSecurity_yes,hasStorageRoom_yes
0,0.528,0.6,0.035714,0.107143,0.471264,0.8,0.000000,0.3125,0.033175,0.00,...,1,0,0,1,0,0,1,0,0,1
1,0.224,0.2,0.142857,0.142857,0.787356,0.8,0.000000,0.0000,0.284360,0.00,...,0,0,0,0,0,0,0,1,0,0
2,0.840,0.6,0.000000,0.071429,0.459770,0.8,0.000000,0.0000,0.317536,0.00,...,1,0,1,0,0,1,1,0,0,1
3,0.344,0.4,0.107143,0.107143,0.844828,0.8,0.000000,0.1875,0.033175,0.00,...,1,0,0,0,0,1,1,0,0,1
4,0.440,0.4,0.000000,0.035714,0.959770,0.8,0.000000,0.5000,0.018957,0.25,...,1,0,0,1,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19254,0.648,0.8,0.035714,0.035714,0.787356,0.8,0.333333,0.0000,0.203791,0.00,...,1,0,0,0,0,1,1,0,0,0
19255,0.584,0.4,0.071429,0.071429,0.431034,0.8,0.444444,0.0000,0.232227,0.00,...,1,0,0,0,0,0,1,0,0,1
19256,0.664,0.8,0.035714,0.107143,0.224138,0.8,0.444444,0.0625,0.132701,0.00,...,1,0,0,0,0,0,1,0,0,1
19257,0.200,0.2,0.000000,0.000000,0.787356,0.8,0.444444,0.0000,0.199052,0.00,...,1,0,0,0,1,0,1,0,0,0


### Drop price outliers (rows outside 1.5 × IQR of the price quartiles)

In [None]:
# Remove rows whose price lies more than 1.5 * IQR outside the quartiles.
Q1, Q3 = (data['price'].quantile(level) for level in (0.25, 0.75))
IQR = Q3 - Q1

# Inclusive acceptance window around the interquartile range.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

not_too_low = data['price'] >= lower_bound
not_too_high = data['price'] <= upper_bound
data = data[not_too_low & not_too_high]

### Divide data into tensors

In [None]:
# Separate the predictors from the target column.
X = data.drop(columns='price')
Y = data['price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Features become float32 tensors of shape (rows, columns) on `device`.
X_train, X_test = (
    torch.tensor(np.array(frame), dtype=torch.float32).to(device)
    for frame in (X_train, X_test)
)
# Targets become float32 column vectors of shape (rows, 1) on `device`.
Y_train, Y_test = (
    torch.tensor(np.array(series), dtype=torch.float32).reshape(-1, 1).to(device)
    for series in (Y_train, Y_test)
)

### MLP model 
Multilayer Perceptron 

In [None]:
class MLP(nn.Module):
    """Fully connected network with four hidden layers and a linear output layer.

    Layers are exposed as ``fc1`` .. ``fc5``; a shared ``relu`` activation is
    applied after each of the first four layers, and ``fc5`` produces the raw
    (un-activated) output.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, output_size):
        """Create the five linear layers.

        Args:
            input_size: number of input features.
            hidden_size1..hidden_size4: widths of the successive hidden layers.
            output_size: number of output values per sample.
        """
        super().__init__()
        widths = (input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, output_size)
        # Register fc1..fc5 in order, each mapping one width to the next.
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(fan_in, fan_out))
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through the hidden layers (each followed by ReLU) and the output layer."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.relu(hidden_layer(x))
        return self.fc5(x)

In [None]:
# Initialize the model
# Seed PyTorch's RNG so the randomly initialized layer weights are the same on
# every run; without this, results differ between runs even though the
# train/test split itself is seeded.
torch.manual_seed(42)

# Layer widths: the input width follows the number of feature columns, and the
# network emits a single value (the predicted price) per sample.
input_size = X_train.shape[1]
hidden_size1 = 100
hidden_size2 = 150
hidden_size3 = 70
hidden_size4 = 20
output_size = 1

model = MLP(input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, output_size).to(device)

# Mean squared error loss, optimized with Adadelta.
criterion = nn.MSELoss()
optimizer = optim.Adadelta(model.parameters(), lr=1.0)

In [None]:
# Train the model with mini-batch gradient descent, then evaluate it on the test set.
num_epochs = 100
batch_size = 100
num_train_samples = len(X_train)

for epoch in range(num_epochs):
    model.train()
    epoch_loss_sum = 0.0  # sum of per-sample losses seen during this epoch

    # Walk through the training tensors in consecutive slices of `batch_size` rows.
    for i in range(0, num_train_samples, batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = Y_train[i:i+batch_size]

        # Forward pass (model and inputs are already on `device`, so no extra move is needed)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # MSELoss averages over the batch; weight by batch length so the
        # shorter final batch does not skew the epoch mean.
        epoch_loss_sum += loss.item() * len(batch_X)

    # Print progress: mean training loss over the whole epoch rather than the
    # loss of whichever mini-batch happened to come last.
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss_sum / num_train_samples:.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test)
    test_loss = criterion(y_pred_tensor, Y_test)
    print('Test Loss:', test_loss.item())

# Convert predictions and true labels back to numpy arrays
y_pred_tensor_cpu = y_pred_tensor.to("cpu")
y_pred = y_pred_tensor_cpu.numpy()
y_true = Y_test.to("cpu").numpy()

In [None]:
# Score the test-set predictions against the true prices.
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared error

# Report each metric on its own line.
for metric_label, metric_value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(metric_label, metric_value)