In [197]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

You can find my datasets as a zip file on my GitHub at this [link](https://github.com/MaratMedvedev/Data-mining-lab-project/tree/main/Preprocessed%20datasets).

In [198]:
# Uncomment to extract the dataset archive before loading the CSV
# (the /content path suggests a Google Colab session — TODO confirm).
# !unzip /content/datasets.zip

In [199]:
# Load the prepared sale-listing table; the path is relative to the working directory.
df = pd.read_csv('prepared_dataset_sale_estate.csv')
# Display the column names as the cell output.
df.columns

Index(['Unnamed: 0', 'price', 'beds', 'livings', 'wc', 'area', 'street_width',
       'age', 'ketchen', 'furnished', 'location.lat', 'location.lng',
       'category_2', 'category_3', 'category_6', 'category_7', 'category_9',
       'category_10', 'category_12', 'category_20', 'category_21',
       'category_22', 'street_direction_0.0', 'street_direction_1.0',
       'street_direction_2.0', 'street_direction_3.0', 'street_direction_4.0',
       'street_direction_5.0', 'street_direction_6.0', 'street_direction_7.0',
       'street_direction_8.0', 'street_direction_9.0',
       'street_direction_10.0'],
      dtype='object')

In [200]:
# 'Unnamed: 0' is the nameless first CSV column (presumably an index written by
# an earlier to_csv — verify). errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [201]:
class NeuralNetwork(nn.Module):
    """Fully connected regressor: input_size -> 100 -> 50 -> 1, with ReLU after the two hidden layers."""

    def __init__(self, input_size):
        """Build the three linear layers; `input_size` is the number of features per sample."""
        super().__init__()
        first_hidden, second_hidden = 100, 50
        self.fc1 = nn.Linear(input_size, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run `x` through both hidden layers and return the un-activated output of `fc3`."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

I experiment with truncating the dataset to remove outliers. I create DataFrames in which different percentages of the highest and lowest values, ranked by the 'price' column, have been truncated from the dataset.

In [202]:
def truncate_extreme_values(df, x):
    """
    Keep only the rows whose 'price' lies between two symmetric quantiles.

    Parameters:
    df (DataFrame): Input DataFrame; it must contain a 'price' column.
    x (float): Percentage (on a 0-100 scale) cut from each tail. The bounds are
        the x/100 and 1 - x/100 quantiles of 'price'.

    Returns:
    DataFrame: The rows of `df` whose 'price' is inside the two bounds, bounds included.
    """
    tail_fraction = x / 100
    price = df['price']
    price_floor = price.quantile(tail_fraction)
    price_ceiling = price.quantile(1 - tail_fraction)

    # between() is inclusive on both ends by default, i.e. floor <= price <= ceiling.
    return df[price.between(price_floor, price_ceiling)]

In [203]:
# Cut 5% from each tail of 'price'. Note that `df` is rebound, so running this
# cell again trims the already-trimmed frame a second time.
df = truncate_extreme_values(df, x=5)

In [204]:
def normalize_data(df, numeric_columns):
    """
    Standardize the given columns with scikit-learn's StandardScaler.

    Parameters:
    df (DataFrame): Input frame; it is left unmodified.
    numeric_columns (list): Names of the columns to standardize.

    Returns:
    DataFrame: A copy of `df` in which `numeric_columns` hold the scaler's
        fit_transform output; every other column is carried over unchanged.

    NOTE(review): the scaler is fitted on every row that is passed in. Calling
    this before a train/test split lets test-set statistics influence the
    training features.
    """
    # Work on a copy so the caller's frame is never mutated and pandas does not
    # emit SettingWithCopyWarning when `df` is itself a filtered slice.
    normalized = df.copy()
    if len(numeric_columns) == 0:
        # Nothing to scale; the scaler cannot be fitted on zero columns.
        return normalized
    scaler = StandardScaler()
    normalized[numeric_columns] = scaler.fit_transform(normalized[numeric_columns])
    return normalized
def normalize_price(x):
    """Return the natural log of `x` divided by 20."""
    log_price = np.log(x)
    return log_price / 20
def undo_normalization_price(x):
    """Return exp(20 * x), the inverse of the ln(x) / 20 price scaling."""
    log_price = x * 20
    return np.exp(log_price)

In [205]:
# Columns that get standardized; the remaining columns are left as they are.
numeric_columns = ['beds', 'livings', 'wc', 'area', 'street_width', 'age', 'location.lat', 'location.lng']

# Keep only listings priced above 100. Filter first and copy afterwards: this
# copies just the surviving rows and hands normalize_data an independent frame
# rather than a slice (which can trigger SettingWithCopyWarning on assignment).
df_normalized = normalize_data(df[df['price'] > 100].copy(), numeric_columns)
# Cast every column to float in a single call instead of a per-column loop.
df_normalized = df_normalized.astype(float)

In [206]:
# normalize_price is built on np.log, which is vectorized, so transform the
# whole column in one call instead of element by element via .apply.
# Note: the column is overwritten, so re-running this cell transforms it twice.
df_normalized['price'] = normalize_price(df_normalized['price'])

In [207]:
random_seed = 42

# Features are every column except the (already log-scaled) target 'price'.
X = df_normalized.drop(columns='price')
y = df_normalized['price']

# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

In [208]:
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [209]:
def _as_float32_tensor(table):
    """Convert a pandas object's values to a float32 tensor placed on `device`."""
    return torch.from_numpy(table.values.astype(np.float32)).to(device)


# Features keep their 2-D layout; targets are reshaped into a single column so
# they line up with the model's one-value-per-row output.
# Note: the names are rebound from pandas objects to tensors, so this cell
# cannot be re-run without first re-running the split.
X_train, X_test = _as_float32_tensor(X_train), _as_float32_tensor(X_test)
y_train = _as_float32_tensor(y_train).view(-1, 1)
y_test = _as_float32_tensor(y_test).view(-1, 1)

In [213]:
# Seed PyTorch's random number generator so the initial weights are the same on
# every run; previously only the train/test split was seeded.
torch.manual_seed(random_seed)

input_size = X_train.shape[1]  # number of feature columns
model = NeuralNetwork(input_size).to(device)
criterion = nn.L1Loss()  # mean absolute error
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

In [214]:
# Training the model
num_epochs = 30000
log_every = 1000  # print the training loss once every this many epochs

# The mode never changes inside the loop, so switch to training mode once
# instead of on every one of the 30000 iterations.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # Full-batch step: the entire training set goes through the network at once.
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if epoch % log_every == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [0/30000], Loss: 0.8753
Epoch [1000/30000], Loss: 0.0112
Epoch [2000/30000], Loss: 0.0093
Epoch [3000/30000], Loss: 0.0089
Epoch [4000/30000], Loss: 0.0085
Epoch [5000/30000], Loss: 0.0083
Epoch [6000/30000], Loss: 0.0082
Epoch [7000/30000], Loss: 0.0081
Epoch [8000/30000], Loss: 0.0079
Epoch [9000/30000], Loss: 0.0079
Epoch [10000/30000], Loss: 0.0077
Epoch [11000/30000], Loss: 0.0069
Epoch [12000/30000], Loss: 0.0068
Epoch [13000/30000], Loss: 0.0065
Epoch [14000/30000], Loss: 0.0064
Epoch [15000/30000], Loss: 0.0064
Epoch [16000/30000], Loss: 0.0062
Epoch [17000/30000], Loss: 0.0064
Epoch [18000/30000], Loss: 0.0063
Epoch [19000/30000], Loss: 0.0062
Epoch [20000/30000], Loss: 0.0061
Epoch [21000/30000], Loss: 0.0061
Epoch [22000/30000], Loss: 0.0063
Epoch [23000/30000], Loss: 0.0061
Epoch [24000/30000], Loss: 0.0060
Epoch [25000/30000], Loss: 0.0060
Epoch [26000/30000], Loss: 0.0060
Epoch [27000/30000], Loss: 0.0060
Epoch [28000/30000], Loss: 0.0062
Epoch [29000/30000], Loss: 

In [215]:
# Evaluating the model
model.eval()
with torch.no_grad():
    predicted = model(X_test)
    test_loss = criterion(torch.exp(predicted*20), torch.exp(y_test*20))
    print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 196139.0469
