# 0. Rough Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Load the raw training data (path is relative to the working directory).
data = pd.read_csv("data/train.csv")

# Impute numeric columns with their median.
# The result is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: that chained in-place form is
# deprecated in pandas 2.x and does not modify `data` under Copy-on-Write.
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

# Impute string (object-dtype) columns with their most frequent value.
# NOTE(review): the date columns listed below are presumably read as strings,
# in which case they are part of `categorical_cols` too and are mode-filled
# here before being parsed -- confirm against the raw file.
categorical_cols = data.select_dtypes(include=["object"]).columns.tolist()
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Parse date columns; values that cannot be parsed become NaT.
date_cols = ["original_reg_date", "reg_date", "lifespan"]
for col in date_cols:
    data[col] = pd.to_datetime(data[col], errors="coerce")

# Replace any NaT left after parsing with a fixed placeholder date.
default_date = pd.Timestamp("1900-01-01")
data[date_cols] = data[date_cols].fillna(default_date)

# Replace every originally-string column with integer codes.
# `fit_transform` refits the encoder on each column, so the mapping of one
# column does not carry over to the next (and is not kept afterwards).
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Persist the processed table for the modelling sections.
data.to_csv("data/rough_processed.csv", index=False)
print("\nProcessed data saved to 'data/rough_processed.csv'")

# 1. Linear Regression

## 1.1 Ridge Regression

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Load the table produced by the preprocessing section.
data = pd.read_csv("data/rough_processed.csv")

# Predictors are every column except the target, the row identifier and
# `indicative_price`.
X = data.drop(["price", "listing_id", "indicative_price"], axis=1)
y = data["price"]

# `listing_id` is already dropped above, so all remaining columns are features.
feature_columns = X.columns.tolist()

# Standardise every feature before it reaches the linear model.
preprocessor = ColumnTransformer(
    transformers=[
        ("feature", StandardScaler(), feature_columns),
    ]
)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

# Sweep the L2 regularisation strength and report hold-out RMSE for each value.
alphas = [0.1, 0.5, 1.0, 1.5, 2.0, 5.0]
for idx, alpha in enumerate(alphas):
    ridge_reg = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", Ridge(alpha=alpha)),
    ])

    ridge_reg.fit(X_train, y_train)

    y_pred_valid = ridge_reg.predict(X_valid)
    valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

    print(f"Ridge Regression with alpha={alpha}")
    print(f" - Validation RMSE: {valid_rmse:.4f}")
    # Blank line between results, but not after the last one. Decided by
    # position so it stays correct if the list of alphas is edited.
    if idx < len(alphas) - 1:
        print()

## 1.2 Lasso Regression

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Load the table produced by the preprocessing section.
data = pd.read_csv("data/rough_processed.csv")

# Predictors are every column except the target, the row identifier and
# `indicative_price`.
X = data.drop(["price", "listing_id", "indicative_price"], axis=1)
y = data["price"]

# `listing_id` is already dropped above, so all remaining columns are features.
feature_columns = X.columns.tolist()

# Standardise every feature before it reaches the linear model.
preprocessor = ColumnTransformer(
    transformers=[
        ("feature", StandardScaler(), feature_columns),
    ]
)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

# Sweep the L1 regularisation strength and report hold-out RMSE for each value.
alphas = [0.1, 0.5, 1.0, 1.5, 2.0, 5.0]
for idx, alpha in enumerate(alphas):
    lasso_reg = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", Lasso(alpha=alpha)),
    ])

    lasso_reg.fit(X_train, y_train)

    y_pred_valid = lasso_reg.predict(X_valid)
    valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

    print(f"Lasso Regression with alpha={alpha}")
    print(f" - Validation RMSE: {valid_rmse:.4f}")
    # Blank line between results, but not after the last one. Decided by
    # position so it stays correct if the list of alphas is edited.
    if idx < len(alphas) - 1:
        print()

## 1.3 Elastic Net

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Load the table produced by the preprocessing section.
data = pd.read_csv("data/rough_processed.csv")

# Predictors are every column except the target, the row identifier and
# `indicative_price`.
X = data.drop(["price", "listing_id", "indicative_price"], axis=1)
y = data["price"]

# `listing_id` is already dropped above, so all remaining columns are features.
numerical_columns = X.columns.tolist()

# Standardise every feature before it reaches the linear model.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
    ]
)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

# Hyperparameters are named once so the model and the report cannot disagree.
alpha = 0.1
l1_ratio = 0.9

elastic_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", ElasticNet(alpha=alpha, l1_ratio=l1_ratio)),
])

elastic_reg.fit(X_train, y_train)

y_pred_valid = elastic_reg.predict(X_valid)
valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

print(f"Elastic Net with alpha={alpha} and l1_ratio={l1_ratio}")
print(f" - Validation RMSE: {valid_rmse:.4f}")

# 2. Gradient Boosting

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor


# Load the table produced by the preprocessing section.
data = pd.read_csv("data/rough_processed.csv")

# Predictors are every column except the target, the row identifier and
# `indicative_price`.
X = data.drop(["price", "listing_id", "indicative_price"], axis=1)
y = data["price"]

# `listing_id` is already dropped above, so all remaining columns are features.
feature_columns = X.columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("feature", StandardScaler(), feature_columns),
    ]
)

# NOTE(review): this section holds out 20% of the rows whereas the linear-model
# sections hold out 10%, so the reported RMSE values are not computed on the
# same validation rows -- confirm whether that is intended.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# The estimator is seeded like the split so that re-running the cell
# reproduces the same model and the same validation score.
gb_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(random_state=0)),
])

gb_reg.fit(X_train, y_train)

y_pred_valid = gb_reg.predict(X_valid)
valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

print("Gradient Boosting Regression")
print(f" - Validation RMSE: {valid_rmse:.4f}")

# 3. Deep Learning Method

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os


class Reg_Dataset(Dataset):
    """Map-style dataset pairing feature rows with regression targets.

    Args:
        X: Features, in any form accepted by ``torch.tensor``; indexed along
            its first dimension.
        y: Targets, in any form accepted by ``torch.tensor``. Size-1
            dimensions are squeezed away, so a column vector becomes 1-D.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X)
        targets = torch.tensor(y).squeeze()
        # With a single sample, squeeze() yields a 0-d tensor, which supports
        # neither len() nor indexing; keep it 1-D so the dataset stays usable.
        if targets.ndim == 0:
            targets = targets.reshape(1)
        self.y = targets

    def __len__(self):
        """Return the number of targets."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.y[index]


class Reg_MLP(nn.Module):
    """Fully connected regressor: ``n_dim -> 128 -> 64 -> 1``.

    Both hidden layers use a ReLU activation followed by dropout (p=0.2);
    the output layer is linear and yields one value per input row.

    Args:
        n_dim: Number of input features.
    """

    def __init__(self, n_dim):
        super().__init__()
        self.fc1 = nn.Linear(n_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the network output for ``x``, with a trailing dimension of 1."""
        hidden = x
        # The two hidden stages share the same linear -> ReLU -> dropout shape.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.fc3(hidden)


# Seed PyTorch's global RNG (weight initialisation, dropout, batch shuffling).
torch.random.manual_seed(0)

data = pd.read_csv("data/rough_processed.csv")
X = data.drop(["price", "listing_id", "indicative_price"], axis=1).to_numpy(dtype=np.float32)
y = data["price"].to_numpy(dtype=np.float32)

# Split first, then fit the scaler on the training rows only and apply it to
# both parts. The scaled arrays are what the network is trained and evaluated
# on; fitting on the training rows keeps validation statistics out of training.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_val = scaler.transform(X_val).astype(np.float32)

train_dataset = Reg_Dataset(X_train, y_train)
val_dataset = Reg_Dataset(X_val, y_val)

BATCH_SIZE = 32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = Reg_MLP(X_train.shape[1]).to(DEVICE)
criterion = nn.MSELoss().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

EPOCH = 100
PATIENCE = 10
best_val_loss = float("inf")
best_state = None  # copy of the weights that achieved `best_val_loss`
cnt = 0  # consecutive epochs without a validation improvement

for epoch in range(EPOCH):
    model.train()
    train_sq_err = 0.0

    for inputs, labels in train_dataloader:
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        # squeeze(-1) removes only the output dimension, so a final batch of
        # size one keeps shape (1,) and still lines up with `labels`.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so a short last batch does not
        # count as much as a full one.
        train_sq_err += loss.item() * labels.size(0)

    model.eval()
    val_sq_err = 0.0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(inputs).squeeze(-1)
            val_sq_err += criterion(outputs, labels).item() * labels.size(0)

    if (epoch + 1) % 5 == 0:
        torch.cuda.empty_cache()

    # Per-epoch RMSE over all samples.
    avg_train_loss = np.sqrt(train_sq_err / len(train_dataset))
    avg_val_loss = np.sqrt(val_sq_err / len(val_dataset))

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{EPOCH} -- train loss: {avg_train_loss:.4f}, val loss: {avg_val_loss:.4f} ")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Snapshot the weights; clone() detaches the copy from later updates.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        cnt = 0
    else:
        cnt += 1
        if cnt >= PATIENCE:
            print(f"\n*****  Early Stop at Epoch {epoch + 1}  *****\n")
            break

# Evaluate and save the best weights seen, not those of the last epoch
# (which, after early stopping, are PATIENCE epochs past the best).
if best_state is not None:
    model.load_state_dict(best_state)

model.eval()
predictions = []
groundtruth = []
with torch.no_grad():
    for inputs, labels in val_dataloader:
        outputs = model(inputs.to(DEVICE)).squeeze(-1).cpu()
        # Both tensors are 1-D here, so tolist() always yields a list, even
        # for a batch holding a single sample.
        predictions.extend(outputs.tolist())
        groundtruth.extend(labels.reshape(-1).tolist())

save_dir = "models/"
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_dir, "mlp_weights.pth"))
del model, criterion, optimizer
torch.cuda.empty_cache()

predictions = np.asarray(predictions)
groundtruth = np.asarray(groundtruth)

valid_rmse = np.sqrt(np.mean((predictions - groundtruth) ** 2))
print("\nMulti-layer Perceptron Regression")
print(f" - Validation RMSE: {valid_rmse:.4f}")