# 0. Rough Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Rough preprocessing: impute missing values, normalise the date columns and
# label-encode every text column, then write the result to
# data/rough_processed.csv for the modelling cells.

# Load the raw training data from the relative path.
data = pd.read_csv("data/train.csv")

# Impute numeric columns with their median. The filled Series is assigned back
# to the frame rather than calling `data[col].fillna(..., inplace=True)`:
# that chained in-place call acts on an intermediate object, raises a
# FutureWarning on pandas 2.x and leaves `data` untouched under Copy-on-Write.
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].median())

# Impute object-dtype columns with their most frequent value (same assignment
# pattern as above, for the same reason).
categorical_cols = data.select_dtypes(include=["object"]).columns.tolist()
for col in categorical_cols:
    mode_value = data[col].mode()[0]
    data[col] = data[col].fillna(mode_value)

# Parse the date columns; values that cannot be parsed become NaT.
# NOTE(review): "lifespan" is parsed as a date here -- confirm against the
# raw data that it really holds dates.
date_cols = ["original_reg_date", "reg_date", "lifespan"]
for col in date_cols:
    data[col] = pd.to_datetime(data[col], errors="coerce")

# Replace any NaT left by the parsing step with a sentinel date.
default_date = pd.Timestamp("1900-01-01")
data[date_cols] = data[date_cols].fillna(default_date)

# Label-encode every column that was object-dtype when the file was loaded.
# A fresh encoder is fitted per column; the encoders are not kept.
# NOTE(review): if the date columns were read as text they are also in
# `categorical_cols`, so they end up as integer codes here -- presumably
# intentional, since the modelling cells scale every feature as numeric.
for col in categorical_cols:
    data[col] = LabelEncoder().fit_transform(data[col])

# Save the processed data to a new CSV file.
data.to_csv("data/rough_processed.csv", index=False)
print("\nProcessed data saved to 'data/rough_processed.csv'")

# 1. Linear Regression

## 1.1 Ridge Regression

In [11]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Reload the table written by the rough preprocessing step.
data = pd.read_csv("data/rough_processed.csv")

# `price` is the target; `indicative_price` is removed from the features too.
y = data["price"]
X = data.drop(columns=["price", "indicative_price"])

# Every remaining column except `listing_id` is standardised. `listing_id` is
# not listed in any transformer, so it does not reach the model
# (ColumnTransformer's default is remainder="drop").
numerical_columns = X.columns[X.columns != "listing_id"].tolist()
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numerical_columns)]
)

# Hold out 10% of the rows for validation with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit a scaler + Ridge pipeline for each regularisation strength and report
# the validation RMSE of each one.
alphas = [0.1, 0.5, 1.0, 1.5, 2.0, 5.0]
for alpha in alphas:
    gb_reg = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", Ridge(alpha=alpha)),
        ]
    )
    gb_reg.fit(X_train, y_train)

    y_pred_valid = gb_reg.predict(X_valid)
    valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

    print(f"Ridge Regression with alpha={alpha}")
    print(f" - Validation RMSE: {valid_rmse}")
    # Blank separator line between results, but not after the final one.
    if alpha != alphas[-1]:
        print()

Ridge Regression with alpha=0.1
 - Validation RMSE: 44444.84984507896

Ridge Regression with alpha=0.5
 - Validation RMSE: 44444.29648157189

Ridge Regression with alpha=1.0
 - Validation RMSE: 44443.607248066626

Ridge Regression with alpha=1.5
 - Validation RMSE: 44442.920746978

Ridge Regression with alpha=2.0
 - Validation RMSE: 44442.23696446152

Ridge Regression with alpha=5.0
 - Validation RMSE: 44438.19059563258


## 1.2 Lasso Regression

In [12]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Load the preprocessed dataset saved by the rough preprocessing step.
data = pd.read_csv("data/rough_processed.csv")

# Separate the target (`price`) from the features; `indicative_price` is
# excluded from the features as well.
y = data["price"]
X = data.drop(columns=["price", "indicative_price"])

# Scale all feature columns other than `listing_id`. Columns not named in a
# transformer are left out of the model input (ColumnTransformer's default is
# remainder="drop").
numerical_columns = X.columns[X.columns != "listing_id"].tolist()
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numerical_columns)]
)

# 90/10 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Sweep the L1 penalty strength, fitting one pipeline per value and printing
# its validation RMSE.
alphas = [0.1, 0.5, 1.0, 1.5, 2.0, 5.0]
for alpha in alphas:
    gb_reg = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", Lasso(alpha=alpha)),
        ]
    )
    gb_reg.fit(X_train, y_train)

    y_pred_valid = gb_reg.predict(X_valid)
    valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

    print(f"Lasso Regression with alpha={alpha}")
    print(f" - Validation RMSE: {valid_rmse}")
    # Blank separator line between results, but not after the final one.
    if alpha != alphas[-1]:
        print()

Lasso Regression with alpha=0.1
 - Validation RMSE: 44444.87806403614

Lasso Regression with alpha=0.5
 - Validation RMSE: 44444.43679305127

Lasso Regression with alpha=1.0
 - Validation RMSE: 44443.88617189834

Lasso Regression with alpha=1.5
 - Validation RMSE: 44443.336834182104

Lasso Regression with alpha=2.0
 - Validation RMSE: 44442.789811524046

Lasso Regression with alpha=5.0
 - Validation RMSE: 44439.5246486033


## 1.3 Elastic Net

In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Elastic Net hyperparameters. They are named once so that the model and the
# report line printed below cannot drift apart.
ENET_ALPHA = 0.1
ENET_L1_RATIO = 0.9

# Reload the table written by the rough preprocessing step.
data = pd.read_csv("data/rough_processed.csv")

# `price` is the target; `indicative_price` is removed from the features too.
X = data.drop(columns=["price", "indicative_price"])
y = data["price"]

# Every remaining column except `listing_id` is standardised; `listing_id` is
# not listed in any transformer, so it does not reach the model
# (ColumnTransformer's default is remainder="drop").
numerical_columns = [col for col in X.columns if col != "listing_id"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
    ]
)

# Hold out 10% of the rows for validation with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

gb_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", ElasticNet(alpha=ENET_ALPHA, l1_ratio=ENET_L1_RATIO)),
])

gb_reg.fit(X_train, y_train)

y_pred_valid = gb_reg.predict(X_valid)
valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

# The header is formatted from the constants actually passed to the model.
print(f"Elastic Net with alpha={ENET_ALPHA} and l1_ratio={ENET_L1_RATIO}")
print(f" - Validation RMSE: {valid_rmse}")

Elastic Net with alpha=0.1 and l1_ratio=0.9
 - Validation RMSE: 44291.586553100904


# 2. Gradient Boosting

In [14]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor


# Reload the table written by the rough preprocessing step.
data = pd.read_csv("data/rough_processed.csv")

# `price` is the target; `indicative_price` is removed from the features too.
X = data.drop(columns=["price", "indicative_price"])
y = data["price"]

# Every remaining column except `listing_id` goes through the scaler;
# `listing_id` is not listed in any transformer, so it does not reach the
# model (ColumnTransformer's default is remainder="drop").
numerical_columns = [col for col in X.columns if col != "listing_id"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
    ]
)

# NOTE(review): this cell holds out 20% of the rows, while the linear-model
# cells hold out 10%, so the RMSE values are computed on different validation
# sets -- confirm whether that is intended before comparing them.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# random_state is fixed so that a fresh kernel reproduces the same fit: the
# underlying trees permute features at each split, which can change the
# result when several splits tie.
gb_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(random_state=0)),
])

gb_reg.fit(X_train, y_train)

y_pred_valid = gb_reg.predict(X_valid)
valid_rmse = root_mean_squared_error(y_valid, y_pred_valid)

print("Gradient Boosting Regression")
print(f" - Validation RMSE: {valid_rmse}")

Gradient Boosting Regression
 - Validation RMSE: 29170.49124293392


# 3. Deep Learning Method