In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.preprocessing import MinMaxScaler
from time import time
import numpy as np

from utils.pipeline import create_pipeline
from utils.data_cleaning import load_and_clean

In [48]:
df = load_and_clean() #Load and clean dataset

In [71]:
X = df.drop('price', axis=1) #Features

y = df['price'] #Target

In [72]:
pipeline = create_pipeline(df) #Create pipeline for linear regression

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #Split data into training and testing sets

In [74]:
#debug
print("x_train shape:", X_train.shape)
print("x_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("Missing values in x_train:", X_train.isnull().sum().sum())
print("Missing values in x_test:", X_test.isnull().sum().sum())

x_train shape: (49683, 41)
x_test shape: (12421, 41)
y_train shape: (49683,)
Missing values in x_train: 107776
Missing values in x_test: 26785


In [75]:
# Fit the pipeline and predict
start = time()
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
execution_time = time() - start

In [83]:
#Evaluation
mse = mean_squared_error(y_test, y_pred)

y_binary = y_test > y_test.median()
y_pred_binary = y_pred > y_test.median()

accuracy = accuracy_score(y_binary, y_pred_binary)
f1 = f1_score(y_binary, y_pred_binary)
roc_auc = roc_auc_score(y_binary, y_pred)

# Log loss requires probabilities between 0 and 1
y_pred_proba = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))
logloss_value = log_loss(y_binary, y_pred_proba)

# Cross-validation scores
cv_scores = cross_val_score(
    pipeline, X, y, cv=5, scoring='neg_mean_squared_error'
)




In [84]:
# Output results
print(f"Execution Time: {execution_time:.2f}s")
print(f"MSE: {mse:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print(f"Log Loss: {logloss_value:.2f}")
print(f"Cross-Validation MSE: {cv_scores.mean():.2f}")

Execution Time: 4.85s
MSE: 13084.55
Accuracy: 0.82
F1 Score: 0.82
ROC-AUC: 0.90
Log Loss: 0.85
Cross-Validation MSE: -16703.46
