In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [3]:
# Load the training data
train_data = pd.read_csv('/Users/admin-20218/Downloads/house-prices-advanced-regression-techniques/train.csv')

# Load the test data
test_data = pd.read_csv('/Users/admin-20218/Downloads/house-prices-advanced-regression-techniques/test.csv')

# Load the sample submission file
sample_submission = pd.read_csv('/Users/admin-20218/Downloads/house-prices-advanced-regression-techniques/sample_submission.csv')

In [4]:
# Print the first few rows of the dataset to verify that it loaded correctly
print(train_data.head(10))
print(sample_submission.head(10))

scaler = StandardScaler()
scaled_sample_submission = scaler.fit_transform(sample_submission)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1         Lvl    AllPub  ...       

In [5]:
# Data setup
X_train = train_data.drop(["Id", "SalePrice"], axis=1)
y_train = train_data["SalePrice"]
X_test = test_data.drop("Id", axis=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [6]:
# Feature selection
continuous_features = ["LotArea", "GrLivArea"]
categorical_features = ["MSZoning", "Neighborhood"]

In [7]:
# Feature processing
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown="ignore")

X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_val[continuous_features] = scaler.transform(X_val[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

X_train_processed = encoder.fit_transform(X_train[categorical_features])
X_val_processed = encoder.transform(X_val[categorical_features])
X_test_processed = encoder.transform(X_test[categorical_features])


In [8]:
# Model training
model = LinearRegression()
model.fit(np.hstack((X_train[continuous_features], X_train_processed.toarray())), y_train)


In [9]:
# Model evaluation
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

y_pred = model.predict(np.hstack((X_val[continuous_features], X_val_processed.toarray())))
rmse = sqrt(mean_squared_error(y_val, y_pred))
rmsle = compute_rmsle(np.log(y_val), np.log(y_pred))


In [10]:
print("RMSE:", rmse)
print("RMSLE:", rmsle)

RMSE: 42645.40610193754
RMSLE: 0.02
