In [30]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [31]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

In [34]:
# Input file names; being relative, they are resolved against the current
# working directory when the CSVs are read.
train_file, test_file, sample_submission_file = (
    "train.csv",
    "test.csv",
    "sample_submission.csv",
)

In [35]:
# Load the three CSV inputs (in the same order as before) into DataFrames.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, sample_submission_file)
)

In [36]:
# Column the model is trained to predict.
target = "SalePrice"

# Numeric input columns (routed to the "num" transformer of the preprocessor).
continuous_features = [
    "GrLivArea",
    "LotArea",
]

# Categorical input columns (routed to the "cat" transformer of the preprocessor).
categorical_features = [
    "Neighborhood",
    "HouseStyle",
]

In [37]:
# Fail fast if the training frame lacks any feature or the target column.
missing_columns = [
    col
    for col in (*continuous_features, *categorical_features, target)
    if col not in train.columns
]
if len(missing_columns) > 0:
    raise KeyError(f"The following columns are missing from the train dataset: {missing_columns}")

In [38]:
# Split the training frame into the target series and the feature frame
# (continuous columns first, then categorical ones).
y_train_full = train[target]
X_train_full = train.loc[:, [*continuous_features, *categorical_features]]

In [39]:
# Median of each numeric feature, computed from the training features only and reused
# for both frames so that train and test rows are imputed with the same values.
# NOTE(review): these medians are taken before the train/validation split below, so
# validation rows contribute to them — consider moving imputation into the pipeline.
feature_medians = X_train_full.median(numeric_only=True)
X_train_full = X_train_full.fillna(feature_medians)

# NOTE(review): filling a missing target with the median invents labels; dropping such
# rows may be preferable — confirm whether the target can actually be missing.
y_train_full = y_train_full.fillna(y_train_full.median())

# Model features in `test` receive the *training* medians; every other numeric column
# still falls back to its own test-set median, as before.
test = test.fillna(feature_medians).fillna(test.median(numeric_only=True))

In [40]:
# Continuous columns are passed through a StandardScaler with default settings.
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; handle_unknown="ignore" is set so that
# categories not seen during fitting do not raise at transform time.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [41]:
# Route each column group to its transformer: "num" handles the continuous
# features, "cat" handles the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, continuous_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [42]:
# End-to-end estimator: preprocessing followed by an ordinary linear regression,
# so the same transformations are applied at fit and predict time.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [43]:
# Hold out 20% of the training rows for validation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

# Fit the full pipeline on the training part, then predict the held-out part.
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

In [44]:
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        precision: Number of decimal places kept in the result (default 2).

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``,
        rounded to ``precision`` decimals.

    NOTE(review): the underlying scikit-learn metric is expected to reject
    negative values, and a linear model can predict them — confirm inputs
    are non-negative.
    """
    squared_log_error = mean_squared_log_error(y_true, y_pred)
    return round(np.sqrt(squared_log_error), precision)

# Score the held-out predictions (rounded to the function's default precision)
# and report the value.
rmsle = compute_rmsle(y_true=y_val, y_pred=y_val_pred)
print("Validation RMSLE:", rmsle)

Validation RMSLE: 0.2


In [45]:
# Select the feature columns from the test frame in the same order used for the
# training features, then predict with the fitted pipeline.
X_test = test.loc[:, continuous_features + categorical_features]
test_predictions = model.predict(X_test)

In [46]:
output_file = "submission.csv"

# Build the submission from the loaded sample-submission template so that its
# existing columns (and their order) are preserved.
# NOTE(review): assumes the template rows are in the same order as `test`, and
# presumably already contain a column named after `target` — confirm; if that
# column is absent this appends it.
if len(sample_submission) != len(test_predictions):
    raise ValueError(
        f"Expected {len(sample_submission)} predictions to fill the sample submission, "
        f"got {len(test_predictions)}"
    )
submission = sample_submission.copy()
submission[target] = test_predictions

# Write the file first, so the message below is only printed once it exists.
submission.to_csv(output_file, index=False)
print(f"Submission file saved as {output_file}")

Submission file saved as submission.csv
