In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# RMSLE function
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)`` as a
        built-in ``float`` rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    root = float(np.sqrt(msle))
    return round(root, precision)

# Progress marker only: this line performs no check of its own. It is reached
# only if every import statement earlier in this cell ran without raising.
print("Checking that Libraries are imported successfully.")




Checking that Libraries are imported successfully.


In [5]:

# Load dataset
# Location of the Kaggle training CSV, relative to the notebook's working directory.
DATA_PATH = Path("../data/house-prices/train.csv")  # path to load data

if not DATA_PATH.exists():
    # No CSV on disk: fall back to a tiny synthetic frame that carries the
    # target and the four feature columns, so the notebook still runs.
    synthetic_records = {
        "SalePrice": [200000, 185000, 250000, 140000, 300000],
        "GrLivArea": [1500, 1200, 2000, 900, 2500],
        "LotArea":   [8000, 7000, 10000, 6000, 12000],
        "Neighborhood": ["NAmes", "CollgCr", "Crawfor", "NAmes", "Somerst"],
        "HouseStyle":   ["1Story", "2Story", "2Story", "1Story", "2Story"],
    }
    df = pd.DataFrame(synthetic_records)
    print("Using synthetic dataset")
else:
    df = pd.read_csv(DATA_PATH)
    print("Loaded Kaggle train.csv")

# Last expression of the cell: rendered by the notebook as a preview table.
df.head()

Loaded Kaggle train.csv


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:

# Feature selection
# Two continuous and two categorical predictors are used for the baseline model.
cont_features = ["GrLivArea", "LotArea"]
cat_features = ["Neighborhood", "HouseStyle"]

target = "SalePrice"
X = df[cont_features + cat_features]
y = df[target]

print("Continuous:", cont_features)
print("Categorical:", cat_features)
# No bare `X.head()` here: a notebook only renders the final expression of a
# cell, so a mid-cell preview is silently discarded. Inspect `X.head()` as the
# last line of its own cell when a preview is wanted.

# Train/Test Split
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: the two splits must partition the rows exactly.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)




Continuous: ['GrLivArea', 'LotArea']
Categorical: ['Neighborhood', 'HouseStyle']
Train shape: (1168, 4)
Test shape: (292, 4)


In [7]:
#  Processing
# Continuous: fill missing + scale
# Imputation medians come from the training split only (no test-set statistics
# leak into preprocessing); they are computed once and reused for both splits.
train_medians = X_train[cont_features].median()
X_train_cont = X_train[cont_features].fillna(train_medians)
X_test_cont  = X_test[cont_features].fillna(train_medians)

# The scaler is fitted on the training split and only applied to the test split.
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train_cont)
X_test_cont_scaled  = scaler.transform(X_test_cont)

# Categorical: fill missing + one-hot
X_train_cat = X_train[cat_features].fillna("Missing")
X_test_cat  = X_test[cat_features].fillna("Missing")

# The training split alone defines the encoding (which level is the dropped
# baseline and which dummy columns exist). Float dummies keep the matrix
# numeric regardless of pandas' default dummy dtype.
X_train_cat_enc = pd.get_dummies(X_train_cat, drop_first=True, dtype=float)

# Encode the test split WITHOUT drop_first, then project onto the training
# columns. Calling drop_first on the test split independently would drop the
# first level *present in the test rows*; when that differs from the training
# baseline, a real level loses its indicator and those rows are silently
# encoded as the baseline. Reindexing instead:
#   * discards the training baseline level and any level unseen in training
#     (both become all-zero rows, i.e. the baseline), and
#   * adds a zero column for every training level absent from the test split.
X_test_cat_enc = pd.get_dummies(X_test_cat, dtype=float).reindex(
    columns=X_train_cat_enc.columns, fill_value=0.0
)

# Combine
X_train_processed = np.hstack([X_train_cont_scaled, X_train_cat_enc.values])
X_test_processed  = np.hstack([X_test_cont_scaled, X_test_cat_enc.values])

# Both matrices must expose the same feature columns in the same order.
assert X_train_processed.shape[1] == X_test_processed.shape[1], "train/test feature mismatch"

print("Processed train shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)

Processed train shape: (1168, 33)
Processed test shape: (292, 33)


In [8]:

# Model Training
# Ordinary least squares on the preprocessed training matrix; `fit` returns
# the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train_processed, y_train)

print("Model trained:", type(model).__name__)


# Evaluation
# Floor predictions at 1.0 so the logarithm inside RMSLE never sees a
# negative value.
y_pred = np.maximum(model.predict(X_test_processed), 1.0)

rmsle = compute_rmsle(y_test.values, y_pred)
print("RMSLE:", rmsle)

# Side-by-side view of actual vs. predicted prices; the first ten rows are the
# cell's displayed result.
comparison = pd.DataFrame({"y_test": y_test.values, "y_pred": y_pred})
comparison.head(10)

Model trained: LinearRegression
RMSLE: 0.2


Unnamed: 0,y_test,y_pred
0,154500,125976.515804
1,325000,333459.352338
2,115000,107895.710439
3,159000,152981.966653
4,315500,229376.914602
5,75500,82388.57489
6,311500,219216.827666
7,146000,149229.417654
8,84500,82591.178451
9,135500,117694.793436
