In [1]:
# Imports

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
data = pd.read_csv("housing.csv")
data.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [3]:
data["income_cat"] = pd.cut(
    data["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_idx, test_idx in split.split(data, data["income_cat"]):
    train_set = data.loc[train_idx].drop("income_cat", axis=1)
    test_set = data.loc[test_idx].drop("income_cat", axis=1)


In [4]:
X_train = train_set.drop("median_house_value", axis=1)
y_train = train_set["median_house_value"]

num_attrs = X_train.drop("ocean_proximity", axis=1).columns.tolist()
cat_attrs = ["ocean_proximity"]


In [5]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", cat_pipeline, cat_attrs)
])

X_train_prepared = full_pipeline.fit_transform(X_train)


In [6]:
def evaluate_model(model, X, y, name):
    model.fit(X, y)
    preds = model.predict(X)

    rmse = np.sqrt(mean_squared_error(y, preds))
    mae = mean_absolute_error(y, preds)

    print(f"{name}")
    print(f"RMSE: {rmse}")
    print(f"MAE : {mae}")
    print("-" * 30)


In [7]:
lin_reg = LinearRegression()
evaluate_model(lin_reg, X_train_prepared, y_train, "Linear Regression")


Linear Regression
RMSE: 69050.56219504567
MAE : 49905.329442715316
------------------------------


In [8]:
dt_reg = DecisionTreeRegressor(random_state=42)
evaluate_model(dt_reg, X_train_prepared, y_train, "Decision Tree")


Decision Tree
RMSE: 0.0
MAE : 0.0
------------------------------


In [9]:
rf_reg = RandomForestRegressor()

evaluate_model(rf_reg, X_train_prepared, y_train, "Random Forest")


Random Forest
RMSE: 18431.514504865234
MAE : 11809.353840843023
------------------------------


In [10]:
rf_scores = cross_val_score(
    rf_reg,
    X_train_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=5
)

rf_rmse_scores = np.sqrt(-rf_scores)

print("Random Forest CV RMSE Mean:", rf_rmse_scores.mean())
print("Random Forest CV RMSE Std :", rf_rmse_scores.std())


Random Forest CV RMSE Mean: 49918.39363775185
Random Forest CV RMSE Std : 793.9283248353557
