In [24]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Resolve the dataset location relative to the directory the kernel runs in.
ROOT = Path.cwd()
DATA_PATH = ROOT.joinpath('data', 'raw', 'housing.csv')

# Read the CSV into the base frame that the feature-engineering cell copies from.
df = pd.read_csv(DATA_PATH)

# Feature engineering

In [15]:
# Derive three ratio features on a new frame; `df` itself is left unmodified.
# `assign` returns a copy and appends the columns in the order given here.
df_featured = df.assign(
    rooms_per_household=lambda frame: frame["total_rooms"] / frame["households"],
    bedrooms_per_room=lambda frame: frame["total_bedrooms"] / frame["total_rooms"],
    population_per_household=lambda frame: frame["population"] / frame["households"],
)

In [16]:

# Pairwise correlation across the numeric columns only. The dtype alias 'number'
# (the same alias the column selector in the modelling cell uses) picks the
# numeric columns without requiring a numpy name to be in scope for this cell.
corr_matrix = df_featured.select_dtypes(include='number').corr()

# Rank every numeric column by its correlation with the target, highest first.
print(corr_matrix["median_house_value"].sort_values(ascending=False))

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
bedrooms_per_room          -0.255880
Name: median_house_value, dtype: float64


# Linear regression pipeline

In [17]:
# Trim the top of the target distribution before modelling.
# The original filter was `p96(target)`, but neither `p96` nor `target` is
# defined in any visible cell, so it only worked through leftover kernel state.
# NOTE(review): assumes `p96` returned the 96th percentile of
# `median_house_value` — confirm against the helper that used to exist.
target = df_featured['median_house_value']
price_cutoff = target.quantile(0.96)

# As in the original cell, the cutoff is applied to the full frame before the
# train/test split; rows at or above the cutoff are excluded (strict `<`).
model_df = df_featured[target < price_cutoff]

X = model_df.drop(columns='median_house_value').copy()
y = model_df['median_house_value'].copy()

# Fixed seed keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Object columns: fill missing values with the literal 'unknown', then one-hot
# encode into a dense array; categories not seen during fit are ignored.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Route columns by dtype; anything matched by neither selector is dropped.
numeric_columns = make_column_selector(dtype_include='number')
object_columns = make_column_selector(dtype_include='object')
preprocessor = ColumnTransformer(
    [('num', num_pipeline, numeric_columns), ('cat', cat_pipeline, object_columns)],
    remainder='drop',
)

# Preprocessing and the estimator live in one pipeline so both are fitted on
# the training split only.
linear_regression_pipeline = Pipeline(
    [('preprocessor', preprocessor), ('model', LinearRegression())]
)

# `fit` returns the pipeline itself, so both names refer to the same fitted object.
linear_regression_pipeline.fit(X_train, y_train)
linear_regression_model = linear_regression_pipeline



In [22]:
# Naive baseline: predict the training-set mean for every test row.
bl = y_train.mean()
baseline = np.full(len(y_test), bl)
baseline_error = root_mean_squared_error(y_test, baseline)
R2_baseline = r2_score(y_test, baseline)

# Score the fitted pipeline on the held-out split.
y_pred = linear_regression_pipeline.predict(X_test)
RMSE = root_mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

# One value per line, in order:
# baseline RMSE, baseline R2, model RMSE, model MAE, model R2.
print(baseline_error, R2_baseline, RMSE, MAE, R2, sep='\n')

98434.71247654197
-0.0001480784195764251
60629.15370081177
44363.4620582845
0.6205707887845778
