# Boston Housing Regression Analysis
**Author:** Giovanna Cardenas  
**Date:** May 2025  
**Description:** This notebook builds and evaluates a multiple linear regression model to predict median home values in the Boston area using selected features from the housing dataset.

In [85]:
# Import Packages
%matplotlib inline
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from dmba import regressionSummary
from dmba import adjusted_r2_score
from dmba import forward_selection
from dmba import AIC_score
from dmba import backward_elimination
import warnings
warnings.filterwarnings('ignore')

In [87]:
# Load Data
housing= pd.read_csv('BostonHousing-1.csv')
housing.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,1
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,5.21,28.7,0
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,12.43,22.9,0
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,19.15,27.1,0
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,29.93,16.5,0
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,17.1,18.9,0


In [89]:
# Fit a multiple linear regression model to the median house price (MEDV) as a function of CRIM, CHAS, and RM
# Select columns for regression analysis
x= housing[['CRIM', 'CHAS', 'RM']]
y= housing['MEDV']

In [91]:
# Partition data and create a regression model
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

housing_lr = LinearRegression()
housing_lr.fit(x_train, y_train)

# Print coefficients
print('Intercept ', housing_lr.intercept_)
print(pd.DataFrame({'X': x.columns, 'Coefficient': housing_lr.coef_}))

# Print performance measures
regressionSummary(y_train, housing_lr.predict(x_train))

Intercept  -27.283684087067414
      X  Coefficient
0  CRIM    -0.293142
1  CHAS     4.050572
2    RM     8.060902

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 6.4116
            Mean Absolute Error (MAE) : 4.3570
          Mean Percentage Error (MPE) : -7.8166
Mean Absolute Percentage Error (MAPE) : 23.7359


In [93]:
# Print adjusted r2
pred_y= housing_lr.predict(x_test)
print('Adjusted R2 : ', adjusted_r2_score(y_test, pred_y, housing_lr))

Adjusted R2 :  0.6712208671530389


In [95]:
# Use predict() to make predictions on the test set
housing_pred = housing_lr.predict(x_test)
result = pd.DataFrame({'Predicted': np.round(housing_pred,2), 'Actual': y_test,
                       'Residual': y_test - housing_pred})
print(result.head(10))

# Compute common accuracy measures
regressionSummary(y_test, housing_pred)

     Predicted  Actual  Residual
226      37.41    37.6  0.186050
292      26.15    27.9  1.750498
90       24.43    22.6 -1.829396
373       9.01    13.8  4.793147
273      38.70    35.2 -3.498246
417       7.87    10.4  2.532936
503      28.93    23.9 -5.031360
234      30.85    29.0 -1.853216
111      26.82    22.8 -4.015715
472      23.56    23.2 -0.358215

Regression statistics

                      Mean Error (ME) : -0.3187
       Root Mean Squared Error (RMSE) : 4.9977
            Mean Absolute Error (MAE) : 3.6312
          Mean Percentage Error (MPE) : -6.0168
Mean Absolute Percentage Error (MAPE) : 20.9753


In [97]:
# Forward Selection to potentionally improve model
def train_model1(variables):
    if len(variables) == 0:
        return None
    model1 = LinearRegression()
    model1.fit(x_train[variables], y_train)
    return model1

def score_model1(model1, variables):
    if len(variables) == 0:
        return AIC_score(y_train, [y_train.mean()] * len(y_train), model1, df=1)
    return AIC_score(y_train, model1.predict(x_train[variables]), model1)

best_model1, best_variables1 = forward_selection(x_train.columns, train_model1, score_model1, verbose = True)

print('best variables: ',best_variables1)
print(best_model1.coef_)
print(best_model1.intercept_)

Variables: CRIM, CHAS, RM
Start: score=2949.60, constant
Step: score=2721.59, add RM
Step: score=2666.07, add CRIM
Step: score=2657.85, add CHAS
Step: score=2657.85, add None
best variables:  ['RM', 'CRIM', 'CHAS']
[ 8.06090237 -0.29314194  4.05057245]
-27.28368408706738


In [99]:
# Backward Elimination to potentionally improve model
def train_model2(variables):
    model2 = LinearRegression()
    model2.fit(x_train[variables], y_train)
    return model2

def score_model2(model2, variables):
    return AIC_score(y_train, model2.predict(x_train[variables]), model2)

best_model2, best_variables2 = backward_elimination(x_train.columns, train_model2, score_model2, verbose=True)

print('best variables: ',best_variables2)
print(best_model2.coef_)
print(best_model2.intercept_)                    

Variables: CRIM, CHAS, RM
Start: score=2657.85
Step: score=2657.85, remove None
best variables:  ['CRIM', 'CHAS', 'RM']
[-0.29314194  4.05057245  8.06090237]
-27.283684087067414


In [101]:
# Print equation for predicting the median house price from the predictors in the model
print("Median Value of Owner-Occupied Homes =", end=" ")
equation = " + ".join([
    f"{coef}*{var}" for coef, var in zip(best_model2.coef_, best_variables2)])

print(equation)

Median Value of Owner-Occupied Homes = -0.29314194268266885*CRIM + 4.05057244845638*CHAS + 8.060902369227714*RM


In [103]:
# Equation for predicting the median house price from the predictors in the model:
# Median Value of Owner-Occupied Homes= -0.29314194268266885*CRIM + 4.05057244845638*CHAS + 8.060902369227714*RM

In [109]:
# What median house price is predicted for a tract in the Boston area that does not bound the Charles River,
# has a crime rate of 0.1, and where the average number of rooms per house is 6?

prediction = best_model2.predict([[0.1, 0, 6]])[0]
prediction_dollars = prediction * 1000
print(f"Predicted Outcome: ${prediction_dollars:,.2f}")

Predicted Outcome: $21,052.42
