In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Download the Boston Housing data (OpenML dataset "boston", version 1) as a
# pandas-backed bunch. The frame holds 13 feature columns plus the MEDV target.
data = fetch_openml(
    name='boston',
    version=1,
    as_frame=True,
)
df = data.frame

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
# Summarize the raw frame: per-column dtype and non-null count, used to spot
# categorical columns and missing values before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CRIM     506 non-null    float64 
 1   ZN       506 non-null    float64 
 2   INDUS    506 non-null    float64 
 3   CHAS     506 non-null    category
 4   NOX      506 non-null    float64 
 5   RM       506 non-null    float64 
 6   AGE      506 non-null    float64 
 7   DIS      506 non-null    float64 
 8   RAD      506 non-null    category
 9   TAX      506 non-null    float64 
 10  PTRATIO  506 non-null    float64 
 11  B        506 non-null    float64 
 12  LSTAT    506 non-null    float64 
 13  MEDV     506 non-null    float64 
dtypes: category(2), float64(12)
memory usage: 49.0 KB


In [4]:
# Partition the column labels by dtype so each group can be preprocessed
# separately: numeric (float64/int64) columns pass through unchanged, while
# category-typed columns are the ones that need encoding.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include='category').columns

In [7]:
# One-hot encode the categorical columns and rebuild `df` as
# [numeric columns | dummy columns].
#
# The guard has two parts:
#   * there must be categorical columns to encode at all, and
#   * those columns must still be present in `df`. Once this cell has run,
#     `df` no longer contains them, so re-running the cell is a no-op rather
#     than a KeyError on `df[categorical_cols]`.
if not categorical_cols.empty and categorical_cols.isin(df.columns).all():
    # drop='first' removes the first category of every feature so the dummy
    # columns are not perfectly collinear with the model intercept.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_data = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_cols),
        # Reuse the source index: pd.concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex here would misalign rows (and introduce NaNs)
        # whenever `df` does not carry a default 0..n-1 index.
        index=df.index,
    )
    # Concatenate encoded columns with numeric columns
    df = pd.concat([df[numeric_cols], encoded_df], axis=1)

In [8]:
# Re-check the frame after encoding: every remaining column should be numeric
# and the categorical columns should be replaced by their dummy columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   NOX      506 non-null    float64
 4   RM       506 non-null    float64
 5   AGE      506 non-null    float64
 6   DIS      506 non-null    float64
 7   TAX      506 non-null    float64
 8   PTRATIO  506 non-null    float64
 9   B        506 non-null    float64
 10  LSTAT    506 non-null    float64
 11  MEDV     506 non-null    float64
 12  CHAS_1   506 non-null    float64
 13  RAD_2    506 non-null    float64
 14  RAD_24   506 non-null    float64
 15  RAD_3    506 non-null    float64
 16  RAD_4    506 non-null    float64
 17  RAD_5    506 non-null    float64
 18  RAD_6    506 non-null    float64
 19  RAD_7    506 non-null    float64
 20  RAD_8    506 non-null    float64
dtypes: float64(21)
memory usage: 83.1 KB

In [9]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,...,MEDV,CHAS_1,RAD_2,RAD_24,RAD_3,RAD_4,RAD_5,RAD_6,RAD_7,RAD_8
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,296.0,15.3,396.9,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,...,21.6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,...,34.7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,...,33.4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,...,36.2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Separate the regression target from the predictors.
# MEDV (median house price) is the value the model learns to predict;
# every other column is used as an input feature.
y = df['MEDV']
X = df.drop(columns='MEDV')

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardize the features. The scaler statistics are learned from the
# training split only and then applied to both splits, so nothing about the
# test rows influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Ordinary least-squares baseline, fitted on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
r2 = r2_score(y_test, y_pred)              # share of target variance explained
mae = mean_absolute_error(y_test, y_pred)  # average absolute error
mse = mean_squared_error(y_test, y_pred)   # average squared error

# Print the fitted parameters followed by the evaluation metrics.
for metric_name, metric_value in (
    ("Model Coefficients (β):", model.coef_),
    ("Model Intercept (β₀):", model.intercept_),
    ("Mean Squared Error (MSE):", mse),
    ("Mean Absolute Error (MAE):", mae),
    ("R² Score:", r2),
):
    print(metric_name, metric_value)

Model Coefficients (β): [-1.0206471   0.90347401  0.39666668 -2.11325929  3.03783782 -0.21282023
 -3.26435606 -1.0450346  -2.01247038  1.14747983 -3.61172505  0.66768997
  0.23826661  2.5445001   1.27506664  0.7174283   1.13259694  0.08974418
  0.82237882  0.91875444]
Model Intercept (β₀): 22.796534653465343
Mean Squared Error (MSE): 24.818442738481263
Mean Absolute Error (MAE): 3.30708476366646
R² Score: 0.6615687658684816
