# Load the dataset 

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load dataset from CSV
df = pd.read_csv("Boston_Housing_Dataset.csv")


In [42]:
!pip install statsmodels





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
print("Boston Housing Dataset")
display(df.head())
target_column = "MEDV" 


Boston Housing Dataset


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [16]:
print(df.columns)

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')


In [3]:
# Split features and target variable
X = df.drop(columns=[target_column])  # Features
y = df[target_column]  # Target variable

# Split dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.6688


# Method 1: No Elimination (All Variables)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train the model with all features
model_all = LinearRegression()
model_all.fit(X_train, y_train)

# Predictions
y_pred_all = model_all.predict(X_test)

# Evaluate
mse_all = mean_squared_error(y_test, y_pred_all)
r2_all = r2_score(y_test, y_pred_all)

print("MSE (All Variables):", mse_all)
print("R² Score (All Variables):", r2_all)

# Store coefficients for comparison later
coeff_all = model_all.coef_
print(model_all.summary())


MSE (All Variables): 24.291119474973538
R² Score (All Variables): 0.6687594935356317


AttributeError: 'LinearRegression' object has no attribute 'summary'

#  Method 2: Backward Elimination

In [9]:
def backward_elimination(X, y):
    X = X.copy()  # Copy to avoid modifying original
    while True:
        model = sm.OLS(y, X).fit()
        p_values = model.pvalues[1:]  # Ignore intercept
        max_p = p_values.max()
        if max_p > 0.05:  # If any p-value > 0.05, remove highest
            feature_to_remove = p_values.idxmax()
            X.drop(columns=[feature_to_remove], inplace=True)
            print(f"Removing: {feature_to_remove} (p={max_p:.4f})")
        else:
            break
    return X, model

# Apply backward elimination
X_train_be, model_be = backward_elimination(X_train, y_train)

# Test on reduced feature set
X_test_be = X_test[X_train_be.columns]  # Keep same columns
y_pred_be = model_be.predict(X_test_be)

# Evaluate model
r2_be = r2_score(y_test, y_pred_be)
mse_be = mean_squared_error(y_test, y_pred_be)
print(f"R² Score (Backward Elimination): {r2_be:.4f}")
print("mse ",mse_be)
print(model_be.summary())


Removing: INDUS (p=0.6302)
Removing: AGE (p=0.4699)
Removing: NOX (p=0.1203)
R² Score (Backward Elimination): 0.6126
mse  28.409176612961275
                                 OLS Regression Results                                
Dep. Variable:                   MEDV   R-squared (uncentered):                   0.961
Model:                            OLS   Adj. R-squared (uncentered):              0.960
Method:                 Least Squares   F-statistic:                              981.7
Date:                Tue, 25 Feb 2025   Prob (F-statistic):                   1.91e-271
Time:                        09:17:37   Log-Likelihood:                         -1210.1
No. Observations:                 404   AIC:                                      2440.
Df Residuals:                     394   BIC:                                      2480.
Df Model:                          10                                                  
Covariance Type:            nonrobust                              

# Method 3: Forward Selection

In [12]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward Selection using Linear Regression
model_forward = LinearRegression()
sfs_forward = SequentialFeatureSelector(model_forward, direction="forward", n_features_to_select="auto")
sfs_forward.fit(X_train, y_train)

# Get selected features
selected_features_fw = X_train.columns[sfs_forward.get_support()].tolist()

# Train model with selected features
X_train_fw = X_train[selected_features_fw]
X_test_fw = X_test[selected_features_fw]

model_fw_final = LinearRegression()
model_fw_final.fit(X_train_fw, y_train)
y_pred_fw = model_fw_final.predict(X_test_fw)

# Evaluate
mse_fw = mean_squared_error(y_test, y_pred_fw)
r2_fw = r2_score(y_test, y_pred_fw)

print("MSE (Forward Selection):", mse_fw)
print("R² Score (Forward Selection):", r2_fw)



MSE (Forward Selection): 27.342273199895953
R² Score (Forward Selection): 0.6271531070459022


# Method 4: Combined (Forward + Backward)

In [13]:
# Forward Selection Step
X_train_combined, model_combined = forward_selection(X_train, y_train)

# Backward Elimination on selected features
X_train_combined, model_combined = backward_elimination(X_train_combined, y_train)

# Test on reduced feature set
X_test_combined = sm.add_constant(X_test)[X_train_combined.columns]
y_pred_combined = model_combined.predict(X_test_combined)

mse_combined = mean_squared_error(y_test, y_pred_combined)
# Evaluate model
r2_combined = r2_score(y_test, y_pred_combined)
print(f"R² Score (Combined Method): {r2_combined:.4f}")
print("mse",mse_combined)
print(model_combined.summary())


NameError: name 'forward_selection' is not defined

# Compare Results

In [49]:
print(f"R² Scores:")
print(f" - All Variables: {r2_all:.4f}")
print(f" - Backward Elimination: {r2_be:.4f}")
print(f" - Forward Selection: {r2_fs:.4f}")
print(f" - Combined Method: {r2_combined:.4f}")

R² Scores:
 - All Variables: 0.6688
 - Backward Elimination: 0.6126
 - Forward Selection: 0.6688
 - Combined Method: 0.6523


In [51]:
import numpy as np

# Store results
results = pd.DataFrame({
    "Method": ["All Variables", "Backward Elimination", "Forward Selection", "Stepwise"],
    "MSE": [mse_all, mse_be, mse_fw, mse_combined],
    "R² Score": [r2_all, r2_be, r2_fs, r2_combined]
})

print(results)



                 Method        MSE  R² Score
0         All Variables  24.291119  0.668759
1  Backward Elimination  28.409177  0.612605
2     Forward Selection  24.291119  0.668759
3              Stepwise  25.500213  0.652272


# Compare Coefficients

In [63]:
import pandas as pd

# Dictionary to store coefficients
coefficients_dict = {}

# Function to extract coefficients from a model
def get_coefficients(model, feature_names):
    if hasattr(model, "coef_"):  # sklearn model
        coeff = pd.Series(model.coef_, index=feature_names)
        coeff["Intercept"] = model.intercept_
    elif hasattr(model, "params"):  # statsmodels model
        coeff = model.params  # statsmodels already includes intercept
    else:
        raise AttributeError("Unknown model type: No 'coef_' or 'params' attribute found.")
    return coeff

# Store coefficients for each model
coefficients_dict["All Variables"] = get_coefficients(model_all, X_train.columns)
coefficients_dict["Backward Elimination"] = get_coefficients(model_be, X_train_be.columns)
coefficients_dict["Forward Selection"] = get_coefficients(model_fs, X_train_fs.columns)
coefficients_dict["Combined Method"] = get_coefficients(model_combined, X_train_combined.columns)

# Convert dictionary to DataFrame
coefficients_df = pd.DataFrame(coefficients_dict)

# Align columns across all models
coefficients_df = coefficients_df.reindex(X_train.columns, fill_value=0)

# Display coefficient comparison
print("\nComparison of Coefficients:")
print(coefficients_df)




Comparison of Coefficients:
         All Variables  Backward Elimination  Forward Selection  \
CRIM         -0.113056             -0.099365          -0.113056   
ZN            0.030110              0.037596           0.030110   
INDUS         0.040381                   NaN           0.040381   
CHAS          2.784438              2.547578           2.784438   
NOX         -17.202633                   NaN         -17.202633   
RM            4.438835              5.820738           4.438835   
AGE          -0.006296                   NaN          -0.006296   
DIS          -1.447865             -0.887773          -1.447865   
RAD           0.262430              0.152307           0.262430   
TAX          -0.010647             -0.009323          -0.010647   
PTRATIO      -0.915456             -0.457440          -0.915456   
B             0.012351              0.016170           0.012351   
LSTAT        -0.508571             -0.475766          -0.508571   

         Combined Method  
CRIM 