# Load the Dataset

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Boston_Housing_Dataset.csv")  # Ensure the correct file path

# Display the first few rows
df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


# Define Features (X) and Target Variable (y)

In [4]:
# Define X (independent variables) and y (dependent variable)
X = df.drop(columns=["MEDV"])  # All features except target
y = df["MEDV"]  # Target variable
from sklearn.model_selection import train_test_split




# Split the Dataset

In [None]:
# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Multiple Linear Regression Without Elimination

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train the model with all features
model_all = LinearRegression()
model_all.fit(X_train, y_train)

# Predictions
y_pred_all = model_all.predict(X_test)

# Evaluate
mse_all = mean_squared_error(y_test, y_pred_all)
r2_all = r2_score(y_test, y_pred_all)

print("MSE (All Variables):", mse_all)
print("R² Score (All Variables):", r2_all)

# Store coefficients for comparison later
coeff_all = model_all.coef_


MSE (All Variables): 24.291119474973538
R² Score (All Variables): 0.6687594935356317


# Backward Elimination

In [6]:
import statsmodels.api as sm

# Add intercept for statsmodels
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit model
model_be = sm.OLS(y_train, X_train_sm).fit()
print(model_be.summary())

# Identify important features (p-value threshold < 0.05)
selected_features_be = X_train.columns[model_be.pvalues[1:] < 0.05].tolist()

# Train a new model with selected features
X_train_be = X_train[selected_features_be]
X_test_be = X_test[selected_features_be]

model_be_final = LinearRegression()
model_be_final.fit(X_train_be, y_train)
y_pred_be = model_be_final.predict(X_test_be)

# Evaluate
mse_be = mean_squared_error(y_test, y_pred_be)
r2_be = r2_score(y_test, y_pred_be)

print("MSE (Backward Elimination):", mse_be)
print("R² Score (Backward Elimination):", r2_be)


                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.743
Method:                 Least Squares   F-statistic:                     90.43
Date:                Mon, 24 Feb 2025   Prob (F-statistic):          6.21e-109
Time:                        18:32:14   Log-Likelihood:                -1194.3
No. Observations:                 404   AIC:                             2417.
Df Residuals:                     390   BIC:                             2473.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.2468      5.677      5.328      0.0

# Forward Selection

In [7]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward Selection using Linear Regression
model_forward = LinearRegression()
sfs_forward = SequentialFeatureSelector(model_forward, direction="forward", n_features_to_select="auto")
sfs_forward.fit(X_train, y_train)

# Get selected features
selected_features_fw = X_train.columns[sfs_forward.get_support()].tolist()

# Train model with selected features
X_train_fw = X_train[selected_features_fw]
X_test_fw = X_test[selected_features_fw]

model_fw_final = LinearRegression()
model_fw_final.fit(X_train_fw, y_train)
y_pred_fw = model_fw_final.predict(X_test_fw)

# Evaluate
mse_fw = mean_squared_error(y_test, y_pred_fw)
r2_fw = r2_score(y_test, y_pred_fw)

print("MSE (Forward Selection):", mse_fw)
print("R² Score (Forward Selection):", r2_fw)



MSE (Forward Selection): 27.342273199895953
R² Score (Forward Selection): 0.6271531070459022


In [9]:
!pip install mlxtend



Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 1.7 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


#  Combined Method (Stepwise Regression)

In [10]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Initialize Linear Regression model
model = LinearRegression()

# Perform Stepwise Selection (combining forward and backward)
sfs_combined = SFS(model, 
                   forward=True, 
                   floating=True,  # Allows dynamic feature addition/removal
                   scoring='r2',  # Optimize R² score
                   cv=5)  # Cross-validation for stability

# Fit to training data
sfs_combined.fit(X_train, y_train)

# Get selected features
selected_features_combined = list(sfs_combined.k_feature_names_)

# Train model with selected features
X_train_combined = X_train[selected_features_combined]
X_test_combined = X_test[selected_features_combined]

model_combined_final = LinearRegression()
model_combined_final.fit(X_train_combined, y_train)
y_pred_combined = model_combined_final.predict(X_test_combined)

# Evaluate performance
from sklearn.metrics import mean_squared_error, r2_score

mse_combined = mean_squared_error(y_test, y_pred_combined)
r2_combined = r2_score(y_test, y_pred_combined)

print("Selected Features (Stepwise):", selected_features_combined)
print("MSE (Stepwise):", mse_combined)
print("R² Score (Stepwise):", r2_combined)


Selected Features (Stepwise): ['LSTAT']
MSE (Stepwise): 33.51954917268489
R² Score (Stepwise): 0.5429180422970384


In [18]:
# Forward Selection Step
X_train_combined, model_combined = forward_selection(X_train, y_train)

# Backward Elimination on selected features
X_train_combined, model_combined = backward_elimination(X_train_combined, y_train)

# Test on reduced feature set
X_test_combined = sm.add_constant(X_test)[X_train_combined.columns]
y_pred_combined = model_combined.predict(X_test_combined)

mse_combined = mean_squared_error(y_test, y_pred_combined)
# Evaluate model
r2_combined = r2_score(y_test, y_pred_combined)
print(f"R² Score (Combined Method): {r2_combined:.4f}")
print(model_combined.summary())


NameError: name 'forward_selection' is not defined

# Compare Results

In [11]:
import numpy as np

# Store results
results = pd.DataFrame({
    "Method": ["All Variables", "Backward Elimination", "Forward Selection", "Stepwise"],
    "MSE": [mse_all, mse_be, mse_fw, mse_combined],
    "R² Score": [r2_all, r2_be, r2_fw, r2_combined]
})

print(results)


                 Method        MSE  R² Score
0         All Variables  24.291119  0.668759
1  Backward Elimination  25.500213  0.652272
2     Forward Selection  27.342273  0.627153
3              Stepwise  33.519549  0.542918


#  Compare Coefficients

In [13]:
# Compare coefficients
# Initialize a dictionary for coefficients comparison
coefficients_dict = {"Feature": X.columns}

# Store coefficients for each method, ensuring alignment with selected features
coefficients_dict["All Variables"] = model_all.coef_

# Add coefficients for Backward Elimination
if len(selected_features_be) > 0:
    coeff_be = pd.Series(model_be_final.coef_, index=selected_features_be)
    coefficients_dict["Backward Elimination"] = coeff_be.reindex(X.columns, fill_value=0)
else:
    coefficients_dict["Backward Elimination"] = None

# Add coefficients for Forward Selection
if len(selected_features_fw) > 0:
    coeff_fw = pd.Series(model_fw_final.coef_, index=selected_features_fw)
    coefficients_dict["Forward Selection"] = coeff_fw.reindex(X.columns, fill_value=0)
else:
    coefficients_dict["Forward Selection"] = None

# Add coefficients for Stepwise Selection
if len(selected_features_combined) > 0:
    coeff_combined = pd.Series(model_combined_final.coef_, index=selected_features_combined)
    coefficients_dict["Stepwise"] = coeff_combined.reindex(X.columns, fill_value=0)
else:
    coefficients_dict["Stepwise"] = None

# Convert dictionary to DataFrame
coefficients_df = pd.DataFrame(coefficients_dict)

# Display
print(coefficients_df)


         Feature  All Variables  Backward Elimination  Forward Selection  \
CRIM        CRIM      -0.113056             -0.108568           0.000000   
ZN            ZN       0.030110              0.000000           0.000000   
INDUS      INDUS       0.040381              0.000000           0.000000   
CHAS        CHAS       2.784438              2.875249           0.000000   
NOX          NOX     -17.202633            -17.953929         -15.117126   
RM            RM       4.438835              4.463746           4.799560   
AGE          AGE      -0.006296              0.000000           0.000000   
DIS          DIS      -1.447865             -1.234960          -1.200277   
RAD          RAD       0.262430              0.238116           0.000000   
TAX          TAX      -0.010647             -0.007795           0.000000   
PTRATIO  PTRATIO      -0.915456             -1.012242          -0.941397   
B              B       0.012351              0.012274           0.013169   
LSTAT      L