Boston House price prediction: Using the provided dataset
containing features such as a number of rooms, crime rates, and
other relevant factors, design and implement a regression model to
accurately predict Boston house prices. Your solution should
involve data preprocessing, model selection, training, and
evaluation

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import math

Data Preprocessing

In [10]:
#Load the dataset
df=pd.read_csv("HousingData.csv")
df.shape

(506, 14)

In [11]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [12]:
#Sum of null data or missing
df.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [13]:
#Drop null data
df.dropna(inplace=True)
df.shape

(394, 14)

In [15]:
#Feature and target
X=df.drop(columns=["MEDV"])
y=df["MEDV"]

In [None]:
X.head() #Feature: all are independent

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21


In [None]:
y.head() #Target: dependent on features

0    24.0
1    21.6
2    34.7
3    33.4
5    28.7
Name: MEDV, dtype: float64

In [None]:
#Normalize feature
scalar=StandardScaler()
X_scaled=scalar.fit_transform(X)

In [19]:
#Split data
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)

Model Selection

In [20]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

Training and Evaluation of Models

In [25]:
results={}
for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    r2=r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    results[name] = {"R2 Score": r2, "RMSE": rmse}
    print(f"\n{name}:\nR2 Score: {r2:.4f}\nRMSE: {rmse:.4f}")


Linear Regression:
R2 Score: 0.6271
RMSE: 5.6084

Decision Tree:
R2 Score: 0.6780
RMSE: 5.2114

Gradient Boosting:
R2 Score: 0.6938
RMSE: 5.0824


Choosing Best Model

In [33]:
best_model_name=max(results,key=lambda k: results[k]["R2 Score"])
best_model=models[best_model_name]
y_predict=best_model.predict(X_test)
best_model

Output

In [34]:
best_model.score(X_test,y_test)

0.6937525136156195

In [35]:
print(X.columns.tolist())

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [39]:
sample_input = np.array([[0.56, 18.0, 2.31, 0, 0.538, 1000.575, 65.2, 4.09, 1, 296, 15.3, 396.9, 4.98]])
sample_scaled = scalar.transform(sample_input)
predicted_price = best_model.predict(sample_scaled)
print(f"Predicted House Price: ${predicted_price[0] * 1000:.2f}")


Predicted House Price: $40654.30


