In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Load the housing data and preview it.
# The preview sits in the `else` branch so it runs only when THIS execution
# actually read the file; testing `'df' in locals()` would also be true for a
# stale `df` left in the kernel by an earlier run.
try:
    df = pd.read_csv('HousingData.csv')
except FileNotFoundError:
    print("Error: The file 'HousingData.csv' was not found.")
    print("Please check the exact name of the file you downloaded and its location.")
else:
    print("\n--- First 5 rows of the data ---")
    print(df.head())
    print("\n--- Data Information (Columns, Types, Non-Null Counts) ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()


--- First 5 rows of the data ---
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  

--- Data Information (Columns, Types, Non-Null Counts) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 

In [6]:
# Quick inspection of the raw frame before modelling.
print(df.head())
# info() prints its own report and returns None; print(df.info()) would add a
# spurious "None" line to the output.
df.info()
print(df.isnull().sum())

# Features are every column except the target column MEDV.
X = df.drop(columns='MEDV')
y = df['MEDV']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float6

In [9]:
print("\nChecking for missing values in X_train...")
# Count the NaNs once and report only the columns that actually have gaps.
missing_counts = X_train.isnull().sum()
print(missing_counts.loc[missing_counts > 0])

# Column means are taken from the training split only and reused for the test
# split, so no test-set information leaks into the imputation.
train_means = X_train.mean()

# Rebind instead of fillna(inplace=True): the frames produced by the split are
# left untouched and the same names now point at the imputed copies.
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Verify the result before announcing success (a column whose training mean is
# itself NaN would otherwise slip through unnoticed).
assert not X_train.isnull().values.any(), "X_train still contains missing values"
assert not X_test.isnull().values.any(), "X_test still contains missing values"

print("\nMissing values successfully imputed. Proceeding to Model Training.")


Checking for missing values in X_train...
Series([], dtype: int64)

Missing values successfully imputed. Proceeding to Model Training.


In [10]:
# Baseline model: linear regression fitted on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics; RMSE is the square root of MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# One print call, one report line per argument.
print(
    "\nModel Performance:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R² score): {r2:.2f}",
    sep="\n",
)


Model Performance:
Mean Squared Error (MSE): 21.81
Root Mean Squared Error (RMSE): 4.67
R-squared (R² score): 0.71


In [12]:
from sklearn.ensemble import RandomForestRegressor
print("--- Optimizing Model with Random Forest Regressor ---")

# 100-tree random forest with a fixed seed, trained on the same split as the
# linear model (fit() returns the estimator itself).
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Same hold-out metrics as for the linear model.
rf_r2 = r2_score(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_rmse = np.sqrt(rf_mse)

print(
    "\nRandom Forest Performance:",
    f"Mean Squared Error (MSE): {rf_mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rf_rmse:.2f}",
    f"R-squared (R² score): {rf_r2:.2f}",
    sep="\n",
)

# The forest is the model carried forward.
# NOTE(review): this choice is hard-coded, not derived from the metrics above.
best_model = rf_model

--- Optimizing Model with Random Forest Regressor ---

Random Forest Performance:
Mean Squared Error (MSE): 10.10
Root Mean Squared Error (RMSE): 3.18
R-squared (R² score): 0.86


In [14]:
import joblib

# Serialize the selected estimator with joblib; the file is written relative
# to the current working directory.
model_filename = "house_price_prediction_rf_model.pkl"
joblib.dump(best_model, model_filename)

# print() joins its arguments with a single space, giving the same message.
print("\nModel successfully saved to", model_filename)


Model successfully saved to house_price_prediction_rf_model.pkl
