* Reg. No: 24-27-05
* Name: Mahboob Alam
* Data Science

--------------------------------------------------------------------------

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

### Load the dataset

In [2]:
# Read the housing data; the path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('Housepriceprediction.csv')

In [3]:
# Preview the first five rows to check the column names and value ranges.
df.head()

Unnamed: 0,Id,LotArea,OverallQual,1stFlrSF,GrLivArea,TotRmsAbvGrd,GarageCars,GarageArea,SalePrice
0,1,8450,7,856,1710,8,2,548,208500
1,2,9600,6,1262,1262,6,2,460,181500
2,3,11250,7,920,1786,6,2,608,223500
3,4,9550,7,961,1717,7,3,642,140000
4,5,14260,8,1145,2198,9,3,836,250000


### Correlation Matrix

In [4]:
# Pairwise correlation (Pearson by default) between every column, including Id
# (apparently just a row identifier, so its near-zero correlations are expected).
df.corr()

Unnamed: 0,Id,LotArea,OverallQual,1stFlrSF,GrLivArea,TotRmsAbvGrd,GarageCars,GarageArea,SalePrice
Id,1.0,-0.033226,-0.028365,0.010496,0.008273,0.027239,0.01657,0.017634,-0.021917
LotArea,-0.033226,1.0,0.105806,0.299475,0.263116,0.190015,0.154871,0.180403,0.263843
OverallQual,-0.028365,0.105806,1.0,0.476224,0.593007,0.427452,0.600671,0.562022,0.790982
1stFlrSF,0.010496,0.299475,0.476224,1.0,0.566024,0.409516,0.439317,0.489782,0.605852
GrLivArea,0.008273,0.263116,0.593007,0.566024,1.0,0.825489,0.467247,0.468997,0.708624
TotRmsAbvGrd,0.027239,0.190015,0.427452,0.409516,0.825489,1.0,0.362289,0.337822,0.533723
GarageCars,0.01657,0.154871,0.600671,0.439317,0.467247,0.362289,1.0,0.882475,0.640409
GarageArea,0.017634,0.180403,0.562022,0.489782,0.468997,0.337822,0.882475,1.0,0.623431
SalePrice,-0.021917,0.263843,0.790982,0.605852,0.708624,0.533723,0.640409,0.623431,1.0


In [5]:
# Predictors: every column except Id and the target. Selecting by name instead
# of by position keeps the selection correct if the CSV's column order changes.
X = df[['LotArea', 'OverallQual', '1stFlrSF', 'GrLivArea',
        'TotRmsAbvGrd', 'GarageCars', 'GarageArea']]
# Target: the sale price we want to predict.
y = df['SalePrice']

### Train/test split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore every score below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Sklearn Linear Regression Implementation

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Fit scikit-learn's ordinary least-squares model on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict on both splits first, then score them, so the train and test
# numbers are produced side by side.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [9]:
# coef_ holds one weight per feature column of X (in column order, on the raw
# feature scale); intercept_ is the bias term.
print("Coefficient of the model: ", reg.coef_)
print("Intercept of the model: ", reg.intercept_)

Coefficient of the model:  [ 6.21611012e-01  2.64285222e+04  2.28947647e+01  3.58131307e+01
 -5.37804999e+02  1.80014152e+04  1.04702988e+01]
Intercept of the model:  -102459.18039758943


In [10]:
# R² on the rows the model was fitted on versus the held-out 30%.
print("Train R2 Score (Scikit-Learn):", train_r2)
print("Test R2 Score (Scikit-Learn):", test_r2)

Train R2 Score (Scikit-Learn): 0.7466869932211081
Test R2 Score (Scikit-Learn): 0.7865534601474461


### Normal Equation Implementation

In [11]:
class NormalEquation:
    """Ordinary least-squares linear regression solved in closed form.

    The model is ``y = X @ coef_ + intercept_``. ``fit`` finds the parameters
    that minimise the squared error by solving the least-squares problem on a
    design matrix with a leading column of ones.

    Attributes:
        coef_: Array with one weight per feature; ``None`` until ``fit`` runs.
        intercept_: Bias term; ``None`` until ``fit`` runs.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self,X_train,y_train):
        """Estimate the intercept and coefficients from training data.

        Args:
            X_train: Array-like of shape (n_samples, n_features), e.g. a
                NumPy array or a pandas DataFrame of numeric columns.
            y_train: Array-like of shape (n_samples,) with the targets.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        # A leading column of ones lets the intercept be estimated as betas[0].
        design = np.insert(X_train, 0, 1, axis=1)

        # Solve the least-squares problem directly instead of forming
        # inv(X.T @ X): the explicit inverse squares the condition number and
        # breaks down when features are (nearly) collinear, whereas lstsq
        # returns the minimum-norm solution even for rank-deficient input.
        betas, *_ = np.linalg.lstsq(design, y_train, rcond=None)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]

    def predict(self,X_test):
        """Return predictions for the rows of ``X_test``.

        Args:
            X_test: Array-like of shape (n_samples, n_features) with the same
                column order that was used in ``fit``.

        Returns:
            NumPy array of predicted target values, one per row.
        """
        y_pred = np.dot(np.asarray(X_test), self.coef_) + self.intercept_
        return y_pred

In [12]:
# Create the closed-form solver; it has no hyperparameters to set.
ne = NormalEquation()

In [13]:
# Fit the closed-form model on the same training split used for scikit-learn.
ne.fit(X_train, y_train)
y_test_pred_ne = ne.predict(X_test)
test_r2_ne = r2_score(y_test,y_test_pred_ne)

# Store the training predictions under a method-specific name (matching
# y_test_pred_ne) so they do not overwrite scikit-learn's y_train_pred.
y_train_pred_ne = ne.predict(X_train)
train_r2_ne = r2_score(y_train, y_train_pred_ne)

In [14]:
# These should match the scikit-learn coefficients and intercept printed
# earlier, since both solve the same least-squares problem on raw features.
print("The Coefficient values are: ", ne.coef_)
print("The Intercept value is: ", ne.intercept_)

The Coefficient values are:  [ 6.21611012e-01  2.64285222e+04  2.28947647e+01  3.58131307e+01
 -5.37804999e+02  1.80014152e+04  1.04702988e+01]
The Intercept value is:  -102459.18039755838


In [15]:
# R² of the closed-form model on the training rows and on the held-out rows.
print("Train R2 Score (Normal Equation):", train_r2_ne)
print("Test R2 Score (Normal Equation):", test_r2_ne)

Train R2 Score (Normal Equation): 0.7466869932211081
Test R2 Score (Normal Equation): 0.7865534601474289


### Gradient Descent Implementation

In [None]:
class GradientDescentRegressor:
    """Linear regression fitted with full-batch gradient descent on the MSE.

    Features are standardized (zero mean, unit variance) inside ``fit`` and the
    same transformation is re-applied in ``predict``. As a consequence ``m``
    and ``b`` live on the standardized feature scale and are not directly
    comparable to coefficients fitted on raw features.

    Attributes:
        lr: Step size applied to each gradient update.
        epochs: Number of full passes (one update per pass) over the data.
        m: Weight column vector of shape (n_features, 1); ``None`` until fit.
        b: Intercept; ``None`` until fit.
        X_mean: Per-feature mean of the training data; ``None`` until fit.
        X_std: Per-feature standard deviation of the training data; ``None``
            until fit.
    """

    # Added to the standard deviation so constant features do not divide by zero.
    _EPS = 1e-8

    def __init__(self, learning_rate=0.001, epochs=1000):
        self.lr = learning_rate
        self.epochs = epochs
        self.m = None
        self.b = None
        self.X_mean = None
        self.X_std = None

    def fit(self, X, y):
        """Learn weights and intercept from training data.

        Args:
            X: Array-like of shape (n_samples, n_features): NumPy array,
                pandas DataFrame, or nested list of numbers.
            y: Array-like of shape (n_samples,) or (n_samples, 1).
        """
        # np.asarray accepts DataFrames, Series, arrays and plain lists alike,
        # so every supported input ends up as a float ndarray.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # Normalize features for numerical stability
        self.X_mean = np.mean(X, axis=0)
        self.X_std = np.std(X, axis=0)
        X = (X - self.X_mean) / (self.X_std + self._EPS)

        # Work with a column vector so y lines up with X @ m, shape (n, 1).
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        m_samples, n_features = X.shape
        self.m = np.zeros((n_features, 1))
        self.b = 0

        for _ in range(self.epochs):
            residual = y - (np.dot(X, self.m) + self.b)

            # Gradients of mean((y - y_pred)^2) with respect to m and b.
            loss_slope_m = (-2 / m_samples) * np.dot(X.T, residual)
            loss_slope_b = (-2 / m_samples) * np.sum(residual)

            # Step against the gradient.
            self.m -= self.lr * loss_slope_m
            self.b -= self.lr * loss_slope_b

    def predict(self, X):
        """Return predictions for the rows of ``X``.

        Args:
            X: Array-like of shape (n_samples, n_features) in the same column
                order used for ``fit``.

        Returns:
            1-D NumPy array with one prediction per row.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.m is None:
            raise AttributeError(
                "GradientDescentRegressor is not fitted yet; call fit() first."
            )

        X = np.asarray(X, dtype=float)

        # Apply same normalization as in training
        X = (X - self.X_mean) / (self.X_std + self._EPS)
        predictions = np.dot(X, self.m) + self.b
        return predictions.reshape(-1)

In [17]:
# The regressor standardizes features internally, so a comparatively large
# step size with few epochs is enough here.
gdr = GradientDescentRegressor(learning_rate=0.1, epochs=50)
gdr.fit(X_train, y_train)
y_test_pred_gd = gdr.predict(X_test)
test_r2_gd = r2_score(y_test, y_test_pred_gd)

# Store the training predictions under a method-specific name (matching
# y_test_pred_gd) so they do not overwrite scikit-learn's y_train_pred.
y_train_pred_gd = gdr.predict(X_train)
train_r2_gd = r2_score(y_train, y_train_pred_gd)

In [18]:
# gdr.m and gdr.b are learned on standardized features, so they are not
# directly comparable to the raw-scale coefficients printed for the other models.
print("The Theta values are: ", gdr.m)
print("The Intercept value is: ", gdr.b)

The Theta values are:  [[ 7084.41976986]
 [36517.53943573]
 [ 9080.12234658]
 [17283.67230261]
 [  511.86830305]
 [11677.44821281]
 [ 3773.97587015]]
The Intercept value is:  181310.1049780715


In [19]:
# R² of the gradient-descent model on the training rows and on the held-out rows.
print("Train R2 Score (Gradient Descent):", train_r2_gd)
print("Test R2 Score (Gradient Descent):", test_r2_gd)

Train R2 Score (Gradient Descent): 0.7464981108852599
Test R2 Score (Gradient Descent): 0.7864488167634827


# R2 Score:
| Method           | Train R² Score | Test R² Score |
|------------------|----------------|---------------|
| Gradient Descent | 0.7465         | 0.7864        |
| Normal Equation  | 0.7467         | 0.7866        |
| Scikit-learn     | 0.7467         | 0.7866        |

# High Correlation Features: 
1. **OverallQual (0.790982):** This feature has the highest correlation with house price, indicating that the overall quality of a house plays a crucial role in determining its value.

2. **GrLivArea (0.708624):** The above-ground living area is highly correlated with SalePrice, suggesting that larger living spaces tend to have higher prices.

3. **GarageCars (0.640409):** The number of cars a garage can accommodate is strongly associated with house prices, highlighting the importance of parking space.

4. **GarageArea (0.623431):** The total area of the garage also has a strong correlation with SalePrice, further reinforcing the significance of garage size. Note that GarageArea and GarageCars are themselves strongly correlated (0.882475), so the two features carry largely overlapping information.

# The final learning of the model: 
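The final model is a linear regression that predicts SalePrice from the seven selected features, fitted in three ways: Gradient Descent, the Normal Equation, and Scikit-learn. The Normal Equation and Scikit-learn produce the same coefficients and R² scores (up to floating-point rounding), and Gradient Descent (50 epochs on standardized features) comes within about 0.0002 of them, which confirms that the three implementations are consistent. The train (≈0.747) and test (≈0.787) R² scores are close, with the test score slightly higher, so there is no sign of overfitting on this split.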