In [170]:
import pandas as pd
import numpy as np
import math
from pprint import pprint
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

#### Fetching data from .csv file 

In [171]:
def fetchdata(filename) -> tuple:
    """Load a CSV file and split it into a design matrix and a target list.

    The first column of the file is skipped (presumably a row index/ID --
    confirm against the dataset), the last column is used as the target, and
    every column in between is kept as a feature. A leading column of ones is
    prepended to the features so a linear model fitted on them gets an
    intercept term.

    Parameters
    ----------
    filename : str, path or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    data : numpy.ndarray
        2-D array: a column of ones followed by the feature columns.
    label : list
        Values of the last column, one entry per row.
    """
    values = pd.read_csv(filename).to_numpy()

    # Slice the whole array at once instead of looping over rows; this also
    # keeps the result 2-D when the file has a header but no data rows.
    features = values[:, 1:-1]
    ones = np.ones((features.shape[0], 1))  # bias/intercept column
    data = np.hstack((ones, features))

    label = values[:, -1].tolist()
    return data, label

`Loading` and
`Splitting` data into training and testing sets

In [172]:
# Read the full dataset, then hold out 20% of the rows as the test split.
datas, labels = fetchdata('Real estate.csv')
train_features, test_features, train_labels, test_labels = train_test_split(
    datas,
    labels,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

`Linear Regression`

In [173]:
class LinearRegression:
    """Ordinary least-squares linear regression fitted via the normal equation.

    The class holds no state: every method is a static method, so it can be
    called on the class (``LinearRegression.training(X, y)``) or on an
    instance interchangeably.
    """

    @staticmethod
    def training(rawtrain, train_label):
        """Fit the regression coefficients.

        Parameters
        ----------
        rawtrain : array-like
            Design matrix, one row per sample. Include a column of ones if an
            intercept is wanted; none is added here.
        train_label : array-like
            Target values, one per row of ``rawtrain``.

        Returns
        -------
        numpy.ndarray
            Coefficient vector ``theta`` satisfying ``(X^T X) theta = X^T y``.

        Raises
        ------
        numpy.linalg.LinAlgError
            If ``X^T X`` is singular.
        """
        X = np.asarray(rawtrain, dtype=float)
        y = np.asarray(train_label, dtype=float)
        # Solve the normal equations directly instead of forming the explicit
        # inverse of X^T X: same solution, fewer operations, better accuracy.
        return np.linalg.solve(X.T @ X, X.T @ y)

    @staticmethod
    def predict(rawtest, theta):
        """Return the predictions ``X @ theta`` for the rows of ``rawtest``."""
        X = np.asarray(rawtest)
        return np.dot(X, theta)

    @staticmethod
    def evaluation(predict, label):
        """Print and return goodness-of-fit metrics for a set of predictions.

        Parameters
        ----------
        predict : array-like
            Predicted values.
        label : array-like
            True values, same length as ``predict``.

        Returns
        -------
        r2 : float
            Coefficient of determination, ``1 - RSS / TSS``. TSS is zero when
            all labels are equal, in which case the ratio is not well defined.
        RMSE : float
            Root mean squared error, ``sqrt(RSS / n)``.
        """
        # Convert up front so plain Python lists work for both arguments.
        predicted = np.asarray(predict, dtype=float)
        actual = np.asarray(label, dtype=float)

        RSS = np.sum((actual - predicted) ** 2)
        mean_y = np.mean(actual)
        TSS = np.sum((actual - mean_y) ** 2)
        r2 = 1 - (RSS / TSS)
        RMSE = math.sqrt(RSS / len(actual))
        print(f"""\nResidual sum of square = {RSS}\nTotal Sum of squares = {TSS}""")
        print(f"""Coefficient of determinaton = {r2}\nRoot Mean Squared Error = {RMSE}""")
        return r2, RMSE

#### Using Linear Regression

In [174]:
# Fit on the training split, then predict and score on the held-out split.
theta = LinearRegression.training(train_features, train_labels)
y_predicted = LinearRegression.predict(test_features, theta)

r2, rmse = LinearRegression.evaluation(y_predicted, test_labels)



Residual sum of square = 4440.966347342221
Total Sum of squares = 13924.058795180725
Coefficient of determinaton = 0.6810580583817063
Root Mean Squared Error = 7.314753491586532


### The `limitations` of the normal equation approach for linear regression are:
##### 1. Computationally expensive for large datasets.
##### 2. Numerically unstable if the input features are linearly dependent.
##### 3. Faces singular-matrix issues, i.e., the matrix $X^T X$ may be singular or close to singular, which can lead to numerical instability and inaccurate results.
##### 4. Can overfit the data if the input features are highly correlated or if there are too many features relative to the number of examples.
##### 5. Cannot capture non-linear relationships between the features and the target.
##### 6. Outliers can have a large impact, which can lead to overfitting and poor generalization to new data.