In [1]:
import pandas as pd
import numpy as np

# Load the dataset


df = pd.read_csv('housing.csv')

In [2]:

def linear_regression(w_0, w_1, x, y):
  return w_0 + w_1*x

In [3]:
def loss_function(w_0, w_1, df):
  n = df.shape[0]
  cost = 0
  for _, row in df.iterrows():
    x, y, price = row['area'], row['price']
    cost += (linear_regression(w_0, w_1, x, y) - price) ** 2

  return cost / (2 * n)

In [4]:


def gradient_descent_step(w_0, w_1, grad_0, grad_1, learning_rate=0.001):
  w_0 -= learning_rate * grad_0
  w_1 -= learning_rate * grad_1
  return w_0, w_1

In [5]:
def grad_w_0(w_0, w_1, df):
  n = df.shape[0]
  cost = 0
  for _, row in df.iterrows():
    x, y, price = row['area'], row['price']
    cost += (linear_regression(w_0, w_1, x, y) - price)

  return cost / n


def grad_w_1(w_0, w_1, df):
  n = df.shape[0]
  cost = 0
  for _, row in df.iterrows():
    x, y, price = row['area'], row['price']
    cost += (linear_regression(w_0, w_1, x, y) - price) * x

  return cost / n


In [6]:
def normalization(data):

  mean = np.mean(data)
  value_range = np.max(data) - np.min(data)
  result = []
  for x in data:
    norm_x = (x-mean)/value_range
    result.append(norm_x)

  return result

In [7]:
def grad_descent(weights, df, iter, learning_rate=0.001, stop_factor=0.000000001):
  w_0, w_1 = weights
  loss = loss_function(w_0, w_1, df)
  loss_his = [loss]

  for i in range(iter):
    d_w_0 = grad_w_0(w_0, w_1, df)
    d_w_1 = grad_w_1(w_0, w_1, df)

    w_0, w_1 = gradient_descent_step(w_0, w_1, d_w_0, d_w_1, learning_rate=learning_rate)

    loss = loss_function(w_0, w_1, df)
    if abs(loss - loss_his[-1]) < stop_factor:
      loss_his.append(loss)
      break

    loss_his.append(loss)

  return w_0, w_1, loss_his

In [8]:
norm = pd.DataFrame()

norm['price'] = normalization(df.price)
norm['area'] = normalization(df.area)

norm.head(3)

Unnamed: 0,price,area
0,0.738811,0.155977
1,0.647902,0.261818
2,0.647902,0.330547


# **Sklearn**

In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


# Load the dataset
data = pd.read_csv('housing.csv')

# Preprocess the data
X = data[['area', 'bathrooms', 'bedrooms']]
y = data['price']


scaler = StandardScaler()

# Normalize
X_normalized = scaler.fit_transform(X)
y_normalized = scaler.fit_transform(y.values.reshape(-1, 1))



X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

In [None]:
print('dddddddddddd')
print(X_train)
print('dddddddddddd')
print(X_test)
print('dddddddddddd')
print(y_train)
print('dddddddddddd')
print(y_test)

In [51]:


X_analytical = np.column_stack((np.ones(len(X_normalized)), X_normalized))


print(X_analytical)

[[ 1.          1.04672629  1.42181174  1.40341936]
 [ 1.          1.75700953  5.40580863  1.40341936]
 [ 1.          2.21823241  1.42181174  0.04727831]
 ...
 [ 1.         -0.70592066 -0.57018671 -1.30886273]
 [ 1.         -1.03338891 -0.57018671  0.04727831]
 [ 1.         -0.5998394  -0.57018671  0.04727831]]


In [52]:
model = LinearRegression()
model.fit(X_train, y_train)

#model.score(X_analytical,X_train)



y_pred_sklearn = model.predict(X_train)
print('Data 1')
print(y_pred_sklearn)

# mean squared error

print("Mean Squared Error (sklearn):", mse_sklearn)

Data 1
[[ 6.90732622e-01]
 [ 9.12573913e-01]
 [-6.66887812e-01]
 [-6.97087600e-01]
 [-4.86338374e-01]
 [-2.53405019e-01]
 [-4.67851600e-01]
 [-2.41203748e-01]
 [-5.91712987e-01]
 [-6.66885991e-02]
 [-6.02805052e-01]
 [ 5.11384986e-02]
 [ 1.14481788e-01]
 [-5.19614568e-01]
 [-8.17739890e-01]
 [ 7.73923106e-01]
 [-7.35909826e-01]
 [ 7.54075912e-01]
 [ 1.92865711e-01]
 [ 2.00932788e+00]
 [ 9.23665977e-01]
 [-6.24989181e-01]
 [-2.18280148e-01]
 [ 3.73296627e-01]
 [ 7.87788187e-01]
 [-5.04825148e-01]
 [-7.00348874e-01]
 [-4.40121438e-01]
 [ 2.39167074e-01]
 [-3.29689050e-01]
 [-5.28857955e-01]
 [-1.03292594e+00]
 [-5.97259019e-01]
 [-1.29543631e-01]
 [-7.83051100e-01]
 [ 1.25085595e-01]
 [-6.45812890e-01]
 [ 7.68377074e-01]
 [ 3.45029944e-01]
 [ 1.12804527e+00]
 [ 1.56139334e+00]
 [ 5.42886693e-01]
 [-7.77806637e-02]
 [-6.47173310e-01]
 [-1.96584276e-01]
 [-5.36252664e-01]
 [ 5.10587014e-01]
 [ 9.83311912e-01]
 [-6.64366010e-01]
 [ 7.79469138e-01]
 [ 6.41835923e-02]
 [-7.83539358e-01]
 [ 1.

ValueError: Found input variables with inconsistent numbers of samples: [109, 436]

In [21]:

model = LinearRegression()

# Training
model.fit(X_train, y_train)

# Best 𝑤
w = model.coef_

# Predict prices normalized
y_pred_normalized_sklearn = model.predict(X_test)

# Invert the normalization/actual predicted prices
y_pred_sklearn = scaler_target.inverse_transform(y_pred_normalized_sklearn)

# Sean squared error
mse_sklearn = mean_squared_error(scaler_target.inverse_transform(y_test), y_pred_sklearn)