In [87]:
import pandas as pd
import numpy as np

# Forward slashes are accepted on Windows, Linux and macOS alike; the previous
# backslash-separated literal only resolved on Windows.
dataset_path = "data/sberbank_russian_housing_market_price_doc.csv"

# Model: f is the house price, approximated as a linear combination of features
#   f = x1*w1 + x2*w2 + x3*w3 + x4*w4 + epsilon
#
# where
#   x1 - floor number,
#   x2 - area in square metres,
#   x3 - the district (5 distinct values in this example; later it may be
#        replaced by the distance from the centre),
#   x4 - old/new
#
# NOTE: the cells below actually select the columns 'floor', 'num_room',
# 'kitch_sq' and 'material' as the four features.

data = pd.read_csv(dataset_path)

In [88]:
#data

In [89]:
# Impute missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns: on pandas >= 2.0 a
# plain data.mean() raises TypeError if the frame holds any non-numeric column,
# while older versions silently skipped such columns (same result as here).
# Rebinding instead of inplace=True keeps the cell safely re-runnable.
data = data.fillna(value=data.mean(numeric_only=True))

In [90]:
# Sanity check of the frame's dimensions: (rows, columns)
data.shape

(30471, 292)

In [91]:
# Regression target: price_doc for the rows at positions 30000..30099,
# taken as a plain NumPy array (so no pandas index is attached).
target = data["price_doc"].to_numpy()[30000:30100]

In [92]:
# Keep only the four columns used as model features.
data = data.loc[:, ['floor', 'num_room', 'kitch_sq', 'material']]

In [93]:
# Keep the same 100 rows that `target` was taken from.
# The frame still carries the default integer index produced by read_csv, so the
# label slice 30000..30099 (inclusive on both ends) selects exactly the rows that
# the positional slice iloc[30000:30100] did. Unlike the positional slice, it
# stays correct when this cell is re-run on the already-reduced frame (where
# iloc[30000:30100] would return an empty frame).
data = data.loc[30000:30099, :]
data

Unnamed: 0,floor,num_room,kitch_sq,material
30000,9.0,3.0,10.0,1.0
30001,5.0,2.0,6.0,2.0
30002,16.0,1.0,22.0,4.0
30003,4.0,1.0,5.0,1.0
30004,11.0,3.0,6.0,1.0
...,...,...,...,...
30095,2.0,2.0,5.0,1.0
30096,13.0,2.0,10.0,1.0
30097,12.0,1.0,0.0,1.0
30098,14.0,3.0,9.0,1.0


In [94]:
# Confirm which feature columns remain after the selection above
data.columns

Index(['floor', 'num_room', 'kitch_sq', 'material'], dtype='object')

In [137]:
class MyLinearRegressor:
    """Linear regression fitted with full-batch gradient descent.

    The model is ``prediction = sum_j x[:, j] * W[j]``. There is no implicit
    intercept: include a column of ones in the features to learn a bias term.

    Parameters
    ----------
    d : pandas.DataFrame or 2-D numpy.ndarray, shape (m, ncols)
        Feature matrix. Must be numeric; missing values should be imputed
        beforehand because NaN propagates into every weight.
    target : 1-D array-like of length m
        Values to fit. Matched to the rows of ``d`` by position; a pandas
        index on ``target`` is ignored, so differently indexed inputs cannot
        silently turn into NaN through index alignment.
    alpha : float, default 0.003
        Learning rate (step size) of the gradient-descent update.

    Attributes
    ----------
    W : list
        Current weights, one per feature column, initialised to ones.
    m : int
        Number of rows in ``d``.
    ncols : int
        Number of feature columns in ``d``.
    """

    def __init__(self, d, target, alpha=0.003):
        self.data = d
        self.target = target
        self.alpha = alpha
        self.m = d.shape[0]
        self.ncols = d.shape[1]
        self.W = [1] * self.ncols

    def _arrays(self):
        """Return (features, target) as float ndarrays, checking row counts agree."""
        features = np.asarray(self.data, dtype=float)
        labels = np.asarray(self.target, dtype=float).reshape(-1)
        if labels.shape[0] != features.shape[0]:
            raise ValueError(
                "target has %d values but data has %d rows"
                % (labels.shape[0], features.shape[0])
            )
        return features, labels

    def _gradient(self, features, labels, weights):
        """Return the gradient vector (1/m) * X^T (X w - y) for all weights at once."""
        residuals = features @ weights - labels
        return features.T @ residuals / self.m

    def predict(self, x):
        """Return the weighted row sums of ``x`` using the current weights.

        ``x`` must be 2-D with ``ncols`` columns. A DataFrame input yields a
        Series carrying the same index; an ndarray input yields an ndarray.
        """
        return (x * np.asarray(self.W)).sum(axis=1)

    def mse(self):
        """Return the mean squared error of the current weights on the training data."""
        features, labels = self._arrays()
        residuals = features @ np.asarray(self.W, dtype=float) - labels
        return np.sum(residuals ** 2) / self.m

    def derivative(self, i):
        """Return the partial derivative of the loss with respect to weight ``i``.

        The value is ``(1/m) * sum(residual * x_i)``, i.e. the gradient of
        *half* the mean squared error; the factor 2 of the plain MSE gradient
        is left to be absorbed by the learning rate.
        """
        features, labels = self._arrays()
        residuals = features @ np.asarray(self.W, dtype=float) - labels
        return residuals @ features[:, i] / self.m

    def fit(self, num_of_iterations=100):
        """Run ``num_of_iterations`` gradient-descent steps, then print the weights.

        Every step computes the residuals once and updates all weights
        simultaneously from them. Training continues from the current ``W``,
        so calling ``fit`` again resumes rather than restarts.
        """
        features, labels = self._arrays()
        weights = np.asarray(self.W, dtype=float)
        for _ in range(num_of_iterations):
            weights = weights - self.alpha * self._gradient(features, labels, weights)
            # Stored as a plain list of Python floats after every step so that
            # W keeps its original type and an interrupted run keeps its progress.
            self.W = weights.tolist()
        print(self.W)

In [121]:
# Regressor for the housing subset, with alpha=0.0003 rather than the default 0.003
# (presumably lowered to keep the descent stable on unscaled features - TODO confirm).
optimizer = MyLinearRegressor(d=data, target=target, alpha=0.0003)

In [122]:
# Run 2000 gradient-descent steps; fit() prints the resulting weights.
optimizer.fit(2000)

[184433.2615874342, 2100924.302660814, 396986.65815509815, 13114.166546209399]


In [119]:
# Signed error per row on the training subset: prediction minus actual price.
optimizer.predict(data).sub(target)

30000    6.556530e+05
30001   -3.467837e+06
30002    1.621142e+06
30003    1.736705e+06
30004    3.226573e+06
             ...     
30095    4.187626e+05
30096    1.082462e+06
30097    4.612926e+05
30098   -4.779167e+06
30099   -7.126787e+05
Length: 100, dtype: float64

# Test on generated data

In [124]:
# Constant column of 100 ones, used as the intercept feature of the synthetic data.
bias = np.full(100, 1.0)
bias

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [125]:
# number of rooms (synthetic): 100 integers drawn uniformly from [1, 19].
# Seeding NumPy's global generator at the first draw makes this and the
# following random feature cells reproducible under Restart & Run All.
np.random.seed(42)
x1 = np.random.randint(low=1, high=20, size=100)

In [126]:
# floor (synthetic): 100 integers drawn uniformly from [1, 19]
x2 = np.random.randint(1, 20, size=100)

In [127]:
# square (synthetic): 100 integers drawn uniformly from [1, 19]
x3 = np.random.randint(1, 20, size=100)

In [128]:
# old / new (synthetic stand-in: 100 integers from [1, 19], not a binary flag)
x4 = np.random.randint(1, 20, size=100)

In [129]:
# Inspect the generated values of the fourth synthetic feature
x4

array([16, 18, 11, 11, 13,  5,  3,  3, 13, 18,  4, 17, 18, 13, 12, 16,  5,
       10, 13, 19, 14, 17, 19,  4,  3,  1, 17,  3, 10, 18, 10, 13,  2,  2,
       13, 18, 10,  6,  4, 14, 14,  2,  9, 15,  4, 14, 16, 16,  9, 17,  7,
        9, 19,  6, 10,  8, 11, 11,  8,  1,  3, 10,  2,  8,  5, 17, 12,  9,
       17, 17, 19,  4, 19, 11,  2, 11, 11,  4,  8,  9,  2, 10, 19, 10,  3,
       17, 19, 12, 16,  7,  8, 13,  7, 16,  6, 14, 13, 13,  2,  3])

In [130]:
# Ground-truth coefficients of the synthetic model:
# w0 multiplies the bias column, w1..w4 multiply x1..x4.
w0, w1, w2, w3, w4 = 1, 7, 3, 9, 6

In [138]:
# Design matrix: the column of ones first, then the four synthetic features.
X = pd.DataFrame(np.column_stack([bias, x1, x2, x3, x4]))
X

Unnamed: 0,0,1,2,3,4
0,1.0,19.0,7.0,19.0,16.0
1,1.0,16.0,12.0,7.0,18.0
2,1.0,16.0,4.0,5.0,11.0
3,1.0,14.0,13.0,15.0,11.0
4,1.0,10.0,12.0,2.0,13.0
...,...,...,...,...,...
95,1.0,7.0,13.0,4.0,14.0
96,1.0,7.0,10.0,13.0,13.0
97,1.0,8.0,15.0,9.0,13.0
98,1.0,12.0,19.0,18.0,2.0


In [139]:
# Noise-free target built from the known weights: y = w0 + w1*x1 + w2*x2 + w3*x3 + w4*x4
y = np.column_stack((x1, x2, x3, x4)) @ np.array([w1, w2, w3, w4]) + w0

In [140]:
# Regressor for the synthetic data, using the default learning rate.
lr = MyLinearRegressor(d=X, target=y)

In [142]:
# Run 100 gradient-descent steps; the printed weights can be compared with w0..w4.
lr.fit(100)

[1.4490753836920731, 6.990790873640839, 2.9898322346531274, 8.990509747838058, 5.988663518296523]
