In [88]:
import numpy as np
from sympy import symbols, diff
import math
# Single seed reused below as the model's default random_state and for the train/test split.
SEED = 17
np.random.seed(SEED)  # seed NumPy's global generator so random draws repeat between runs

Создадим вручную класс модели линейной регрессии. Объекту класса можно подавать разные параметры и менять модель.

In [113]:
class SimpleLinearRegression:
    """
    One-feature linear regression y ~ w_1 * x + w_0 trained by batch gradient descent.

    Parameters
    ----------
    step : learning rate, i.e. the multiplier applied to the gradient on every update
    tol : training stops once the gradient norm drops below this threshold
    max_iter : upper bound on the number of gradient-descent updates
    verbose : when True, print iteration index, MSE and gradient norm after every update
    random_state : seed for the private generator that draws the initial weights
    """

    def __init__(self, step = 0.01, tol = 1e-4, max_iter=2000, verbose=False, random_state=SEED):
        self.max_iter = max_iter # max iter count of gradient descent
        self.step = step # step of descent in the direction of antigradient
        self.tol = tol # we compare norm of gradient with that threshold
        self._w = None # w_1 (slope), set by fit()
        self._intercept = None # w_0 (bias), set by fit()
        self.random_state = random_state
        self.verbose = verbose

    def predict(self, X):
        """
        Estimate target variable "y" from the feature values X.

        X must expose ``.shape`` (NumPy array, pandas Series, ...); the result
        has the same container type and length as X.
        """
        y_pred = self._w * X + self._intercept
        # sanity check: exactly one prediction per input row
        assert y_pred.shape[0] == X.shape[0]
        return y_pred

    def score(self, X, y):
        """
        Mean squared error of the current model.

        X - features
        y - true values of target variable
        """
        return np.mean((y - self.predict(X))**2)

    def _gradient(self, X, y):
        """
        Gradient of the halved MSE with respect to (w_1, w_0).

        The conventional factor 2 of the MSE derivative is omitted, so these
        values are the partial derivatives of MSE / 2; the constant is
        effectively folded into the learning rate ``step``.

        X - features
        y - true values of target variable

        Returns the tuple (grad_w, grad_intercept).
        """
        # the residual is shared by both partial derivatives, so compute it once
        residual = y - self._w*X - self._intercept
        grad_intercept = -np.mean(residual)
        grad_w = -np.mean(residual*X)
        return grad_w, grad_intercept

    def fit(self, X, y):
        """
        Train model with gradient descent.

        X - features
        y - true values of target variable

        Prints 'model converged' when the gradient norm falls below ``tol``
        within ``max_iter`` updates and 'model did not converge' otherwise.
        Returns self in both cases.
        """
        # For reproducible results the initial weights come from a private
        # generator seeded with random_state. This yields the same numbers as
        # seeding the global generator with that value, but does not reset
        # NumPy's global random state for the rest of the notebook.
        rng = np.random.RandomState(self.random_state)

        # initialize weights
        self._w, self._intercept = rng.randn(2)

        # perform gradient descent
        for iteration in range(self.max_iter):
            # compute gradient at current W
            grad_w, grad_intercept = self._gradient(X, y)

            # make step, update W
            self._w -= self.step * grad_w
            self._intercept -= self.step * grad_intercept

            # norm of the gradient that was just used for the update
            grad_norm = math.sqrt(grad_w**2 + grad_intercept**2)

            # people like to watch how the error is reducing during iterations
            if self.verbose:
                print(iteration, self.score(X, y), grad_norm)

            # compare gradient norm with threshold
            if grad_norm < self.tol:
                print('model converged')
                return self
        print('model did not converge')
        return self

Для Boston dataset мы хотим научиться предсказывать значение target по признаку CRIM.

In [93]:
import pandas as pd
from sklearn import datasets

In [94]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 — confirm the
# installed version still provides it before re-running this cell.
boston_data = datasets.load_boston()  # dict-like container; its keys are listed by the next line
boston_data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

Представим данные в виде датафрейма.

In [95]:
# Wrap the feature matrix in a DataFrame (one named column per feature)
# and append the target values as an extra column.
df = pd.DataFrame(
    data=boston_data['data'],
    columns=boston_data['feature_names'],
).assign(target=boston_data['target'])
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


Посмотрим на описание данных.

In [6]:
# Print the plain-text description stored under the dataset's 'DESCR' key.
print(boston_data['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [7]:
# helper that computes the mean squared error (MSE)
def mse_score(y_true, y_pred):
    """
    Mean squared error between observed and predicted targets.

    y_true - true values of target variable
    y_pred - predicted values of target variable
    """
    squared_errors = (y_true - y_pred) ** 2
    return np.mean(squared_errors)

Разделим наш датасет на 2 части: на первой части мы будем обучаться, на второй оценивать качество работы алгоритма на новых данных.

In [96]:
from sklearn.model_selection import train_test_split

# split the dataset 60:40 into train and test parts (shuffled, fixed seed)
df_train, df_test = train_test_split(
    df,
    test_size=0.4,
    shuffle=True,
    random_state=SEED,
)

Построим и обучим предсказывающую модель.

In [114]:
# verbose=True makes fit() print the iteration index, MSE and gradient norm on every step.
model = SimpleLinearRegression(verbose=True)

In [115]:
# Fit on the single feature CRIM against the target. fit() returns the model itself,
# which the notebook echoes as the cell's result after the training log.
model.fit(df_train['CRIM'], df_train['target'])

0 628.3695075298089 43.87810412550131
1 617.5357623351317 24.368769448045136
2 608.5900017657832 21.309670260690854
3 599.9597012563474 20.827029974127207
4 591.4854120719277 20.62722786561055
5 583.1491711373696 20.45744458963037
6 574.9472107002172 20.2919017897747
7 566.8772156860807 20.127985551672246
8 558.9370476137532 19.96542237701176
9 551.1246163539228 19.804175062596226
10 543.4378667983856 19.644230329463028
11 535.8747770400587 19.48557738995869
12 528.4333577130735 19.328205784148917
13 521.1116514566846 19.172105160857505
14 513.9077323991248 19.017265254957856
15 506.819705650905 18.86367588447498
16 499.84570680638 18.711326949691706
17 492.9839014533387 18.560208432461877
18 486.23248469048724 18.410310395549168
19 479.5896806526991 18.261622981973602
20 473.0537420438921 18.11413641436317
21 466.62294967743304 17.967840994311054
22 460.2956120239282 17.822727101737755
23 454.07006476628715 17.678785194258502
24 447.9446703619517 17.536005806555817
25 441.917817612161

512 73.44963674260265 0.33521416732285186
513 73.44743445542697 0.33250687306967247
514 73.44526759732037 0.32982144376997063
515 73.44313559832138 0.3271577028355549
516 73.44103789763778 0.3245154751044182
517 73.43897394349895 0.3218945868292179
518 73.43694319301059 0.31929486566584225
519 73.43494511201261 0.3167161406620916
520 73.43297917493788 0.3141582422464254
521 73.43104486467429 0.3116210022168192
522 73.42914167242874 0.30910425372969796
523 73.42726909759338 0.30660783128896757
524 73.42542664761375 0.3041315707351322
525 73.42361383785925 0.3016753092344991
526 73.42183019149584 0.29923888526847325
527 73.42007523936041 0.2968221386229316
528 73.41834851983762 0.2944249103776914
529 73.4166495787381 0.29204704289606265
530 73.41497796917939 0.2896883798144732
531 73.41333325146803 0.2873487660321901
532 73.41171499298439 0.2850280477011295
533 73.41012276806809 0.2827260722157294
534 73.40855615790692 0.28044268820291707
535 73.40701475042611 0.2781777455121547
536 73.4

835 73.3134682458382 0.024423105060441493
836 73.31345655539495 0.024225856440244255
837 73.3134450530206 0.024030200861476808
838 73.31343373568953 0.023836125458236632
839 73.3134226004249 0.023643617468532906
840 73.31341164429756 0.02345266423344777
841 73.31340086442582 0.023263253196295672
842 73.31339025797426 0.02307537190180274
843 73.31337982215287 0.022889007995291588
844 73.31336955421675 0.02270414922186539
845 73.31335945146486 0.02252078342559617
846 73.31334951124003 0.02233889854873759
847 73.3133397309276 0.022158482630920056
848 73.31333010795477 0.0219795238083714
849 73.31332063979077 0.021802010313134768
850 73.31331132394469 0.02162593047229797
851 73.31330215796638 0.021451272707220935
852 73.31329313944487 0.021278025532771947
853 73.31328426600788 0.02110617755658555
854 73.31327553532135 0.020935717478296843
855 73.31326694508884 0.02076663408881055
856 73.31325849305081 0.020598916269555165
857 73.3132501769842 0.02043255299176142
858 73.31324199470127 0.020

1166 73.31274495061943 0.0016676522101236185
1167 73.31274489611395 0.0016541837303140517
1168 73.31274484248527 0.001640824026151211
1169 73.31274478971949 0.0016275722191305163
1170 73.31274473780243 0.0016144274378377458
1171 73.31274468672065 0.0016013888179032397
1172 73.3127446364606 0.0015884555019329183
1173 73.31274458700906 0.001575626639459366
1174 73.31274453835312 0.0015629013868828584
1175 73.31274449047999 0.0015502789074172496
1176 73.3127444433769 0.0015377583710372937
1177 73.31274439703164 0.0015253389544168984
1178 73.31274435143192 0.0015130198408815445
1179 73.31274430656588 0.0015008002203549798
1180 73.31274426242155 0.0014886792892981836
1181 73.31274421898738 0.0014766562506632766
1182 73.31274417625193 0.0014647303138425123
1183 73.31274413420401 0.001452900694610145
1184 73.3127440928325 0.0014411666150748023
1185 73.31274405212665 0.0014295273036276576
1186 73.31274401207547 0.0014179819948930098
1187 73.31274397266868 0.0014065299296773194
1188 73.31274393

<__main__.SimpleLinearRegression at 0xc9c4b70>

## Оценим качество модели.

In [116]:
# error on the data the model was fitted on
mse_train_score = mse_score(
    y_true=df_train['target'],
    y_pred=model.predict(df_train['CRIM']),
)
print('MSE on train:', mse_train_score)

MSE on train: 73.31274157452383


In [118]:
# error on the held-out part, i.e. data the model has not seen during fitting
mse_test_score = mse_score(
    y_true=df_test['target'],
    y_pred=model.predict(df_test['CRIM']),
)
print('MSE on test:', mse_test_score)

MSE on test: 74.28787061058355
