In [59]:
from sklearn.datasets import load_boston

boston_data = load_boston()
print(boston_data['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [60]:
import pandas as pd

In [61]:
X = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
y = boston_data.target

In [62]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
X_train.index

Int64Index([ 33, 283, 418, 502, 402, 368, 201, 310, 343, 230,
            ...
            228,   8,  73, 400, 118, 486, 189, 495, 206, 355],
           dtype='int64', length=404)

In [64]:
X_train.sample(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
254,0.04819,80.0,3.64,0.0,0.392,6.108,32.0,9.2203,1.0,315.0,16.4,392.89,6.57
448,9.32909,0.0,18.1,0.0,0.713,6.185,98.7,2.2616,24.0,666.0,20.2,396.9,18.13
343,0.02543,55.0,3.78,0.0,0.484,6.696,56.4,5.7321,5.0,370.0,17.6,396.9,7.18
206,0.22969,0.0,10.59,0.0,0.489,6.326,52.5,4.3549,4.0,277.0,18.6,394.87,10.97
336,0.03427,0.0,5.19,0.0,0.515,5.869,46.3,5.2311,5.0,224.0,20.2,396.9,9.8
57,0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0,15.1,392.9,3.95
341,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,1.0,284.0,15.5,394.74,5.49
313,0.26938,0.0,9.9,0.0,0.544,6.266,82.8,3.2628,4.0,304.0,18.4,393.39,7.9
493,0.17331,0.0,9.69,0.0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.9,12.01
30,1.13081,0.0,8.14,0.0,0.538,5.713,94.1,4.233,4.0,307.0,21.0,360.17,22.6


In [65]:
y_train

array([13.1, 50. ,  8.8, 20.6, 12.1, 50. , 24.1, 16.1, 23.9, 24.3, 13.1,
       30.3, 15.2, 13.8, 26.4, 16.6, 18.9, 17.6, 18.7, 33.4, 20.7, 17.1,
       23.4, 26.5, 21.4, 21.5, 19.2, 50. , 50. , 23. , 10.5, 17.8, 10.9,
       21. , 13.8, 10.5, 22.2, 30.5, 19.4, 15.6, 20.2, 19.3, 34.6, 50. ,
       24. , 18.7, 19.8, 22.5, 13.3, 50. , 11.8, 11. , 23.7, 35.4, 15.2,
       24.4, 33.4, 31.6, 13.4, 34.9, 14.4, 35.4, 25.3, 18.3, 16.6, 13.4,
       23.6, 27.5, 22.2, 17.7, 14.3, 21.7,  8.4, 15.3, 20.3, 32. , 20. ,
       19.1, 28.7, 46. , 22.6, 23.9, 21.9, 15.6, 50. , 25. , 37.9, 21.6,
       19.3, 17.5, 22.9, 15. , 27.5, 10.2, 23.8, 23.9, 20.1, 16.5, 33.1,
       14.6, 28.4, 23.7, 12.3, 31.5, 22. , 12.5, 35.1, 14.9, 22.9, 22.9,
       19.3, 19.8, 20. , 29.6, 20.5, 29. , 20.7, 19.9, 11.9,  5. , 23.3,
       20.6, 22.9, 19.6, 14.1, 30.8, 43.1, 19.9, 13.9, 22.3, 14.3, 23.9,
       16. , 20.5, 10.2, 20.1, 12.8, 18.9, 22. , 20.4, 17.5, 13.1, 22. ,
       45.4, 18.8, 20. , 20.1, 21.4, 17.4, 21.1, 28

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import numpy as np

lr_model = LinearRegression()

In [67]:
lr_model.fit(X_train, y_train)

LinearRegression()

In [68]:
coefs = lr_model.coef_
coefs

array([-1.30799852e-01,  4.94030235e-02,  1.09535045e-03,  2.70536624e+00,
       -1.59570504e+01,  3.41397332e+00,  1.11887670e-03, -1.49308124e+00,
        3.64422378e-01, -1.31718155e-02, -9.52369666e-01,  1.17492092e-02,
       -5.94076089e-01])

In [69]:
len(coefs)

13

In [70]:
# dir(lr_model)

In [71]:
intercept = lr_model.intercept_
intercept

37.91248700975062

In [72]:
pred_train = lr_model.predict(X_train)

In [73]:
pred_test = lr_model.predict(X_test)

In [74]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Средняя квадратичная ошибка (MSE) модели на обучающей выборке {mean_squared_error(y_train, pred_train)}")
print(f"Средняя квадратичная ошибка (MSE) модели на тестовой выборке {mean_squared_error(y_test, pred_test)}")
print()
print(f"Коэффициент детерминации (R^2) модели на обучающей выборке {r2_score(y_train, pred_train)}")
print(f"Коэффициент детерминации (R^2) модели на тестовой выборке {r2_score(y_test, pred_test)}")

Средняя квадратичная ошибка (MSE) модели на обучающей выборке 22.477090408387628
Средняя квадратичная ошибка (MSE) модели на тестовой выборке 20.869292183770682

Коэффициент детерминации (R^2) модели на обучающей выборке 0.738339392059052
Коэффициент детерминации (R^2) модели на тестовой выборке 0.7334492147453092


In [88]:
#https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

In [89]:
new_model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])

In [90]:
new_model = new_model.fit(X_train, y_train)
# new_model.named_steps['linear'].coef_

In [91]:
pred_train = new_model.predict(X_train)

In [92]:
pred_test = new_model.predict(X_test)

In [93]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Средняя квадратичная ошибка (MSE) модели на обучающей выборке {mean_squared_error(y_train, pred_train)}")
print(f"Средняя квадратичная ошибка (MSE) модели на тестовой выборке {mean_squared_error(y_test, pred_test)}")
print()
print(f"Коэффициент детерминации (R^2) модели на обучающей выборке {r2_score(y_train, pred_train)}")
print(f"Коэффициент детерминации (R^2) модели на тестовой выборке {r2_score(y_test, pred_test)}")

Средняя квадратичная ошибка (MSE) модели на обучающей выборке 5.879378436747233
Средняя квадратичная ошибка (MSE) модели на тестовой выборке 10.217789024814902

Коэффициент детерминации (R^2) модели на обучающей выборке 0.9315569004651907
Коэффициент детерминации (R^2) модели на тестовой выборке 0.8694943908903058
