<div style="font-size:18pt; padding-top:20px; text-align:center">СЕМИНАР. <b>Полиномиальная регрессия</b></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin.study@yandex.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Линейная регрессия</a></li>
        <li><a href="#2">Полиномиальная регрессия</a></li>
        <li><a href="#3">Источники</a></li>
    </ol>
</div>

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">1. Линейная регрессия</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

### Генерация данных

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Sample size and feature values drawn uniformly from [4, 12] (loc=4, scale=8).
# A fixed random_state keeps the data identical after Restart & Run All,
# in line with the seeded draws used for the sine example later in the notebook.
n = 100
x = stats.uniform.rvs(size=n, loc=4, scale=8, random_state=10)
x[:5]

Функция гипотезы:

In [None]:
# Noise-free target: a straight line with intercept 2 and slope 0.3.
y_h = 2 + 0.3*x
y_h[:5]

In [None]:
# Scatter of the noise-free hypothesis values against x.
plt.plot(x, y_h, "o")
plt.title("Initial data")
plt.grid(True)
plt.show()

Выборка

In [None]:
# Observed sample: hypothesis values plus Gaussian noise with mean mu and
# standard deviation sigma.
mu = 0
sigma = 0.5
# Fixed seed so the noisy sample is reproducible on Restart & Run All.
# The value differs from the seed 10 used for x draws in this notebook so the
# noise does not start from the same generator state as the feature values.
y = stats.norm.rvs(size=n, loc=mu, scale=sigma, random_state=11) + y_h
y[:5]

In [None]:
# Scatter of the noisy observations.
plt.plot(x, y, "o")
plt.title("Initial data")
plt.grid(True)
plt.xlabel("X")
plt.ylabel("Y")
plt.show()

<p><b>Формирование обучающего и тестового подмножеств</b></p>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the points for testing. x is reshaped into a single-column
# 2-D array before the split; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x.reshape(-1,1), y, test_size=0.3, random_state=10)
x_train[:5], y_train[:5]

<p><b>Обучение</b></p>

In [None]:
# Unfitted linear regression estimator with default settings.
lr_model = LinearRegression()

In [None]:
# Fit on the training subset; the result of fit() is kept under the same name.
lr_model = lr_model.fit(x_train, y_train)
lr_model

In [None]:
# Show the fitted slope coefficient(s) and the intercept.
print("Коэффициенты (наклон): ", lr_model.coef_)
print("Пересечение: ", lr_model.intercept_)

In [None]:
def f_pred(x):
    """Evaluate the fitted line at x from lr_model's coef_ and intercept_.

    lr_model is looked up at call time, so the function always reflects the
    estimator currently bound to that name.
    """
    return lr_model.coef_ * x + lr_model.intercept_

#### Ошибка на обучающем подмножестве

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

Среднеквадратическая ошибка

In [None]:
# Mean squared error of the hand-written predictor on the training subset.
mse_train = mean_squared_error(y_train, f_pred(x_train))
mse_train

Коэффициент детерминации ($R^2$)

In [None]:
# R^2 of the hand-written predictor on the training subset.
r2_train = r2_score(y_train, f_pred(x_train))
r2_train

График

In [None]:
# Training points, the fitted line, and dotted segments joining every
# observation to its predicted value.
# The former `xx` grid over [0, 5] is not created here: nothing in this cell
# plotted it, and the polynomial section builds its own grid before using it.
plt.figure(1, figsize=[6, 4])

plt.subplot(1,1,1)
plt.title("Train data")
plt.plot(x_train, y_train, "o")
plt.plot(x_train, f_pred(x_train),color="red", lw=2)
plt.vlines(x_train, ymin=y_train, ymax=f_pred(x_train), colors="black", linestyles="dotted", lw=1)
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)

# Render explicitly, consistent with the other plotting cells.
plt.show()

<p><b>Проверка на тестовом подмножестве</b></p>

In [None]:
# Predictions for the held-out points via the estimator's own predict().
y_pred_test = lr_model.predict(x_test)

In [None]:
# Mean squared error of the hand-written predictor on the test subset.
mse_test = mean_squared_error(y_test, f_pred(x_test))
mse_test

In [None]:
# R^2 of the hand-written predictor on the test subset.
r2_test = r2_score(y_test, f_pred(x_test))
r2_test

In [None]:
# The estimator's built-in score() on the test subset, shown for comparison
# with the r2_score value computed from f_pred.
lr_model.score(x_test, y_test)

In [None]:
# Test points, the fitted line, and dotted segments joining every
# observation to its predicted value.
plt.figure(1, figsize=(6, 4))
plt.subplot(1, 1, 1)

plt.plot(x_test, y_test, "o")
plt.plot(x_test, f_pred(x_test), color="red", linewidth=2)
plt.vlines(x_test, ymin=y_test, ymax=f_pred(x_test), colors="black", linestyles="dotted", linewidth=1)

plt.title("Test data")
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)

plt.show()

<p><b>Графики</b></p>

In [None]:
# Side-by-side train / test panels. Both panels are drawn by the same loop
# body instead of two copy-pasted blocks, and the prediction is evaluated
# once per panel rather than once per plotting call.
plt.figure(1, figsize=[12, 4])

for _pos, (_title, _xs, _ys) in enumerate(
    [("Train data", x_train, y_train), ("Test data", x_test, y_test)], start=1
):
    _ys_fit = f_pred(_xs)  # fitted-line values at the sample points

    plt.subplot(1, 2, _pos)
    plt.title(_title)
    plt.plot(_xs, _ys, "o")
    plt.plot(_xs, _ys_fit, color="red", lw=2)
    # Dotted segments join each observation to its predicted value.
    plt.vlines(_xs, ymin=_ys, ymax=_ys_fit, colors="black", linestyles="dotted", lw=1)
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)

plt.show()

### Что, если зависимость имеет следующий вид?

In [None]:
# New data set: x drawn uniformly with loc=0, scale=5; y is sin(x) plus
# zero-mean Gaussian noise with scale 0.2.
# NOTE(review): both draws are seeded with random_state=10, i.e. they start
# from the same generator state — confirm the noise is meant to share the
# seed with x rather than use an independent stream.
n = 100
x = stats.uniform.rvs(size=n, loc=0, scale=5, random_state=10)
y = stats.norm.rvs(size=n, loc=0, scale=0.2, random_state=10) + np.sin(x)

In [None]:
# Scatter of the sine-shaped sample.
plt.plot(x, y, "o")
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)
plt.show()

<p>Разделение исходных данных на обучающее и тестовое подмножества</p>

In [None]:
# 70/30 split of the sine data; x is reshaped into a single-column 2-D array
# and random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x.reshape(-1,1), y, test_size=0.3, random_state=1234)
x_train[:5], y_train[:5]

<p>Обучение</p>

In [None]:
# Fresh linear model fitted to the sine data; fit() hands back the estimator,
# so construction and fitting are chained.
lr_model = LinearRegression().fit(x_train, y_train)

In [None]:
def f_pred(x):
    """Evaluate the fitted line at x from lr_model's coef_ and intercept_.

    lr_model is resolved when the function is called, so it uses whichever
    estimator is bound to that name at that moment.
    """
    return lr_model.coef_ * x + lr_model.intercept_

<p>Тестирование</p>

In [None]:
# Training-subset MSE of the linear fit; the prediction is flattened to 1-D
# before being compared with y_train.
mse_train = mean_squared_error(y_train, f_pred(x_train).flatten())
mse_train

In [None]:
# Training-subset score of the linear fit, taken from the estimator's score().
r2_train = lr_model.score(x_train, y_train)
r2_train

In [None]:
# Test-subset MSE of the linear fit.
mse_test = mean_squared_error(y_test, f_pred(x_test))
mse_test

In [None]:
# Test-subset score of the linear fit, taken from the estimator's score().
r2_test = lr_model.score(x_test, y_test)
r2_test

<p>Графики</p>

In [None]:
# Side-by-side train / test panels for the linear fit on the sine data.
# One loop body draws both panels, and predict() is called once per panel
# instead of three times.
plt.figure(1, figsize=[12, 4])

for _pos, (_title, _xs, _ys) in enumerate(
    [("Train data", x_train, y_train), ("Test data", x_test, y_test)], start=1
):
    _ys_fit = lr_model.predict(_xs)  # model output at the sample points

    plt.subplot(1, 2, _pos)
    plt.title(_title)
    plt.plot(_xs, _ys, "o")
    plt.plot(_xs, _ys_fit, color="red", lw=2)
    plt.plot(_xs, _ys_fit, "o", color="red", lw=2)
    # Dotted segments join each observation to its predicted value.
    plt.vlines(_xs, ymin=_ys, ymax=_ys_fit, colors="black", linestyles="dotted", lw=1)
    plt.grid(True)
    plt.xlabel("X")
    plt.ylabel("Y")

plt.show()

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">2. Полиномиальная регрессия</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from numpy.polynomial.polynomial import polyval

In [None]:
# Same generation recipe and seeds as the sine example above: x drawn
# uniformly with loc=0, scale=5; y is sin(x) plus Gaussian noise (scale 0.2).
# NOTE(review): x and the noise are both seeded with random_state=10 —
# confirm that sharing the seed between the two draws is intended.
n = 100
x = stats.uniform.rvs(size=n, loc=0, scale=5, random_state=10)
y = stats.norm.rvs(size=n, loc=0, scale=0.2, random_state=10) + np.sin(x)

In [None]:
# Scatter of the sample used for the polynomial fit.
plt.plot(x, y, "o")
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)
plt.show()

<p><b>Формирование обучающего и тестового подмножеств</b></p>

In [None]:
# 70/30 split with the same random_state as in the linear fit on this data.
x_train, x_test, y_train, y_test = train_test_split(x.reshape(-1,1), y, test_size=0.3, random_state=1234)
x_train[:5], y_train[:5]

<p><b>Обучение</b></p>

In [None]:
# Expand the single training feature into polynomial terms up to degree 3.
pf = PolynomialFeatures(degree=3)
x_train_poly = pf.fit_transform(x_train)

In [None]:
# Linear regression on the expanded polynomial features; fit() hands back the
# estimator, so construction and fitting are chained.
lr = LinearRegression(fit_intercept=True).fit(x_train_poly, y_train)

In [None]:
# Show the coefficients of the polynomial terms and the intercept.
print("Коэффициенты: ", lr.coef_)
print("Пересечение: ", lr.intercept_)

<p>или можно использовать Pipeline</p>

In [None]:
# Two-step pipeline: polynomial feature expansion followed by linear regression.
pipeline = Pipeline(
    steps=[
        ("plF", PolynomialFeatures(degree=3)),
        ("lr", LinearRegression(fit_intercept=True)),
    ]
)

In [None]:
# Fit the whole pipeline on the raw (non-expanded) training feature.
pipeline = pipeline.fit(x_train, y_train)

In [None]:
# Parameters of the regression step inside the fitted pipeline.
print("Коэффициенты: ", pipeline.named_steps["lr"].coef_)
print("Пересечение: ", pipeline.named_steps["lr"].intercept_)

In [None]:
def f_pred(x):
    """Evaluate the fitted polynomial at x from the pipeline's regression step.

    The step's coef_ is passed to numpy's polyval as the coefficient vector
    and the step's intercept_ is added to the result.
    NOTE(review): this presumes coef_ is ordered from the constant term
    upwards, as polyval expects — confirm against PolynomialFeatures' output
    column order.
    """
    regressor = pipeline.named_steps["lr"]
    return polyval(x, regressor.coef_) + regressor.intercept_

Ошибки на обучающем подмножестве

In [None]:
# Training-subset MSE of the polynomial fit; the prediction is flattened to
# 1-D before being compared with y_train.
mse_train = mean_squared_error(y_train, f_pred(x_train).flatten())
mse_train

In [None]:
# Training-subset score of the polynomial fit, taken from the pipeline's score().
r2_train = pipeline.score(x_train, y_train)
r2_train

In [None]:
# Evenly spaced grid on [0, 5] used to draw the fitted curve; later plotting
# cells reuse it.
xx = np.linspace(0, 5, num=100)

# Observations, fitted curve, predicted points, and dotted segments joining
# each observation to its prediction.
plt.plot(x_train, y_train, "o")
plt.plot(xx, f_pred(xx), color="red", linewidth=2)
plt.plot(x_train, f_pred(x_train), "o", color="red", linewidth=2)
plt.vlines(x_train, ymin=y_train, ymax=f_pred(x_train), colors="black", linestyles="dotted", linewidth=1)

plt.title("Train data")
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)
plt.show()

<p><b>Проверка на тестовом подмножестве</b></p>

In [None]:
# Predictions for the held-out points via the fitted pipeline.
y_pred_test = pipeline.predict(x_test)

In [None]:
# Mean squared error on the TEST subset. It is stored as mse_test so that the
# training-set value held in mse_train is not overwritten by a test metric.
mse_test = mean_squared_error(y_test, y_pred_test)
mse_test

In [None]:
# Test-subset score of the polynomial fit, taken from the pipeline's score().
pipeline.score(x_test, y_test)

In [None]:
# Test observations, fitted curve over the xx grid, predicted points, and
# dotted segments joining each observation to its prediction.
plt.plot(x_test, y_test, "o")
plt.plot(xx, f_pred(xx), color="red", linewidth=2)
plt.plot(x_test, f_pred(x_test), "o", color="red", linewidth=2)
plt.vlines(x_test, ymin=y_test, ymax=f_pred(x_test), colors="black", linestyles="dotted", linewidth=1)

plt.title("Test data")
plt.xlabel("X")
plt.ylabel("Y")
plt.grid(True)
plt.show()

<p><b>Графики</b></p>

In [None]:
# Side-by-side train / test panels for the polynomial fit. One loop body draws
# both panels; the curve over the xx grid is evaluated once for the figure and
# the point predictions once per panel.
plt.figure(1, figsize=[12, 4])

_curve = f_pred(xx)  # fitted curve on the dense grid, shared by both panels

for _pos, (_title, _xs, _ys) in enumerate(
    [("Train data", x_train, y_train), ("Test data", x_test, y_test)], start=1
):
    _ys_fit = f_pred(_xs)  # model output at the sample points

    plt.subplot(1, 2, _pos)
    plt.title(_title)
    plt.plot(_xs, _ys, "o")
    plt.plot(xx, _curve, color="red", lw=2)
    plt.plot(_xs, _ys_fit, "o", color="red", lw=2)
    # Dotted segments join each observation to its predicted value.
    plt.vlines(_xs, ymin=_ys, ymax=_ys_fit, colors="black", linestyles="dotted", lw=1)
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)

plt.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Источники</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html