# Линейная регрессия

Подключение необходимых библиотек

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Определение функции, описывающей линейную зависимость

In [None]:
def linEquation(w,x,b):
    """Evaluate the linear model ``w * x + b``.

    Args:
        w: Slope (weight) of the line.
        x: Input value(s); a scalar or a NumPy array.
        b: Intercept (bias) of the line.

    Returns:
        The value(s) of the line at ``x``.
    """
    scaled = w * x
    return scaled + b

Объявление переменных:


* numSamples = количество объектов в выборке
* noisePower = уровень шума
* linearCoef = значение веса w
* linearBias = значение свободного члена b



In [None]:
# Ground-truth parameters of the generating line y = w * x + b
linearCoef = 5   # weight w
linearBias = 3   # intercept b

# Sampling parameters
numSamples = 500  # number of generated objects
noisePower = 7    # multiplier applied to the standard normal noise

Генерация датасета

In [None]:
# Evenly spaced feature values on [-5, 5]
X = np.linspace(start=-5, stop=5, num=numSamples)
# Target = Gaussian noise scaled by noisePower, added to the exact line
Y = np.random.randn(numSamples) * noisePower + linEquation(linearCoef, X, linearBias)
# scikit-learn expects a 2-D design matrix: one column, numSamples rows
X = X[:, np.newaxis]
# Hold out 20% of the objects for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

Визуализация сгенерированного датасета

In [None]:
# Scatter plot of the generated objects, coloured by split
plt.figure(figsize=(10, 7))
plt.title("Generated dataset")
plt.grid(alpha=0.2)
# Uncomment to overlay the noise-free generating line:
# plt.plot(X, linEquation(linearCoef, X, linearBias), label='Regression line', c='black')
plt.scatter(X_train, Y_train, label='Train samples', c='blue')
plt.scatter(X_test, Y_test, label='Test samples', c='orange')
plt.legend()
plt.show()

Стандартизация данных

In [None]:
# Learn mean/std on the training split only, then apply the same
# transform to both splits so no test statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Обучение модели и получение прогнозов на тестовой выборке


In [None]:
# linear regression by OLS method
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
#print(Y_pred)

Отображение коэффициентов, полученных моделью

In [None]:
# Compare the fitted parameters (rounded to 2 decimals) with the
# values used to generate the data.
print(
    'Estimated weight is', np.round(model.coef_, 2),
    'true weight is', linearCoef,
)
print(
    'Estimated bias is', np.round(model.intercept_, 2),
    'true bias is', linearBias,
)

Оценка метрик на тестовом датасете

In [None]:
# Mean squared error and coefficient of determination on the test split
MSE_test, MSE_r2_test = (
    metrics.mean_squared_error(Y_test, Y_pred),
    metrics.r2_score(Y_test, Y_pred),
)
print('Mean squared error:', MSE_test)
print('R2 score:', MSE_r2_test)

Визуализация оценок обученной модели

In [None]:
# Figure set-up: title, axis labels and a faint grid
plt.figure(figsize=(10, 7))
plt.title("Regression line on train and test set")
plt.xlabel('feature')
plt.ylabel('target')
plt.grid(alpha=0.2)

# Noise-free generating line
plt.plot(X, linEquation(linearCoef, X, linearBias), label='real', c='g')
# Observed objects of both splits
plt.scatter(X_train, Y_train, label='train')
plt.scatter(X_test, Y_test, label='test')
# Model predictions on each split (dotted)
plt.plot(X_train, model.predict(X_train), label='Train', c='r', linestyle=':')
plt.plot(X_test, Y_pred, label='Test', c='black', linestyle=':')

plt.legend()
plt.show()

Сгенерируем датасет с несколькими признаками

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

Переопределим переменные

In [None]:
# New ground-truth parameters for the multi-feature experiment
linearBias = 1.5   # intercept b
linearCoef = 6.66  # weight w of the informative feature

Создадим один информативный признак X1 <br>
Признаки X2 – X5 — зашумлённые линейные функции исходного признака X1 (X1, умноженный на коэффициент, плюс гауссов шум) <br>
Признак X6 - белый шум

In [None]:
# Fix the global NumPy RNG so the draws below are repeatable
np.random.seed(42)
numSamples = 500

# Informative feature: a jittered grid on [-10, 10]
X1 = np.random.randn(numSamples) + np.linspace(-10, 10, numSamples)

# Features correlated with X1: a scaled copy of X1 plus Gaussian noise.
# The random draws happen in the order X2, X3, X4, X5.
X2 = np.random.randn(numSamples) * 5.5 + 2.2 * X1
X3 = np.random.randn(numSamples) * 4.4 + 3.3 * X1
X4 = np.random.randn(numSamples) * 3.3 + 4.4 * X1
X5 = np.random.randn(numSamples) * 2.2 + 5.5 * X1

# Pure-noise feature, generated independently of X1
X6 = np.random.randn(numSamples) * 20

# Target depends on X1 only, with a small amount of Gaussian noise
Y = np.random.randn(numSamples) * 1.1 + linEquation(linearCoef, X1, linearBias)

# Stack the six features as columns of the design matrix
X = np.column_stack((X1, X2, X3, X4, X5, X6))
print('Design matrix size is', X.shape)

Построение матрицы корреляции

In [None]:
# Pairwise Pearson correlations between the columns (features) of X
correlation_matrix = np.corrcoef(X, rowvar=False)

feature_labels = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
tick_positions = range(6)

# Render the matrix as a heatmap with a colour scale
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar()
plt.xticks(tick_positions, feature_labels)
plt.yticks(tick_positions, feature_labels)
plt.title('Correlation Matrix Heatmap')
plt.show()

Обучение и оценка модели

In [None]:
# 80/20 train/test split of the multi-feature dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

# Z-score the features using statistics of the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ordinary least squares on the raw (unscaled) features
modelOLS = LinearRegression().fit(X_train, Y_train)
Y_pred = modelOLS.predict(X_test)

# Quality on the held-out split
MSE_test, MSE_r2_test = (
    metrics.mean_squared_error(Y_test, Y_pred),
    metrics.r2_score(Y_test, Y_pred),
)
print('Mean squared error:', MSE_test)
print('R2 score:', MSE_r2_test)

Вывод коэффициентов модели

In [None]:
# Fitted OLS parameters: one weight per feature column, plus the intercept
print(
    'Estimated weights is',
    modelOLS.coef_,
)
print(
    'Estimated bias is',
    modelOLS.intercept_,
)

In [None]:
# Linear regression trained by stochastic gradient descent
modelSGD = SGDRegressor(
    loss='squared_error',      # least-squares objective
    penalty='l2',              # L2 regularisation term
    learning_rate='constant',  # keep the step size fixed ...
    eta0=0.01,                 # ... at this value
    max_iter=10,               # number of passes over the training data
    tol=None,                  # no tolerance-based stopping
    random_state=42,           # fixed seed for the estimator
)

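loss - the loss function to be minimized <br>
penalty - the regularization term added to the loss <br>
tol - the stopping criterion (None disables early stopping, so training runs for exactly max_iter epochs) <br>
eta0 - the initial learning rate (kept fixed when learning_rate = 'constant') <br>
learning_rate - the learning rate schedule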

In [None]:
# Train the SGD model on the standardised training features
modelSGD.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Quality of the SGD model on the TRAINING split.
# The names and printed labels are train-specific so this output cannot
# be mistaken for the test-set metrics.
Y_pred_sgd_train = modelSGD.predict(X_train_scaled)
MSE_train = metrics.mean_squared_error(Y_train, Y_pred_sgd_train)
R2_train = metrics.r2_score(Y_train, Y_pred_sgd_train)
print('Train mean squared error:', MSE_train)
print('Train R2 score:', R2_train)

In [None]:
# Quality of the SGD model on the held-out (test) split
Y_pred_sgd = modelSGD.predict(X_test_scaled)
MSE_test, MSE_r2_test = (
    metrics.mean_squared_error(Y_test, Y_pred_sgd),
    metrics.r2_score(Y_test, Y_pred_sgd),
)
print('Mean squared error:', MSE_test)
print('R2 score:', MSE_r2_test)

In [None]:
# Test-set predictions of both models; each model receives the feature
# representation it was trained on (raw for OLS, standardised for SGD).
OLS_pred = modelOLS.predict(X=X_test)
SGD_pred = modelSGD.predict(X=X_test_scaled)

In [None]:
# Compare OLS and SGD predictions on the test split, plotted against the
# first feature (shown both in raw and in standardised units).
#
# The test rows are in random order after the split, and plt.plot joins
# points in the order given. Drawing the predictions unsorted produces a
# tangle of criss-crossing segments, so sort by the first feature first.
# Standardisation is a monotonically increasing per-column transform, so
# the same ordering is valid for the raw and the scaled column.
order = np.argsort(X_test[:, 0])

plt.figure(figsize=(10, 7))
plt.scatter(X_test_scaled[:,0], Y_test, label ='Scaled sample distribution')
plt.scatter(X_test[:,0], Y_test, label ='Unscaled sample distribution')
plt.plot(X_test_scaled[order, 0], SGD_pred[order], label='SGD', c='red', linestyle='solid')
plt.plot(X_test[order, 0], OLS_pred[order], label='OLS', c='black', linestyle='solid')
plt.title("OLS vs SGD")
plt.ylabel('target')
plt.xlabel('feature')
plt.grid(alpha = 0.2)
plt.legend()
plt.show()