In [1]:
import pandas as pd
import sklearn.datasets as ds
import sklearn.linear_model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

### na przykładzie pojedynczego problemu regresji

In [2]:
diabetes_ds = ds.load_diabetes()
X = pd.DataFrame(diabetes_ds.data, columns=diabetes_ds.feature_names)
y = pd.DataFrame(diabetes_ds.target, columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
y_test = y_test['target'].tolist()

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


przed wykonaniem regularyzacji Ridge, dane muszą zostać przeskalowane

In [4]:
X_train_nparray = np.asarray(X_train)
scaler = MinMaxScaler().fit(X_train_nparray)
X_train = scaler.transform(X_train_nparray)

X_test_nparray = np.asarray(X_test)
X_test = scaler.transform(X_test_nparray)

In [5]:
baseline = sklearn.linear_model.LinearRegression()
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)
print(y_test[:10])
print(y_pred.flatten().tolist()[:10])

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[57.66994164941771, 92.5669147845039, 276.0305611420969, 105.12401147272506, 225.63813908748676, 132.05303916858986, 159.91857089976958, 78.21910562919085, 85.36430816760998, 174.77803259668298]


regularyzatory dostępne w sklearn

- Ridge regression - regularyzacja dodana do funkcji straty

model Ridge z sklearn.linear_model to nałożenie na model LinearRegression regularyzacji Ridge (L2)

In [6]:
ridge = sklearn.linear_model.Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(y_test[:10])
print(y_pred.flatten().tolist()[:10])

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[61.392232892853556, 99.67174525631674, 270.25021561119365, 104.81779872821033, 223.58760511326727, 130.6807129617208, 159.99923964752713, 81.95813085635828, 88.10218993224152, 170.55005546792955]


- Lasso regression - regularyzacja dodana do funkcji straty

model Lasso to nałożenie regularyzacji Lasso (L1)

In [7]:
lasso = sklearn.linear_model.Lasso()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(y_test[:10])
print(y_pred[:10])

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[ 75.4009677  108.999099   250.21134843 114.19457643 213.2431769
 134.22980129 164.33825036  85.41521422  92.33061957 165.86335611]


- Elastic Net Regression

model ElasticNet to nałożenie regularyzacji L1 i L2

In [8]:
elastic_net = sklearn.linear_model.ElasticNet(l1_ratio=0.5)
elastic_net.fit(X_train, y_train)
y_pred = elastic_net.predict(X_test)
print(y_test[:10])
print(y_pred[:10])

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[139.13497884 150.39187672 167.21055464 148.66127031 169.04126661
 151.43837431 159.13612499 141.54313437 141.70321622 156.05494383]


### Early Stopping

Modele SGDRegressor i SGDClassifier posiadają jako jeden z parametrów early_stopping. Poniżej dla przykładu regresji:

- bez early stopping

In [9]:
sgd = sklearn.linear_model.SGDRegressor()
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
y_pred = y_pred.flatten().tolist()
print(y_test[:10])
print(y_pred[:10])
print('MSE:', mean_squared_error(y_test, y_pred))

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[59.95842096020769, 105.16754207277283, 267.93439171631013, 102.7044788382104, 229.00281785452398, 127.01735067431977, 162.21729140861794, 80.84332388849322, 84.20515996004775, 167.26194504669343]
MSE: 3328.6432659264533


  y = column_or_1d(y, warn=True)


- z early stopping

In [10]:
sgd = sklearn.linear_model.SGDRegressor(early_stopping=True)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
y_pred = y_pred.flatten().tolist()
print(y_test[:10])
print(y_pred[:10])
print('MSE:', mean_squared_error(y_test, y_pred))

[96.0, 118.0, 303.0, 88.0, 248.0, 53.0, 259.0, 77.0, 71.0, 225.0]
[103.06537666494708, 149.12842845096367, 203.3680787213623, 136.85032696062112, 211.31317787251533, 146.86345388542676, 176.4090042157175, 105.62404967956331, 103.4343266315824, 161.80055250386815]
MSE: 3934.5689143991117


  y = column_or_1d(y, warn=True)
