# Polynomial features using [PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) class

In [1]:
from sklearn import datasets

housing_data, housing_target = datasets.load_boston(return_X_y=True)

In [2]:
housing_data.shape

(506, 13)

In [3]:
from sklearn.model_selection import train_test_split

housig_train, housig_test, housig_train_t, housig_test_t  =  \
train_test_split(housing_data, housing_target, test_size=0.1, random_state=1010)

## Let's train regular regression as a benchmark

In [4]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(normalize=True)

In [5]:
lr.fit(housig_train, housig_train_t)

LinearRegression(normalize=True)

In [6]:
from sklearn.metrics import mean_squared_error

print("Mean squared error of a linear moderl: %.2f" % 
      mean_squared_error(housig_test_t, lr.predict(housig_test)))
score = lr.score(housig_test, housig_test_t) #r2_score
print("Linear Regression variance score: %.2f" % score)

Mean squared error of a linear moderl: 27.38
Linear Regression variance score: 0.78


## Let's generate polynomial features

In [21]:
from sklearn.preprocessing import PolynomialFeatures

pt = PolynomialFeatures(2, )#interaction_only=True)

In [22]:
housig_train_poly = pt.fit_transform(housig_train)
housig_train_poly.shape

(455, 105)

In [23]:
housig_test_poly = pt.fit_transform(housig_test)

In [24]:
lr_poly = LinearRegression(normalize=True)

In [25]:
lr_poly.fit(housig_train_poly, housig_train_t)

LinearRegression(normalize=True)

In [26]:
print("Mean squared error of a linear moderl using polynomial features: %.2f" % 
      mean_squared_error(housig_test_t, lr_poly.predict(housig_test_poly)))
score = lr_poly.score(housig_test_poly, housig_test_t) #r2_score
print("Linear Regression variance score using polynomial features: %.2f" % score)

Mean squared error of a linear moderl using polynomial features: 16.37
Linear Regression variance score using polynomial features: 0.87


In [27]:
print(lr_poly.coef_)

[ 1.16059880e-09 -4.74148732e-01 -3.15553193e-01 -5.37326030e+00
  4.25841986e+01  3.53059486e+01  1.03510808e+01  9.50292318e-01
 -7.16138496e+00  2.64220906e+00  4.06147473e-03  2.71772779e-01
  2.32889098e-03 -1.20249433e-01  3.10831955e-04  1.77046336e-01
  3.27921405e-01  2.17683478e+00 -1.91246029e+00  2.68869862e-01
 -6.53969582e-03 -1.15723006e-01  2.70113149e-01 -2.50143894e-02
  2.21354974e-01 -6.72914821e-04  3.29093386e-02 -1.01792014e-03
 -7.94703224e-03 -9.47848164e-02 -9.50066355e-01  3.30083299e-02
 -3.56753029e-04 -5.37367883e-03 -9.61885569e-04  6.38045674e-04
 -5.59541953e-03  1.50663627e-03 -1.89640538e-03  4.01106789e-02
 -6.91141135e-02  5.32165763e-01  2.72780977e-01  3.83194662e-03
  1.40056422e-01 -9.34806439e-03  1.89832052e-04 -8.73073846e-03
  5.35438997e-03 -1.11924864e-02  4.25841986e+01 -5.66072903e+01
 -5.87775113e+00  2.49522723e-03  6.13822365e-01 -5.88150687e-01
  4.92951161e-02 -1.66752139e+00  5.54928613e-03 -2.25489007e-01
 -9.94127154e-01  1.56571

In [None]:
# ToDo: plot how quality of solution (mean and score) change 
# with change in polynomial features. Plot mean and score for 5 different values of 
# polynominal degree

### Feature ranking with recursive feature elimination with [RFE](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html).

In [36]:
from sklearn.feature_selection import RFE

sel_ = RFE(estimator=LinearRegression(normalize=False), n_features_to_select=45)
sel_.fit(housig_train_poly, housig_train_t)

print(sel_.support_)
print(sel_.ranking_)

[False  True  True  True  True  True  True  True  True  True False  True
 False  True False  True  True  True  True  True False False False False
  True False False False False  True  True False False False False False
 False False False False  True  True False False  True  True False False
 False False  True  True  True False  True  True  True  True False  True
  True  True False  True  True False  True False  True  True False  True
  True False  True False  True False False False False False False False
  True  True False False False  True False False False False False False
 False False False False False False False False False]
[61  1  1  1  1  1  1  1  1  1 17  1 30  1 47  1  1  1  1  1 35  7 10 11
  1 45  6 46 37  1  1  8 41 36 54 49 38 44 32 12  1  1  4 50  1  1 51 26
 39 14  1  1  1 28  1  1  1  1 16  1  1  1 15  1  1  3  1 23  1  1 27  1
  1 21  1 31  1 56 52 19 53 34 43 29  1  1 48 18 24  1  2 25  9 40  5 58
 33 57 59 20 42 22 60 55 13]


In [37]:
print("Mean squared error of a linear moderl using _selected_ polynomial features: %.2f" % 
      mean_squared_error(housig_test_t, sel_.predict(housig_test_poly)))
score = sel_.score(housig_test_poly, housig_test_t) #r2_score
print("Linear Regression variance score using _selected_ polynomial features: %.2f" % score)

Mean squared error of a linear moderl using _selected_ polynomial features: 12.52
Linear Regression variance score using _selected_ polynomial features: 0.90


In [None]:
# ToDo: plot how quality of solution (mean and score) change 
# with change in number of polynomial features selected using recursive feature elimination.
# Generate a polynominal features with the degree of 3 and check quality of solution 
# (by polotting for mean and r2_score) for [5,10,15,20,25,30,35,40,45,50,55, 60] features

### Learning about importance of a feature with [Lasso regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) model. A Model with [regularization](https://en.wikipedia.org/wiki/Regularization_(mathematics)).

In [38]:
from sklearn.linear_model import Lasso

lasso_r = Lasso(alpha=0.5, max_iter=5000,)
lasso_r.fit(housig_train_poly, housig_train_t)

print("Mean squared error of a linear moderl: %.2f" % 
      mean_squared_error(housig_test_t, lr_poly.predict(housig_test_poly)))
score = lasso_r.score(housig_test_poly, housig_test_t) #r2_score
print("Lasso regression variance score: %.2f" % score)

Mean squared error of a linear moderl: 16.37
Lasso regression variance score: 0.88


  model = cd_fast.enet_coordinate_descent(


In [39]:
print(lasso_r.coef_)

[ 0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  3.88615946e-02  0.00000000e+00  2.06229009e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  4.43725430e-02
 -3.40012839e-03  0.00000000e+00 -3.72035278e-02  7.40859512e-04
  0.00000000e+00 -3.98293299e-05  7.03834244e-03 -1.06448097e-04
 -1.88043686e-03 -0.00000000e+00 -0.00000000e+00  5.68043044e-02
  1.09472622e-04 -6.37642978e-03 -1.99901470e-03  5.49338023e-04
 -3.50484965e-03 -9.96461120e-04 -4.01706213e-03  2.11751843e-02
  0.00000000e+00  0.00000000e+00  0.00000000e+00  6.93305396e-03
  0.00000000e+00  2.07639825e-02  4.24930501e-04 -6.01013889e-02
  4.41042958e-04 -2.44742128e-02 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  4.64601190e-03  0.00000000e+00 -2.45021499e-04 -0.00000000e+00
  0.00000000e+00  0.00000

## Student task

In [None]:
# Check if applying polynomial features and RFC can inprove quality of solution for 
# regression for datasets.load_diabetes()