In [1]:
# import libraries
import sklearn
# load dataset
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
# estimator
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import k_means
# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
# data manipulation
import pandas as pd
import numpy as np
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load data
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on scikit-learn < 1.2.

from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

# load the dataset once; the returned dict-like object is reused by the cells below
boston_data = load_boston()
# display the available keys next to the loaded content; reusing `boston_data`
# avoids loading the whole dataset a second time just to show it
boston_data.keys(), boston_data

(dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module']),
 {'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
          4.9800e+00],
         [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
          9.1400e+00],
         [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
          4.0300e+00],
         ...,
         [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
          5.6400e+00],
         [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
          6.4800e+00],
         [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
          7.8800e+00]]),
  'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
         18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
         15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
         13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3,

In [3]:
# define feature and target
# X: feature matrix, y: label / target vector, both taken from the loaded dataset
X, y = boston_data['data'], boston_data['target']

In [4]:
# print description
# the dataset ships its own documentation under the 'DESCR' key
dataset_description = boston_data['DESCR']
print(dataset_description)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names
boston = pd.DataFrame(
    data=boston_data['data'],
    columns=boston_data['feature_names'],
)
# preview the first five rows
boston.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
# summary statistics, transposed so that each feature is one row
boston.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677083,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [7]:
# estimator (fit & predict): transformer, regressor, classifier
# feature scaling:
#   1) min-max scaler:  (x - min) / (max - min)
#   2) standard scaler: (x - mean) / std
#   3) robust scaler:   (x - median) / IQR
#   4) normalizer:      rescales each sample (row) by its L1 / L2 / max norm

# transformer
from sklearn.preprocessing import StandardScaler
# create the scaler, learn the per-feature statistics of X and scale X in one step
scaler = StandardScaler()
Xt = scaler.fit_transform(X)
# one row per feature: mean and variance before and after scaling
stats = np.column_stack((X.mean(axis=0), X.var(axis=0), Xt.mean(axis=0), Xt.var(axis=0)))
feature_names = boston_data['feature_names']
columns = ['unscaled mean','unscaled variance','scaled mean','scaled variance']

# after scaling every feature should have mean ~0 and variance 1
df = pd.DataFrame(stats, index=feature_names, columns=columns)
df

Unnamed: 0,unscaled mean,unscaled variance,scaled mean,scaled variance
CRIM,3.613524,73.84036,-8.787437000000001e-17,1.0
ZN,11.363636,542.86184,-6.343191e-16,1.0
INDUS,11.136779,46.97143,-2.682911e-15,1.0
CHAS,0.06917,0.064385,4.701992e-16,1.0
NOX,0.554695,0.013401,2.490322e-15,1.0
RM,6.284634,0.492695,-1.14523e-14,1.0
AGE,68.574901,790.792473,-1.407855e-15,1.0
DIS,3.795043,4.425252,9.210902e-16,1.0
RAD,9.549407,75.666531,5.441409e-16,1.0
TAX,408.237154,28348.6236,-8.868619e-16,1.0


In [8]:
# a single feature (one column) with five observations
Xs = np.array([[-500.5],[-100.01],[0],[100.1],[900.9]])
# min-max scaler: maps the feature linearly onto feature_range
minmax_scale = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))
# learn min / max from Xs and transform it
scaled_X = minmax_scale.fit_transform(Xs)
# smallest value becomes 0, largest becomes 1
scaled_X

array([[0.        ],
       [0.28577851],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [9]:
# standardize scaler
scaler = sklearn.preprocessing.StandardScaler()
# learn mean / standard deviation from Xs, then apply the scaling to the same data
scaled_X = scaler.fit(Xs).transform(Xs)
# show the scaled feature together with its mean (~0) and standard deviation (1)
scaled_X, np.mean(scaled_X), np.std(scaled_X)

(array([[-1.26692972],
        [-0.39301578],
        [-0.17478279],
        [ 0.0436466 ],
        [ 1.79108169]]),
 4.4408920985006264e-17,
 1.0)

In [10]:
# robust scaler
# one feature (column vector) containing a large outlier (9000.9)
Xs = np.array([-1000.1, -200.2, 500.5, 9000.9]).reshape(-1, 1)
robust_scaler = sklearn.preprocessing.RobustScaler()
# fit on Xs and return the transformed feature
robust_scaler.fit_transform(Xs)

array([[-0.38015054],
       [-0.11578852],
       [ 0.11578852],
       [ 2.92511836]])

In [11]:
# l1 norm: every row is rescaled so that the absolute values in it sum to 1
Xs = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])
l1_normalizer = sklearn.preprocessing.Normalizer(norm='l1')
l1_normalizer.fit_transform(Xs)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [12]:
# l2 norm: every row is rescaled to unit Euclidean length
l2_normalizer = sklearn.preprocessing.Normalizer(norm='l2')
l2_normalizer.fit_transform(Xs)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [13]:
# max norm: every row is divided by its largest absolute value
max_normalizer = sklearn.preprocessing.Normalizer(norm='max')
max_normalizer.fit_transform(Xs)

array([[1.        , 1.        ],
       [0.32352941, 1.        ],
       [0.07425743, 1.        ],
       [0.04738372, 1.        ],
       [1.        , 0.30275229]])

In [14]:
# predictors

from sklearn.linear_model import LinearRegression

# build the linear model and train it on the full data set (fit returns the estimator)
model = LinearRegression().fit(X, y)
# predictions for the same samples the model was trained on
y_pred = model.predict(X)

# first ten predictions, then the in-sample coefficient of determination
print(y_pred[:10])
print(f'R^2: {model.score(X, y):0.4}')

[30.00384338 25.02556238 30.56759672 28.60703649 27.94352423 25.25628446
 23.00180827 19.53598843 11.52363685 18.92026211]
R^2: 0.7406


In [15]:
from sklearn.ensemble import GradientBoostingRegressor

# create the model; a fixed random_state makes the fitted ensemble (and therefore
# the numbers printed below) reproducible across kernel restarts
model = GradientBoostingRegressor(random_state=0)
# training
model.fit(X, y)
# predicting on the training samples
y_pred = model.predict(X)

# first ten predictions, then the in-sample coefficient of determination
print(y_pred[:10])
print(f'R^2: {model.score(X, y):0.4}')

[25.90772604 21.96320179 33.92712155 34.14528061 35.41267912 26.7925396
 21.48031031 20.87839556 16.95411564 18.45898255]
R^2: 0.9761


In [16]:
# pipelines (sequential)

from sklearn.pipeline import Pipeline
# NOTE: `pipe` is first created in the following cell, so it cannot be inspected
# here on a fresh kernel; look at `pipe.named_steps` once the pipeline exists.

In [None]:
# the pipeline is described by (name, estimator) tuples that run in order:
# standardize the features first, then fit the linear regression on the result
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])
# fit/train the whole chain, then predict the labels for the same samples
y_pred = pipe.fit(X, y).predict(X)


# first ten predictions, then the in-sample coefficient of determination
print(y_pred[:10])
print(f'R^2: {pipe.score(X, y):0.4}')

In [None]:
# feature union

# import
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_regression, SelectKBest
# branch 1: standardize the features, then reduce them to 4 principal components
pca_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('dim_red', PCA(n_components=4)),
])
# branch 2: keep the 2 best features according to the f_regression score;
# the union runs both branches on the same input and concatenates their outputs
union = FeatureUnion(transformer_list=[
    ('pca_pipe', pca_pipe),
    ('selector', SelectKBest(score_func=f_regression, k=2)),
])
# predictor: combined features feed a linear regression
pipe = Pipeline(steps=[
    ('union', union),
    ('regressor', LinearRegression()),
])
pipe.fit(X, y)
# print result
# NOTE(review): `union.transform` below relies on `pipe.fit` having fitted this same
# `union` object in place (no pipeline caching configured) - confirm if `memory` is ever set
print(f'Number of features in the original dataset:{X.shape[-1]}')
print(f'Number of features in the new dataset: {union.transform(X).shape[-1]}')
print(f'R-square: {pipe.score(X, y):0.4}')