In [68]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pickle

## DataSet

In [3]:
boston = load_boston()

In [4]:
type(boston)

sklearn.utils.Bunch

In [5]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [6]:
type(boston['data'])

numpy.ndarray

## Train/test split

In [96]:
clf = RandomForestRegressor() ## model

In [10]:
clf.fit(boston['data'],boston['target'])



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [12]:
clf.score(boston['data'],boston['target'])

0.9783430072525737

In [14]:
clf.score?

In [15]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_required_parameters',
 '_set_oob_score',
 '_validate_X_predict',
 '_validate_estimator',
 '_validate_y_class_weight',
 'apply',
 'base_estimator',
 'base_estimator_',
 'bootstrap',
 'class_weight',
 'criterion',
 'decision_path',
 'estimator_params',
 'estimators_',
 'feature_importances_',
 'fit',
 'get_params',
 'max_depth',
 'max_features',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'min_impurity_split',
 'min_samp

In [18]:
clf.n_features_

13

In [19]:
row = boston['data'][17]

In [20]:
row.reshape(-1, 13)

array([[  0.7842,   0.    ,   8.14  ,   0.    ,   0.538 ,   5.99  ,
         81.7   ,   4.2579,   4.    , 307.    ,  21.    , 386.75  ,
         14.67  ]])

In [21]:
clf.predict(row.reshape(-1, 13))

array([17.97])

In [22]:
boston['target'][17]

17.5

In [26]:
xtrain, xtest,ytrain,ytest = train_test_split(boston['data'],boston['target'],test_size=0.3) 

In [30]:
clf = RandomForestRegressor()


In [31]:
clf.fit(xtrain, ytrain)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [32]:
clf.score(xtest, ytest)

0.9122636891583634

In [97]:
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])

In [98]:
clf = SVR()

In [99]:
clf.fit(xtrain, ytrain)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [100]:
clf.score(xtest, ytest)

-0.00037692349825180216

In [101]:
xs = preprocessing.scale(boston['data'])

## Preprocessing

In [102]:
df = pd.DataFrame(xs, columns=boston['feature_names'])

In [103]:
xtrain, xtest,ytrain,ytest = train_test_split(boston['data'],boston['target'],test_size=0.3) 

In [104]:
clf = SVR()

In [105]:
clf.fit(xtrain,ytrain)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [106]:
clf.score(xtest, ytest)

-0.012359888244140471

#### Dimentional Reduction

In [107]:
pca = PCA(n_components=5)

In [108]:
pca.fit(boston['data'])

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

##  PipeLines

In [109]:
pipe =Pipeline([
    ('scale',StandardScaler()),
    ('pca',PCA(n_components=5)),
    ('svr',SVR())
])

In [110]:
pipe.fit(xtrain, ytrain)



Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svr',
                 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
                     gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                     shrinking=True, tol=0.001, verbose=False))],
         verbose=False)

In [111]:
pipe.score(xtest, ytest)

0.503215994689941

In [112]:
pipe.steps

[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('pca',
  PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)),
 ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
      gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
      tol=0.001, verbose=False))]

In [113]:
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('pca',
   PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
       svd_solver='auto', tol=0.0, whiten=False)),
  ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
       gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
       tol=0.001, verbose=False))],
 'verbose': False,
 'scale': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pca': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False),
 'svr': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
     gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
     tol=0.001, verbose=False),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': 5,
 'pca__random_state': None,
 'pca__svd_sol

In [114]:
pipe.set_params(svr__C=0.9)

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svr',
                 SVR(C=0.9, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
                     gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                     shrinking=True, tol=0.001, verbose=False))],
         verbose=False)

## save and load

In [115]:
with open('model.pickle','wb') as out:
          pickle.dump(pipe, out)

In [116]:
with open('model.pickle', 'rb') as fp:
    pipe2 = pickle.load(fp)