Let's start with the import of packages we are going to need

In [1]:
import pickle
from pathlib import Path

import pandas as pd

from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the wine dataset and wrap the feature matrix in a DataFrame whose
# columns carry the feature names; the class labels are kept separately in y.

data = load_wine()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


During the model creation, we will work on the following tasks:

* Filter own columns for PCA
* Scaling
* PCA
* SelectKBest
* Random Forest Classifier

and put them all to one pipeline.

### Filter Own Columns
First, we will create our own class that keeps only the features we want in one branch of our pipeline. We don't want to run PCA on all features, only on a subset of them, so we write a small class that selects those columns from the original dataframe. We can put our own classes into pipelines as long as they have the following methods:

* .fit()
* .transform()
* .fit_transform()

In [3]:
# Custom transformer that can be placed in a pipeline like any other sklearn object.
class RawFeats:
    """Column selector: keeps only the configured columns of the input.

    Parameters
    ----------
    feats : list
        Column labels to keep; they are used as-is to index the input in
        ``transform`` (``X[feats]``).
    """

    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        """Nothing is learned from the data.

        Returns ``self`` (instead of ``None``) so that the usual
        ``fit(X).transform(X)`` chaining works, as it does for sklearn
        estimators.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns listed in ``self.feats``."""
        return X[self.feats]

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the selected columns of ``X``."""
        return self.fit(X, y).transform(X)


# Columns that are routed into the PCA branch.
feats = [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
]
# Selector instance configured with the column names to keep.
raw_feats = RawFeats(feats)

In [4]:
# Scaling and PCA
# n_components=2 is only a starting value; the grid search further down tries 1, 2 and 3.
sc = StandardScaler()
pca = PCA(n_components=2)

In [5]:
# SelectKBest
# k=4 is only a starting value; the grid search further down tries k = 1, 2 and 3.
selection = SelectKBest(k=4)

In [6]:
# Random Forest classifier.
# The seed is fixed so that re-running the notebook builds the same forests and
# therefore yields the same cross-validation scores and best parameters.
rf = RandomForestClassifier(random_state=42)

#### Combining Everything Into One Pipeline
As in the tutorial yesterday we will apply two different feature extraction techniques:

* PCA
* SelectKBest

and combine them with FeatureUnion. The small difference is that we will use only a subset of the features for PCA.

In [7]:
# PCA branch: keep the chosen columns, standardise them, then project with PCA.
PCA_pipeline = Pipeline(steps=[
    ("rawFeats", raw_feats),
    ("scaler", sc),
    ("pca", pca),
])

# SelectKBest branch: a single-step pipeline with no column filter in front of it.
kbest_pipeline = Pipeline(steps=[("kBest", selection)])

In [8]:
# Combine the outputs of both branches with FeatureUnion.
all_features = FeatureUnion(transformer_list=[
    ("pcaPipeline", PCA_pipeline),
    ("kBestPipeline", kbest_pipeline),
])

In [10]:
# Main pipeline: the combined features feed the random forest classifier.
main_pipeline = Pipeline(steps=[("features", all_features), ("rf", rf)])

In [11]:
# Tune the whole pipeline with a grid search.

# Each key is the path to a parameter through the nested steps:
# <step name>__<sub-step name>__<parameter>.
param_grid = {
    "features__pcaPipeline__pca__n_components": [1, 2, 3],
    "features__kBestPipeline__kBest__k": [1, 2, 3],
    "rf__n_estimators": [2, 5, 10],
    "rf__max_depth": [2, 4, 6],
}

# Build the search object around the main pipeline.
grid_search = GridSearchCV(
    estimator=main_pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=10,
    refit=True,
)

# Fit all candidates; as the last expression of the cell, the fitted search is displayed.
grid_search.fit(df, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('pcaPipeline',
                                                                        Pipeline(steps=[('rawFeats',
                                                                                         <__main__.RawFeats object at 0x7fbc10c97310>),
                                                                                        ('scaler',
                                                                                         StandardScaler()),
                                                                                        ('pca',
                                                                                         PCA(n_components=2))])),
                                                                       ('kBestPipeline',
                                                                        Pipeline(steps=[('kBest',
                    

In [12]:
# The search was fitted directly on the original DataFrame: column selection,
# scaling, PCA and SelectKBest all happen inside the pipeline.
# Show the best parameter combination that was found:

print(grid_search.best_params_)

{'features__kBestPipeline__kBest__k': 3, 'features__pcaPipeline__pca__n_components': 1, 'rf__max_depth': 2, 'rf__n_estimators': 5}


In [15]:
# Store the fitted grid search on disk with pickle.
# NOTE: the pipeline contains a RawFeats instance, a class defined in this
# notebook, so code that unpickles the file needs the same class definition.

model_path = Path("saved_models/model.p")
# The target folder may not exist yet (e.g. on a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager closes the file even if pickling fails.
with model_path.open("wb") as model_file:
    pickle.dump(grid_search, model_file)