# Custom Pipelines

## Custom Transformer: `t.upper()`

In [25]:
# import packages
import pandas as pd
from sklearn.pipeline import Pipeline

### Classes & Functions

In [26]:
class DataframeFunctionTransformer():
    """Pipeline step that applies a user-supplied callable to its input.

    The callable is stored at construction time and invoked on every
    ``transform`` call; ``fit`` learns nothing.
    """

    def __init__(self, func):
        # callable that transform() invokes with the input as its only argument
        self.func = func

    def fit(self, x, y=None, **fit_params):
        """Return this instance unchanged; every argument is ignored."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; extra keyword arguments are ignored."""
        transformed = self.func(input_df)
        return transformed

# this function takes a dataframe as input and returns a modified version thereof
def process_dataframe(input_df):
    """Return a copy of ``input_df`` whose ``'text'`` column is upper-cased.

    The caller's frame is left untouched, so re-running a cell that uses this
    function always starts from the same data.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a string column named ``'text'``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; only ``'text'``
        differs from the input.
    """
    # .assign builds a new frame instead of writing into the caller's one;
    # .str.upper() is the vectorised form of t.upper() and lets missing
    # values pass through instead of raising
    return input_df.assign(text=input_df['text'].str.upper())

### Example Case

In [27]:
# toy input: four rows with an id and a mixed-case text column
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "quux"],
    }
)

# the pipeline consists of one step that wraps process_dataframe
pipeline = Pipeline(
    steps=[('uppercase', DataframeFunctionTransformer(process_dataframe))]
)

# run fit and transform on the frame in one call and display the result
pipeline.fit_transform(df)

Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


## Custom Transformer: `df.todense()`

### Class `ToDenseTransformer`

In [28]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

class ToDenseTransformer():
    """Pipeline step that turns a sparse matrix into a dense ``numpy.ndarray``."""

    # define the operation
    def transform(self, x, y=None, **fit_params):
        """Return ``x`` as a dense array.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns a ``numpy.matrix``, which recent scikit-learn estimators
        refuse as input. Anything without a ``toarray`` method is treated as
        already dense and returned unchanged. ``y`` and extra keyword
        arguments are ignored.
        """
        if hasattr(x, "toarray"):
            return x.toarray()
        return x

    # return itself
    def fit(self, x, y=None, **fit_params):
        """Nothing is learned; return this instance."""
        return self

### Example Case

In [29]:
# densify first: PCA, as used here, is not given sparse input
pipeline = Pipeline(
    steps=[
        ('to_dense', ToDenseTransformer()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier()),
    ]
)

In [30]:
# Usage sketch only: xtrain, ytrain and xtest are not defined in this notebook.
# pipeline.fit(xtrain, ytrain)
# pipeline.predict(xtest)

## Custom Transformer: `df[column].copy()`

### Class `SelectColumnsTransformer`

In [31]:
class SelectColumnsTransformer():
    """Pipeline step that keeps only the requested columns of a DataFrame."""

    def __init__(self, columns=None):
        # column label(s) to keep; None (the default) means "keep every column"
        self.columns = columns

    def transform(self, x, **transform_params):
        """Return a copy of ``x`` restricted to ``self.columns``.

        When ``self.columns`` is ``None`` the whole frame is copied; indexing
        with ``x[None]`` would raise ``KeyError`` instead. Extra keyword
        arguments are ignored.
        """
        if self.columns is None:
            return x.copy()
        # copy so later steps cannot write into the caller's frame
        cp_df = x[self.columns].copy()
        return cp_df

    def fit(self, x, y=None, **fit_params):
        """Nothing is learned; return this instance."""
        return self

### Example Case

In [32]:
import numpy as np

# five people; one age is missing (NaN)
df = pd.DataFrame(
    {
        'name': ['alice', 'bob', 'charlie', 'david', 'edward'],
        'age': [24, 32, np.nan, 38, 20],
    }
)

In [33]:
# one-step pipeline: the selector keeps only the 'name' column
pipe = Pipeline(
    steps=[('selector', SelectColumnsTransformer(['name']))]
)

pipe.fit_transform(df)

Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward


#### Using with `ColumnTransformer`

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [35]:
# mean-impute 'age'; any other column reaching this step is passed through as-is
transformer_step = ColumnTransformer(
    transformers=[('impute_mean', SimpleImputer(strategy='mean'), ['age'])],
    remainder='passthrough',
)

# the selector hands only 'age' to the imputer step
pipe = Pipeline(
    steps=[
        ('selector', SelectColumnsTransformer(['age'])),
        ('imputer', transformer_step),
    ]
)

pipe.fit(df)
pipe.transform(df)

array([[24. ],
       [32. ],
       [28.5],
       [38. ],
       [20. ]])

## Custom Transformer: `FunctionTransformer`

### What is Stemming?

`Stemming` is the process of reducing the different morphological variations of a word to a common root form. The root is also called the stem, hence the name stemming. It makes it easier for search engines and other applications to match related forms of the same word.

Stemming is carried out by programs called stemming algorithms, or stemmers. Most of them work by applying suffix-stripping rules. One of the most common is the Porter stemmer.

Other applications include:
- systems used for retrieving information such as search engines
- domain analysis for determining domain vocabularies

In [36]:
# One-time setup if the RSLP stemmer data is not installed yet:
# import nltk
# nltk.download('rslp')
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

In [37]:
# four short sentences, each paired with a 0/1 label
df = pd.DataFrame(
    {
        'text': [
            'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
            'Sed accumsan congue enim non pretium.',
            'In hac habitasse platea dictumst.',
            'Sed tincidunt ipsum nec urna vulputate luctus.',
        ],
        'target': [0, 1, 0, 1],
    }
)

### Function `stem_str(input_series, stemmer)`

In [38]:
def stem_str(input_series, stemmer):
    """Stem every whitespace-separated token of each string in a Series.

    Parameters
    ----------
    input_series : pandas.Series
        Series of strings.
    stemmer : object
        Any object exposing ``stem(token) -> str``.

    Returns
    -------
    pandas.Series
        Same index as the input; each value is its stemmed tokens joined by
        single spaces.
    """

    def stem(input_str):
        # split() with no argument discards leading, trailing and repeated
        # whitespace, so the stemmer is never handed an empty token (which a
        # stemmer may not handle) and the output spacing is uniform
        return ' '.join(stemmer.stem(token) for token in input_str.split())

    return input_series.apply(stem)

# stem the raw text, vectorise it with TF-IDF, then classify
pipeline = Pipeline(
    steps=[
        (
            'stemmer',
            FunctionTransformer(func=stem_str, kw_args={'stemmer': RSLPStemmer()}),
        ),
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)

In [39]:
# fit every step on the text column, using 'target' as the label
pipeline.fit(X=df['text'], y=df['target'])

Pipeline(steps=[('stemmer',
                 FunctionTransformer(func=<function stem_str at 0x14daa5820>,
                                     kw_args={'stemmer': <nltk.stem.rslp.RSLPStemmer object at 0x14dae7100>})),
                ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

## Pipeline with Preprocessing and Classifier

In [40]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
# one categorical feature, one numeric feature with a missing value, and a 0/1 target
df = pd.DataFrame(
    {
        'favorite_color': ['blue', 'green', 'red', 'green', 'blue'],
        'age': [10, 15, 10, np.nan, 10],
        'target': [1, 0, 1, 0, 1],
    }
)

In [42]:
# one small sub-pipeline per kind of column
categorical = Pipeline(steps=[('ohe', OneHotEncoder())])
numerical = Pipeline(steps=[('imputation', SimpleImputer())])

# route each column to the sub-pipeline that handles it
preprocess = ColumnTransformer(
    transformers=[
        ('categorical', categorical, ['favorite_color']),
        ('numerical', numerical, ['age']),
    ]
)

In [43]:
# preprocessing feeds straight into the classifier
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('clf', DecisionTreeClassifier()),
    ]
)

In [44]:
# split the frame into model inputs and the label column
df_features = df.loc[:, ['favorite_color', 'age']]
df_target = df.loc[:, 'target']

In [45]:
# fit the preprocessing and the classifier together
pipeline.fit(X=df_features, y=df_target)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder())]),
                                                  ['favorite_color']),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer())]),
                                                  ['age'])])),
                ('clf', DecisionTreeClassifier())])