# Custom Pipelines Walkthrough
Kyle Kulas
Tutorial: https://queirozf.com/entries/scikit-learn-pipelines-custom-pipelines-and-pandas-integration

## Custom Transformer example

In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer():
    def __init__(self, func):
        self.func = func
        
    def transform(self, input_df, **transform_params):
        return self.func(input_df)
    
    def fit(self, X, y=None, **fit_params):
        return self
    
# this function takes a dataframe as input and
# returns a modified version 
def process_dataframe(input_df):
    input_df["text"] = input_df["text"].map(lambda t: t.upper())
    return input_df

df = pd.DataFrame({
    'id': [1,2,3,4],
    'text': ['foo', 'bar', 'baz', 'quux']
    })

pipeline = Pipeline([
    ('uppercase', DataframeFunctionTransformer(process_dataframe))
])

pipeline.fit_transform(df)

Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


## Custom transformer example: To Dense

In [4]:
import scipy
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

data = scipy.sparse.csr_matrix([
    [1.,0.,0.,0.,0.,0.],
    [0.,1.,0.,0.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [0.,0.,0.,0.,1.,0.],
    [0.,0.,0.,1.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
])

target = np.array([1,1,1,0,0,0,1,1])

class ToDenseTransformer():
    
    def transform(self, x, y=None, **fit_params):
        return x.todense()
    
    def fit(self, x, y=None, **fit_params):
        return self
    
pipeline = Pipeline([
    ('to_dense', ToDenseTransformer()),
    ('pca', PCA()),
    ('clf', DecisionTreeClassifier())
])

pipeline.fit(data, target)
pipeline.predict(data)



array([1, 1, 1, 0, 0, 1, 1, 1])

## Custom Transformer example: Select Dataframe Columns

In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline

class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns
        
    def transform(self, x, **transform_params):
        cpy_df = x[self.columns].copy()
        return cpy_df
    
    def fit(self, x, y=None, **fit_params):
        return self
    
df = pd.DataFrame({
    'name': ['alice', 'bob', 'charlie', 'david', 'edward'],
    'age': [24, 32, np.nan, 38, 20]
})

pipe = Pipeline([
    ('selector', SelectColumnsTransformer(['name']))
])

pipe.fit_transform(df)


Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward


## ColumnTransformer example: Missing imputation

In [6]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

df = pd.DataFrame({
    'name':['alice', 'bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

transformer_step = ColumnTransformer([
    ('impute_mean', SimpleImputer(strategy='mean'), ['age'])
     ], remainder='passthrough')
    
pipe = Pipeline([
    ('transformer', transformer_step)
])

pipe.fit(df)

pd.DataFrame(
    data = pipe.transform(df),
    columns=['age', 'name']
)[['name','age']]

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,28.5
3,david,38.0
4,edward,20.0


## FunctionTransformer with Parameters

In [8]:
import pandas as pd

from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

# dummy dataframe
df = pd.DataFrame({
    'text':[
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
        'Sed accumsan congue enim non pretium.',
        'In hac habitasse platea dictumst.',
        'Sed tincidunt ipsum nec urna vulputate luctus.'
    ],
    'target':[0, 1, 0, 1]
})

# function takes a dataframe row and a stemmer
def stem_str(input_series, stemmer):

    def stem(input_str):
        return " ".join([stemmer.stem(t) for t in input_str.split(" ")]).strip()

    return input_series.apply(stem)

pipeline = Pipeline([
    ('stemmer', FunctionTransformer(
        func=stem_str,                        # function to be used
        kw_args={'stemmer': RSLPStemmer()})), # parameters to the function
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

# now use it like you would any pipeline
pipeline.fit(df["text"],df["target"])

LookupError: 
**********************************************************************
  Resource [93mrslp[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('rslp')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mstemmers/rslp/step0.pt[0m

  Searched in:
    - '/home/kyle/nltk_data'
    - '/home/kyle/anaconda3/nltk_data'
    - '/home/kyle/anaconda3/share/nltk_data'
    - '/home/kyle/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


## Pipeline with Preprocessing and Classifier

In [12]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config

# this is the input dataframe
df = pd.DataFrame({
    'favorite_color':['blue','green','red','green','blue'],
    'age': [10,15,10,np.nan,10],
    'target':[1,0,1,0,1]
})

# define individual transformers in a pipeline
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])
numerical_preprocessing = Pipeline([('imputation', SimpleImputer())])

# define which transformer applies to which columns
preprocess = ColumnTransformer([
    ('categorical_preprocessing', categorical_preprocessing, ['favorite_color']),
    ('numerical_preprocessing', numerical_preprocessing, ['age'])
])

# create the final pipeline with preprocessing steps and 
# the final classifier step
pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', DecisionTreeClassifier())
])

# now fit the pipeline using the whole dataframe
df_features = df[['favorite_color','age']]
df_target = df['target']

# call fit on the dataframes
pipeline.fit(df_features, df_target)
set_config(display="diagram")
pipeline