# Pipelines, feature & text preprocessing
### The pipeline workflow
- Repeatable way to go from raw data to trained model
- Pipeline object takes sequential list of steps
 - Output of one step is input to next step
- Each step is a tuple with two elements
 - Name: string
 - Transform: obj implementing .fit() and .transform()
- Flexible: a step can itself be another pipeline!

### Instantiate simple pipeline with one step

In [None]:
def X

In [1]:
import pandas as pd 
import numpy as np

# Load the sample data, using the first CSV column as the row index.
sample_df = pd.read_csv('sample_df.csv', index_col=0)

In [2]:
# Display the column labels of the loaded frame.
sample_df.columns

Index(['numeric', 'text', 'with_missing', 'label'], dtype='object')

In [3]:
# Trim surrounding whitespace from every object-dtype column; columns of any
# other dtype are passed through unchanged.
sample_df = sample_df.apply(
    lambda col: col if col.dtype != "object" else col.str.strip()
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [5]:
# One-step pipeline: a one-vs-rest wrapper around logistic regression,
# registered under the step name 'clf'.
pl = Pipeline(steps=[('clf', OneVsRestClassifier(LogisticRegression()))])

In [6]:
from sklearn.model_selection import train_test_split

# Split the single 'numeric' feature against the one-hot encoded labels;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric']],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)

# fit() hands back the fitted pipeline, so training and scoring chain directly.
accuracy = pl.fit(X_train, y_train).score(X_test, y_test)
print('accuracy on numeric data, no nans: ', accuracy)

accuracy on numeric data, no nans:  0.652


In [7]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. `SimpleImputer` is its replacement and likewise defaults to
# mean imputation, so bind it to the same name the later cells use. The old
# import is kept as a fallback for installations that predate `sklearn.impute`.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

In [8]:
# Add the 'with_missing' column to the features, keeping the same seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing']],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)

# The 'imp' step runs first; its output is what the 'clf' step is trained on.
pl = Pipeline(steps=[
    ('imp', Imputer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Kept as the cell's last expression so the fitted pipeline is echoed as output.
pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [9]:
# Score the fitted pipeline on the held-out split.
accuracy = pl.score(X_test, y_test)

print('accuracy on all numeric, incl nans: ', accuracy)

accuracy on all numeric, incl nans:  0.648


In [10]:
# Import Pipeline
from sklearn.pipeline import Pipeline

# Import the imputer. `sklearn.preprocessing.Imputer` was deprecated in
# scikit-learn 0.20 and removed in 0.22; `SimpleImputer` is its replacement
# and likewise defaults to mean imputation. The old import is kept as a
# fallback so the cell still runs on installations without `sklearn.impute`.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Import other necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Split using both numeric columns, including the one with missing values
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=22)

# Instantiate Pipeline object: pl
# The imputer runs first so the classifier never sees missing values.
pl = Pipeline([
        ('imp', Imputer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit the pipeline to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\naccuracy on all numeric, incl nans: ", accuracy)


accuracy on all numeric, incl nans:  0.62


### Text features and feature unions
- Preprocessing text features

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Split the 'text' column against the one-hot encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['text'],
    pd.get_dummies(sample_df['label']),
    random_state=2,
)

# Two steps: the 'vec' CountVectorizer feeds the 'clf' one-vs-rest classifier.
pl = Pipeline(steps=[
    ('vec', CountVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])