# Pipelines, feature & text preprocessing
### The pipeline workflow
- Repeatable way to go from raw data to trained model
- Pipeline object takes sequential list of steps
 - Output of one step is input to next step
- Each step is a tuple with two elements
 - Name: string
 - Transform: obj implementing.fit() and .transform()
- Flexible: a step can itself be another pipeline!

### Instantiate simple pipeline with one step

In [1]:
import pandas as pd 
import numpy as np

sample_df = pd.read_csv('sample_df.csv', index_col=0)

In [2]:
sample_df.columns

Index(['numeric', 'text', 'with_missing', 'label'], dtype='object')

In [3]:
sample_df = sample_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [5]:
pl = Pipeline([
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
                                        sample_df[['numeric']],
                                        pd.get_dummies(sample_df['label']),
                                        random_state=2)

pl.fit(X_train, y_train)

accuracy = pl.score(X_test, y_test)

print('accuracy on numeric data, no nans: ', accuracy)

accuracy on numeric data, no nans:  0.652


In [7]:
from sklearn.preprocessing import Imputer

In [8]:
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing']],
                                                   pd.get_dummies(sample_df['label']),
                                                   random_state=2)
pl = Pipeline([
    ('imp', Imputer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [9]:
accuracy = pl.score(X_test, y_test)

print('accuracy on all numeric, incl nans: ', accuracy)

accuracy on all numeric, incl nans:  0.648


In [10]:
# Import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

# Import other necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Split and select numeric data only, no nans 
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=22)

# Instantiate Pipeline object: pl
pl = Pipeline([
        ('imp', Imputer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit the pipeline to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\naccuracy on all numeric, incl nans: ", accuracy)


accuracy on all numeric, incl nans:  0.62


### Text features and feature unions
- Preprocessing text features

In [11]:
sample_df['text'] = sample_df['text'].where((pd.notnull(sample_df['text'])), 'none')
#sample_df['text']

In [12]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Split out only the text data
X_train, X_test, y_train, y_test = train_test_split(sample_df['text'],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=456)

# Instantiate Pipeline object: pl
pl = Pipeline([
        ('vec', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - just text data: ", accuracy)


Accuracy on sample data - just text data:  0.808


In [13]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# Print head to check results
print('Text Data')
print(just_text_data.head())

Text Data
0       none
1        foo
2    foo bar
3       none
4    foo bar
Name: text, dtype: object


In [14]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=22)

# Create a FeatureUnion with nested pipeline: process_and_join_features
process_and_join_features = FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )

# Instantiate nested pipeline: pl
pl = Pipeline([
        ('union', process_and_join_features),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


# Fit pl to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - all data: ", accuracy)


Accuracy on sample data - all data:  0.932


### Choosing a classification model

In [15]:
LABELS = ['Function',
 'Use',
 'Sharing',
 'Reporting',
 'Student_Type',
 'Position_Type',
 'Object_Type',
 'Pre_K',
 'Operating_Status']

In [16]:
df = pd.read_csv('TrainingData.csv', index_col=0)

In [17]:
NON_LABELS = [c for c in df.columns if c not in LABELS]

In [18]:
NUMERIC_COLUMNS = ['FTE', 'Total']

In [19]:
len(NON_LABELS) - len(NUMERIC_COLUMNS)

14

In [21]:
from warnings import warn


def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Takes a matrix of binary labels `y` and returns
        the indices for a sample of size `size` if
        `size` > 1 or `size` * len(y) if size =< 1.
        The sample is guaranteed to have > `min_count` of
        each label.
    """
    try:
        if (np.unique(y).astype(int) != np.array([0, 1])).any():
            raise ValueError()
    except (TypeError, ValueError):
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = np.floor(y.shape[0] * size)

    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    rng = np.random.RandomState(seed if seed is not None else np.random.randint(1))

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    sample_idxs = np.array([], dtype=choices.dtype)

    # first, guarantee > min_count of each label
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])

    sample_idxs = np.unique(sample_idxs)

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Takes a dataframe `df` and returns a sample of size `size` where all
        classes in the binary matrix `labels` are represented at
        least `min_count` times.
    """
    idxs = multilabel_sample(labels, size=size, min_count=min_count, seed=seed)
    return df.loc[idxs]


def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Takes a features matrix `X` and a label matrix `Y` and
        returns (X_train, X_test, Y_train, Y_test) where all
        classes in Y are represented at least `min_count` times.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)
    train_set_idxs = np.setdiff1d(index, test_set_idxs)

    test_set_mask = index.isin(test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])

In [24]:
dummy_labels = pd.get_dummies(df[LABELS])
X_train, X_test, y_train, y_test = multilabel_train_test_split(
                                        df[NON_LABELS], dummy_labels,
                                        0.2)

In [25]:
# Define combine_text_columns()
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ converts all text in each row of data_frame to single vector """
    
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    
    # Replace nans with blanks
    text_data.fillna('', inplace=True)
    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [None]:
#combine_text_columns = combine_text_columns(df)

In [26]:
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

In [28]:
%%time
pl = Pipeline([
    ('union', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', Imputer())
        ])), 
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ])
    ),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

Wall time: 2 ms


In [29]:
%%time
pl.fit(X_train, y_train)

Wall time: 13min 10s


Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_features', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x0000000011C78D08>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [31]:
%%time
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)


Accuracy on budget dataset:  0.37986384360751985
Wall time: 4.39 s


In [33]:
%%time
from sklearn.ensemble import RandomForestClassifier

pl = Pipeline([
    ('union', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', Imputer())
        ])), 
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ])
    ),
    ('clf', RandomForestClassifier())
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)


Accuracy on budget dataset:  0.9057023296483667
Wall time: 5min 1s


In [34]:
%%time
from sklearn.ensemble import RandomForestClassifier

pl = Pipeline([
    ('union', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', Imputer())
        ])), 
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ])
    ),
    ('clf', RandomForestClassifier(n_estimators=15))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)


Accuracy on budget dataset:  0.9137842733120979
Wall time: 7min 28s


In [86]:
holdout = pd.read_csv('TestData.csv', index_col=0)
holdout.shape

  interactivity=interactivity, compiler=compiler, result=result)


(50064, 16)

In [74]:
# holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)
# holdout.info()

In [75]:
%%time
predictions = pl.predict_proba(holdout)

Wall time: 17.6 s


In [83]:
#predictions

In [87]:
columns=pd.get_dummies(df[LABELS],
                        prefix_sep='__',).columns
len(columns)

104

In [68]:
predictions_df = pd.DataFrame(list(map(np.ravel, predictions))).T
predictions_df.columns = columns

In [85]:
predictions_df.shape

(100128, 104)

In [78]:
# test_index = pd.read_csv('TestData.csv', index_col=0).index
# test_index

  interactivity=interactivity, compiler=compiler, result=result)


Int64Index([180042,  28872, 186915, 412396, 427740,  69847, 358824, 254148,
               296, 416755,
            ...
            356796, 130696, 287341, 345215, 113795, 169063, 433255, 232204,
            171685, 249087],
           dtype='int64', length=50064)

In [79]:
predictions_df.index = pd.read_csv('TestData.csv', index_col=0).index
predictions_df.head()

ValueError: Length mismatch: Expected axis has 100128 elements, new values have 50064 elements

In [65]:
# predictions_df = pd.DataFrame(predictions, columns=pd.get_dummies(df[LABELS],
#                                prefix_sep='__',).columns,
#                                index=holdout.index)

In [66]:
predictions_df.to_csv('predictions.csv')