# Pipeline: Heterogenous data

This notebook implements a pipeline for heterogeneous data.


sources:
Sample pipeline for text feature extraction and evaluation: https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

Metrics and scoring: quantifying the quality of predictions: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values


Demonstration of multi-metric evaluation on cross_val_score and GridSearchCV: https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html


ColumnTransformer for heterogeneous data: 
https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data


Column Transformer with Heterogeneous Data Sources: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import munge_help
from time import time

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.preprocessing import MinMaxScaler


import utils

import xgboost as xgb

### Load Data

In [2]:
X_train = utils.load_obj(path=os.path.join('data_processed', 'X_train.pkl'))
y_train = utils.load_obj(path=os.path.join('data_processed', 'y_train.pkl'))

In [3]:
X_train.shape

(102492, 112)

In [4]:
#see how many columns are numbers
X_train.select_dtypes(np.number).shape

(102492, 111)

In [5]:
X_train.dtypes

description               object
attackComplexity_V3        int64
privilegesRequired_V3      int64
userInteraction_V3         int64
scope_V3                   int64
                          ...   
Type_Windows_x86-64      float64
Type_XML                 float64
Type_iOS                 float64
Type_macOS               float64
Type_watchOS             float64
Length: 112, dtype: object

In [6]:
#Control the balance of positive and negative weights, useful for unbalanced classes
#A typical value to consider:
# sum(negative instances) / sum(positive instances)
scale_pos_weight = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

scale_pos_weight

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'

88.82646801051709

In [7]:
#define preprocessor
preprocessor = ColumnTransformer([('tfidfvect',
                           TfidfVectorizer(ngram_range=(1,3)), 
                           'description')
                         ],
                         remainder=MinMaxScaler(),
                         n_jobs=-1
                        )

#define pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('clf', xgb.XGBClassifier(n_estimators=100,
                                                scale_pos_weight = scale_pos_weight,
                                                eta=0.9,
                                                num_boost_round=15,
                                               )
                      )
                          ])
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(n_jobs=-1, remainder=MinMaxScaler(),
                                   transformers=[('tfidfvect',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)),
                                                  'description')])),
                ('clf',
                 XGBClassifier(eta=0.9, num_boost_round=15,
                               scale_pos_weight=88.82646801051709))])

In [8]:
#params for grird search
#note the double __ to get to nested elements
parameters = {
    'preprocessor__tfidfvect__max_df': (0.8, 0.9),
    'preprocessor__tfidfvect__min_df': (0.1, 0.15),
    'preprocessor__tfidfvect__max_features': (200, 250),
    'clf__max_depth': (6, 12), #Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit
    'clf__subsample': (0.5, 0.9) #take part of train data to avoid overfitting

}

#instantiate grid search
grid_search = GridSearchCV(pipeline, 
                           parameters, 
                           n_jobs=-1, 
                           verbose=10, #lots of details
                           scoring=['roc_auc', 'f1'],
                           refit='roc_auc', 
                           return_train_score=True
                          )

# start the timer
t0 = time()

grid_search.fit(X_train, y_train)

#time to do it
print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 42.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 52.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 67.9min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 78.2min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 95.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 119.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 147.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 168.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 203.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 241.1min finished


done in 14650.303s


In [9]:
print("Best score: %0.3f" % grid_search.best_score_)
print('\n')
print(20*'#')
print('\n')
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.894


####################


Best parameters set:
	clf__max_depth: 6
	clf__subsample: 0.5
	preprocessor__tfidfvect__max_df: 0.8
	preprocessor__tfidfvect__max_features: 200
	preprocessor__tfidfvect__min_df: 0.1


In [10]:
utils.save_obj(obj = grid_search,
               path = os.path.join('artifacts',
                                   'grid_search_all_features_2020-11-29.pkl'))