In [1]:
# Standard library
import os

# Core libraries
import numpy as np
import pandas as pd

# ML libraries
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

# Python lightweight pipelining
from joblib import dump, load


## Functions

In [2]:
def evaluation(pipeline, X, y):
    """Score a fitted binary classifier with ROC AUC.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict_proba``
    X : features passed unchanged to ``pipeline.predict_proba``
    y : true labels compared against the predicted scores

    Returns
    -------
    dict
        ``{'auc': <ROC AUC computed from column 1 of predict_proba>}``
    """
    # Column 1 of predict_proba is used as the score for the positive class
    positive_scores = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    return {'auc': auc}

## Create dataset

In [3]:
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']

# One seeded generator drives every random draw in this cell, so that
# "Restart & Run All" rebuilds exactly the same dataset each time
SEED = 42
rng = np.random.RandomState(SEED)

# Generate a binary classification dataset (2 classes is the make_classification
# default) with 10k samples and 4 numerical features
# weights=[0.5] -> target variable y has approximately 50% 0 and 50% 1
X, y = make_classification(n_samples=10000,
                           n_features=4,
                           n_redundant=0,
                           random_state=SEED,
                           weights=[0.5])

# For each categorical column:
#   1. a random integer in [2, 10) is chosen (this is variable num_classes)
#   2. the entire categorical column is filled with random integers between 0 and (num_classes - 1)
#   3. reshape the output to be a single column with one value per row of X
#   4. horizontally append the column to X
for _ in range(len(categorical_features)):
    num_classes = rng.randint(2, 10)
    # Numpy reshape(-1, 1)
    #   number of rows = -1 (unknown, Numpy figures it out)
    #   number of columns = 1
    cat_col = rng.randint(num_classes, size=X.shape[0]).reshape(-1, 1)
    X = np.hstack((X, cat_col))

# To DataFrame
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

# Shift and rescale each numerical column (mean + std * x) so the values
# look more realistic, then truncate to integers
for col in numerical_features:
    mean = rng.randint(10, 1000)
    std = rng.randint(1, 100)
    X[col] = (mean + std * X[col]).astype(int)

# Categories converted to string values to force pre-processing later
# (values are floats at this point, hence labels such as 'str_6.0')
for col in categorical_features:
    X[col] = X[col].apply(lambda x: x if pd.isna(x) else f'str_{x}')

# Introduce NaNs into the dataset
# frac=0.7 keeps a random 70% of each column; index alignment on assignment
# turns the remaining 30% of rows into NaN
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7, random_state=rng)

# Create final DataFrame (features and label joined on the row index)
df = X.merge(y, left_index=True, right_index=True)

In [4]:
# Show the distinct values (including NaN) present in each categorical column
for col in categorical_features:
    print(f"{col}: {df[col].unique()}")

feat_5: [nan 'str_6.0' 'str_0.0' 'str_2.0' 'str_1.0' 'str_3.0' 'str_7.0' 'str_5.0'
 'str_4.0']
feat_6: ['str_8.0' nan 'str_2.0' 'str_3.0' 'str_5.0' 'str_4.0' 'str_6.0' 'str_0.0'
 'str_1.0' 'str_7.0']
feat_7: [nan 'str_2.0' 'str_0.0' 'str_1.0' 'str_3.0']
feat_8: ['str_1.0' 'str_2.0' nan 'str_3.0' 'str_0.0' 'str_4.0']


In [5]:
# Numerical columns feat_1..feat_4 were rescaled as mean + std * x and truncated to int,
# where mean is a random integer in [10, 1000), std is a random integer in [1, 100)
# and x is the raw make_classification value (not limited to the 0-1 range).
# NaNs were injected afterwards, which is why these columns display as floats.
df.sample(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,label
5320,446.0,,873.0,18.0,,str_8.0,str_2.0,str_2.0,0
2325,523.0,372.0,523.0,13.0,str_0.0,str_7.0,,str_3.0,1
4274,,,780.0,20.0,,str_7.0,str_2.0,str_3.0,0
9137,364.0,355.0,,5.0,,str_2.0,str_2.0,str_4.0,1
7729,279.0,,,,str_4.0,,str_0.0,str_1.0,1


## Train-test split

In [6]:
# Hold out the last 10% of rows as the test set (shuffle=False keeps row order)
feature_columns = categorical_features + numerical_features
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

X_train = train_df[feature_columns]
y_train = train_df['label']

X_test = test_df[feature_columns]
y_test = test_df['label']

## Pipeline using CatBoost model


### Categorical features

* Variable 'cat'
* SimpleImputer replaces all Nans with string value 'UNK'
* OrdinalEncoder encodes the categories as integers 0 .. N-1, where N is the number of classes (including the imputed 'UNK' category)
  * It also handles new classes that were not in the original training set 
  * Assigns new classes as -1
  * Useful feature when model is used in production


### Numerical features

* Variable 'num'
* SimpleImputer replaces Nans with the mean of the column


### DataFrameMapper

* Groups together data transformations
* Here, we apply the transformations on the categorical 'cat' and numerical 'num' features
* Default output of the mapper is a Numpy array. Setting df_out=True changes the output to a DataFrame

In [7]:
# Numerical columns: fill NaNs using SimpleImputer's default strategy (column mean)
num = [([name], [SimpleImputer()]) for name in numerical_features]

# Categorical columns: fill NaNs with the constant 'UNK', then ordinal-encode;
# categories not seen during fit are encoded as -1
cat = [
    (
        [name],
        [
            SimpleImputer(fill_value='UNK', strategy='constant'),
            OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
        ],
    )
    for name in categorical_features
]

# df_out=True makes the mapper return a DataFrame instead of a Numpy array
mapper = DataFrameMapper(num + cat, df_out=True)

clf = CatBoostClassifier(
    metric_period=100,
    learning_rate=0.01,
    iterations=1000,
)

pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])

pipeline.fit(X_train, y_train)

0:	learn: 0.6865305	total: 149ms	remaining: 2m 28s
100:	learn: 0.4378212	total: 576ms	remaining: 5.13s
200:	learn: 0.3958942	total: 1.01s	remaining: 4.01s
300:	learn: 0.3815716	total: 1.45s	remaining: 3.36s
400:	learn: 0.3734201	total: 1.89s	remaining: 2.83s
500:	learn: 0.3675477	total: 2.35s	remaining: 2.34s
600:	learn: 0.3628538	total: 2.79s	remaining: 1.85s
700:	learn: 0.3581536	total: 3.26s	remaining: 1.39s
800:	learn: 0.3533191	total: 3.71s	remaining: 922ms
900:	learn: 0.3486907	total: 4.15s	remaining: 456ms
999:	learn: 0.3432862	total: 4.63s	remaining: 0us


In [8]:
# Run only the preprocessing step (the mapper used inside the pipeline above)
# on the test features so the transformed values can be inspected
preprocessed_X_test = mapper.transform(X_test)

In [9]:
# X_test shows the data BEFORE transformations
#   (NaNs and the raw 'str_*' category labels are still present)
X_test[numerical_features + categorical_features].head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,410.0,305.0,702.0,7.0,,,str_0.0,str_1.0
9001,,348.0,897.0,23.0,str_4.0,str_6.0,str_1.0,
9002,,388.0,727.0,,,str_2.0,,
9003,,319.0,,,str_0.0,,str_3.0,str_2.0
9004,312.0,,,16.0,,str_5.0,str_3.0,str_4.0


In [10]:
# preprocessed_X_test shows the data AFTER transformations
#   NaNs in each numerical column are imputed with the mean of the column
#   Values in each categorical column are ordinal encoded
preprocessed_X_test[numerical_features + categorical_features].head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,410.0,305.0,702.0,7.0,0.0,0.0,1.0,2.0
9001,415.598053,348.0,897.0,23.0,5.0,7.0,2.0,0.0
9002,415.598053,388.0,727.0,11.525496,0.0,3.0,0.0,0.0
9003,415.598053,319.0,763.040317,11.525496,1.0,0.0,4.0,3.0
9004,312.0,339.534551,763.040317,16.0,0.0,6.0,4.0,5.0


In [11]:
# Make sure the output directory exists first: on a fresh checkout without a
# 'params' folder, both writes below would fail
os.makedirs('params', exist_ok=True)

# Use Python joblib.dump to persist the pipeline object to file
dump(pipeline, 'params/pipeline.joblib')

# Also save the test dataset to file
test_df.to_csv('params/test_df.csv')

In [12]:
# AUC of the CatBoost pipeline on the training split
# Train AUC score = 0.932 (value from the recorded run)
evaluation(pipeline, X_train, y_train)

{'auc': 0.9315789845357229}

In [13]:
# AUC of the CatBoost pipeline on the held-out test split
# Test AUC score = 0.915 (value from the recorded run)
evaluation(pipeline, X_test, y_test)

{'auc': 0.915239904455025}

## Pipeline using Logistic Regression model

### Categorical features

* Variable 'cat'
* SimpleImputer replaces all Nans with string value 'UNK'
* Replace OrdinalEncoder with OneHotEncoder
  * One hot encoding works well with Logistic regression


### Numerical features

* Variable 'num'
* SimpleImputer replaces Nans with the mean of the column
* StandardScaler standardizes each feature to zero mean and unit variance (it does not bound the values to the range 0 to 1)


### DataFrameMapper

* Groups together data transformations
* Here, we apply the transformations on the categorical 'cat' and numerical 'num' features
* Default output of the mapper is a Numpy array. Setting df_out=True changes the output to a DataFrame

In [14]:
# Categorical columns: fill NaNs with the constant 'UNK', then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising an error at transform/predict time
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]

# Numerical columns: fill NaNs with the column mean, then standardize
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]

# df_out=True makes the mapper return a DataFrame instead of a Numpy array
mapper = DataFrameMapper(num + cat, df_out=True)
clf = LogisticRegression()

# NOTE: this rebinds `mapper`, `clf` and `pipeline`; from here on they refer
# to the logistic-regression objects, not the CatBoost ones
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

pipeline.fit(X_train, y_train)



In [15]:
# Run only the preprocessing step (the mapper used inside the logistic-regression
# pipeline above) on the test features so the transformed values can be inspected
preprocessed_X_test = mapper.transform(X_test)



In [20]:
# X_test shows the data BEFORE transformations
#   (NaNs and the raw 'str_*' category labels are still present)
X_test[numerical_features + categorical_features].head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,410.0,305.0,702.0,7.0,,,str_0.0,str_1.0
9001,,348.0,897.0,23.0,str_4.0,str_6.0,str_1.0,
9002,,388.0,727.0,,,str_2.0,,
9003,,319.0,,,str_0.0,,str_3.0,str_2.0
9004,312.0,,,16.0,,str_5.0,str_3.0,str_4.0


In [21]:
# preprocessed_X_test shows the data AFTER transformations
#   NaNs in each numerical column are imputed with the mean of the column,
#   then the column is passed through StandardScaler
#   Values in each categorical column are one-hot encoded
preprocessed_X_test.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5_x0_UNK,feat_5_x0_str_0.0,feat_5_x0_str_1.0,feat_5_x0_str_2.0,feat_5_x0_str_3.0,feat_5_x0_str_4.0,...,feat_7_x0_str_0.0,feat_7_x0_str_1.0,feat_7_x0_str_2.0,feat_7_x0_str_3.0,feat_8_x0_UNK,feat_8_x0_str_0.0,feat_8_x0_str_1.0,feat_8_x0_str_2.0,feat_8_x0_str_3.0,feat_8_x0_str_4.0
9000,-0.0670078,-1.102861,-0.8401967,-0.900846,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9001,-6.804066e-16,0.2703442,1.843904,2.284116,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9002,-6.804066e-16,1.547745,-0.4960812,3.536019e-16,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9003,-6.804066e-16,-0.6557711,-1.564856e-15,3.536019e-16,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9004,-1.240052,-1.815295e-15,-1.564856e-15,0.8906954,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# AUC of the logistic-regression pipeline on the training split
evaluation(pipeline, X_train, y_train)



{'auc': 0.8762999552934541}

In [19]:
# AUC of the logistic-regression pipeline on the held-out test split
evaluation(pipeline, X_test, y_test)



{'auc': 0.8888247647445453}