In [None]:
# Standard library
import os

# Core libraries
import numpy as np
import pandas as pd

# Suppress warnings. Comment out if required.
import warnings
warnings.filterwarnings("ignore")

# ML libraries
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

# Python lightweight pipelining
from joblib import dump, load


## Functions

In [None]:
def evaluation(pipeline, X, y):
    """Compute the ROC AUC of a fitted pipeline on the given data.

    Parameters
    ----------
    pipeline : object exposing ``predict_proba``
        Fitted model (or sklearn Pipeline) used to score ``X``.
    X : features passed straight to ``pipeline.predict_proba``.
    y : true labels passed to ``roc_auc_score``.

    Returns
    -------
    dict
        ``{'auc': <ROC AUC score>}``.
    """
    # Column 1 of predict_proba is used as the score for the positive class
    positive_scores = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    return {'auc': auc}

## Create dataset

In [None]:
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']

# Dedicated, seeded random generator: every random draw below goes through it,
# so Restart & Run All rebuilds exactly the same synthetic dataset.
rng = np.random.RandomState(42)

# Generate 10k samples with 4 numerical features and a binary target
# (make_classification's default n_classes=2)
# weights=[0.5] -> target variable y has approximately 50% 0 and 50% 1
X, y = make_classification(n_samples=10000,
                           n_features=4,
                           n_redundant=0,
                           random_state=42,
                           weights=[0.5])

# For each categorical column:
#   1. a random integer in [2, 10) is chosen (this is variable num_classes)
#   2. the entire categorical column is filled with random integers between 0 and (num_classes - 1)
#   3. reshape(-1, 1) turns the output into a single column (row count inferred by Numpy)
#   4. horizontally append the column to X
for _ in categorical_features:
    num_classes = rng.randint(2, 10)
    cat_col = rng.randint(num_classes, size=X.shape[0]).reshape(-1, 1)
    X = np.hstack((X, cat_col))

# To DataFrame
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

# Shift and scale each numerical column with a random mean in [10, 1000) and
# a random standard deviation in [1, 100), then truncate to int, to make the
# values look more realistic
for col in numerical_features:
    mean = rng.randint(10, 1000)
    std = rng.randint(1, 100)
    X[col] = (mean + std * X[col]).astype(int)

# Categories converted to string values to force pre-processing later
# (values are floats after np.hstack, so they render as e.g. 'str_3.0')
for col in categorical_features:
    X[col] = X[col].apply(lambda x: x if np.isnan(x) else f'str_{x}')

# Introduce NaNs into the dataset
# frac=0.7 keeps 70% of the rows; the rows that were not sampled become NaN
# when the sampled Series is aligned back on the index.
# The shared RandomState advances on every call, so each column gets its own
# set of missing rows (a fixed integer seed would blank the same rows everywhere).
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7, random_state=rng)

# Created final DataFrame
df = X.merge(y, left_index=True, right_index=True)

In [None]:
# Show the distinct values (including NaN) present in each categorical column
for col in categorical_features:
    print(f"{col}: {df[col].unique()}")

In [None]:
# Numerical columns feat_1 to feat_4 were rescaled as mean + std * x and truncated to int, where
#   mean is a random integer from 10 up to (but excluding) 1000
#   std is a random integer from 1 up to (but excluding) 100
#   x is the original value produced by make_classification
df.sample(n=5)

## Train-test split

In [None]:
# 90/10 split without shuffling, so the row order of df is preserved
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

# Features and label for the training portion
X_train = train_df[categorical_features + numerical_features]
y_train = train_df['label']

# Features and label for the held-out portion
X_test = test_df[categorical_features + numerical_features]
y_test = test_df['label']

## Pipeline using CatBoost model


### Categorical features

* Variable 'cat'
* SimpleImputer replaces all Nans with string value 'UNK'
* OrdinalEncoder encodes the categories as integers 0 .. N-1, where N is the number of distinct categories seen during training
  * It also handles new classes that were not in the original training set 
  * Assigns new classes as -1
  * Useful feature when model is used in production


### Numerical features

* Variable 'num'
* SimpleImputer replaces Nans with the mean of the column


### DataFrameMapper

* Groups together data transformations
* Here, we apply the transformations on the categorical 'cat' and numerical 'num' features
* Default output of the mapper is a Numpy array. Setting df_out=True changes the output to a DataFrame

In [None]:
# Categorical columns: fill NaNs with the constant 'UNK', then ordinal-encode.
# Categories not seen during fit are encoded as -1.
# A fresh imputer/encoder pair is created for every column.
cat = [
    (
        [name],
        [
            SimpleImputer(strategy='constant', fill_value='UNK'),
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ],
    )
    for name in categorical_features
]

# Numerical columns: SimpleImputer with its default settings
num = [([name], [SimpleImputer()]) for name in numerical_features]

# df_out=True makes the mapper return a DataFrame instead of a Numpy array
mapper = DataFrameMapper(num + cat, df_out=True)

clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    metric_period=100,
)

pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])

pipeline.fit(X_train, y_train)

In [None]:
# Run only the preprocessing step (already fitted as part of pipeline.fit)
# on the test features so the transformed values can be inspected
preprocessed_X_test = mapper.transform(X_test)

In [None]:
# Raw test features, i.e. the data BEFORE any transformation
# (numerical columns first, then categorical columns)
X_test.loc[:, numerical_features + categorical_features].head()

In [None]:
# Transformed test features, i.e. the data AFTER the mapper was applied
#   NaNs in each numerical column are imputed with the mean of the column
#   Values in each categorical column are ordinal encoded
preprocessed_X_test.loc[:, numerical_features + categorical_features].head()

In [None]:
# Neither joblib.dump nor DataFrame.to_csv creates missing directories, so
# make sure the output folder exists (no-op when it is already there)
os.makedirs('params', exist_ok=True)

# Use Python joblib.dump to persist the fitted pipeline object to file
dump(pipeline, 'params/pipeline.joblib')

# Also save the test dataset (features and label, index included) to file
test_df.to_csv('params/test_df.csv')

In [None]:
# Train AUC of the CatBoost pipeline.
# The author recorded 0.932 on one run; the exact value depends on the generated dataset.
evaluation(pipeline, X_train, y_train)

In [None]:
# Test AUC of the CatBoost pipeline.
# The author recorded 0.915 on one run; the exact value depends on the generated dataset.
evaluation(pipeline, X_test, y_test)

## Pipeline using Logistic Regression model

### Categorical features

* Variable 'cat'
* SimpleImputer replaces all Nans with string value 'UNK'
* Replace OrdinalEncoder with OneHotEncoder
  * One hot encoding works well with Logistic regression


### Numerical features

* Variable 'num'
* SimpleImputer replaces Nans with the mean of the column
* StandardScaler standardizes each feature to zero mean and unit variance


### DataFrameMapper

* Groups together data transformations
* Here, we apply the transformations on the categorical 'cat' and numerical 'num' features
* Default output of the mapper is a Numpy array. Setting df_out=True changes the output to a DataFrame

In [None]:
# Categorical columns: fill NaNs with the constant 'UNK', then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising at transform time, so this pipeline
# tolerates new categories just like the OrdinalEncoder-based one.
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]

# Numerical columns: impute NaNs (SimpleImputer defaults), then standardize
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]

# df_out=True makes the mapper return a DataFrame instead of a Numpy array
mapper = DataFrameMapper(num + cat, df_out=True)
clf = LogisticRegression()

# NOTE: this rebinds `mapper`, `clf` and `pipeline`; from here on these names
# refer to the Logistic Regression variant
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

pipeline.fit(X_train, y_train)

In [None]:
# Run only the preprocessing step (already fitted as part of pipeline.fit)
# on the test features so the transformed values can be inspected
preprocessed_X_test = mapper.transform(X_test)

In [None]:
# Raw test features, i.e. the data BEFORE any transformation
# (numerical columns first, then categorical columns)
X_test.loc[:, numerical_features + categorical_features].head()

In [None]:
# preprocessed_X_test shows the data AFTER transformations
#   NaNs in each numerical column are imputed with the mean of the column, then the column is passed through StandardScaler
#   Values in each categorical column are imputed with 'UNK' and then one-hot encoded
preprocessed_X_test.head()

In [None]:
# Train AUC of the Logistic Regression pipeline.
# The author recorded 0.878 on one run; the exact value depends on the generated dataset.
evaluation(pipeline, X_train, y_train)

In [None]:
# Test AUC of the Logistic Regression pipeline.
# The author recorded 0.878 on one run; the exact value depends on the generated dataset.
evaluation(pipeline, X_test, y_test)