In [1]:
# Standard library
from pathlib import Path

# Core libraries
import numpy as np
import pandas as pd

# ML libraries
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

# Python lightweight pipelining
from joblib import dump, load


## Functions

In [2]:
def evaluation(pipeline, X, y):
    """Compute evaluation metrics for a fitted classifier pipeline.

    Parameters
    ----------
    pipeline
        Fitted object exposing ``predict_proba(X)`` whose result can be
        indexed with ``[:, 1]``.
    X
        Feature rows handed to ``pipeline.predict_proba``.
    y
        True labels, passed as the first argument to ``roc_auc_score``.

    Returns
    -------
    dict
        A single-entry dict ``{'auc': <ROC AUC score>}``.
    """
    # Column 1 of predict_proba is used as the score for the ROC curve.
    positive_scores = pipeline.predict_proba(X)[:, 1]

    metrics = {}
    metrics['auc'] = roc_auc_score(y, positive_scores)
    return metrics

## Create dataset

In [3]:
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']

# Seed NumPy's global RNG so that every random draw in this cell (number of
# categories, category values, means/stds, NaN positions) is identical on each
# Restart & Run All. pandas' `Series.sample` also falls back to this global RNG
# when no `random_state` is given, so the NaN injection below is covered too.
SEED = 42
np.random.seed(SEED)

# Generate a binary-classification dataset: 10k samples with 4 numerical features.
# weights=[0.5] asks for roughly 50% of class 0 and 50% of class 1 in y.
X, y = make_classification(n_samples=10000,
                           n_features=len(numerical_features),
                           n_redundant=0,
                           random_state=SEED,
                           weights=[0.5])

# For each categorical column:
#   1. draw a random integer in [2, 10), i.e. 2..9 (this is num_classes)
#   2. fill the whole column with random integers in 0..(num_classes - 1)
#   3. reshape(-1, 1) turns the 1-D draw into a single column
#      (-1 lets NumPy infer the number of rows)
#   4. horizontally append the column to X
for _ in categorical_features:
    num_classes = np.random.randint(2, 10)
    cat_col = np.random.randint(num_classes, size=X.shape[0]).reshape(-1, 1)
    X = np.hstack((X, cat_col))

# To DataFrame: the first four columns are numerical, the appended four are categorical
columns = numerical_features + categorical_features
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

# Shift and scale each numerical feature by a random mean and standard deviation
# so the columns look more like real-world measurements, then truncate to integers.
for col in numerical_features:
    mean = np.random.randint(10, 1000)
    std = np.random.randint(1, 100)
    X[col] = (mean + std * X[col]).astype(int)

# Categories converted to string values to force pre-processing later.
# The values are floats at this point, hence labels such as 'str_2.0'.
for col in categorical_features:
    X[col] = X[col].apply(lambda x: f'str_{x}' if pd.notna(x) else x)

# Introduce NaNs into the dataset: sample(frac=0.7) keeps a random 70% of the
# rows and index alignment on assignment turns the other 30% into NaN.
# Each column draws its own sample, so the NaN positions differ per column.
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7)

# Final DataFrame: features joined with the label on the row index
df = X.merge(y, left_index=True, right_index=True)

In [4]:
# Show the distinct values (including NaN) present in each categorical column.
for cat_name in ('feat_5', 'feat_6', 'feat_7', 'feat_8'):
    distinct_values = df[cat_name].unique()
    print(f"{cat_name}: {distinct_values}")

feat_5: ['str_2.0' nan 'str_1.0' 'str_5.0' 'str_4.0' 'str_3.0' 'str_6.0' 'str_0.0']
feat_6: ['str_4.0' nan 'str_5.0' 'str_1.0' 'str_0.0' 'str_3.0' 'str_7.0' 'str_2.0'
 'str_6.0']
feat_7: [nan 'str_2.0' 'str_6.0' 'str_5.0' 'str_4.0' 'str_1.0' 'str_3.0' 'str_0.0']
feat_8: ['str_2.0' 'str_7.0' 'str_6.0' 'str_1.0' 'str_0.0' nan 'str_3.0' 'str_5.0'
 'str_4.0' 'str_8.0']


In [5]:
# Numerical columns feat_1..feat_4 were built as mean + std * x, truncated to int, where
#   mean is a random integer in [10, 1000), std is a random integer in [1, 100),
#   and x is the raw make_classification feature (an unbounded float that can be
#   negative, not a value between 0 and 1).
# A fixed random_state keeps the displayed rows the same on every re-run.
df.sample(5, random_state=42)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,label
101,,303.0,87.0,9.0,,str_2.0,str_6.0,str_1.0,1
3105,861.0,,,-3.0,str_2.0,str_1.0,,,0
4552,1007.0,,,7.0,str_4.0,str_6.0,str_1.0,str_3.0,0
6916,1000.0,356.0,,,str_1.0,,str_5.0,str_8.0,1
4971,,254.0,89.0,-19.0,,str_2.0,str_6.0,,0


## Train-test split

In [6]:
# shuffle=False keeps the original row order, so the final 10% of rows form the test set.
feature_columns = categorical_features + numerical_features
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)

X_train = train_df[feature_columns]
y_train = train_df['label']

X_test = test_df[feature_columns]
y_test = test_df['label']

## Preprocessing and training

In [7]:
# DataFrameMapper takes a list of ([column], [transformers applied in order]) entries.
# Every column gets its own freshly created transformer instances.

# Categorical columns: missing values become the literal 'UNK', then each string
# is mapped to an ordinal code (categories unseen at fit time are encoded as -1).
cat = []
for feature in categorical_features:
    fill_missing = SimpleImputer(strategy='constant', fill_value='UNK')
    to_codes = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    cat.append(([feature], [fill_missing, to_codes]))

# Numerical columns: missing values are filled by SimpleImputer with its default settings.
num = []
for feature in numerical_features:
    num.append(([feature], [SimpleImputer()]))

mapper = DataFrameMapper(num + cat, df_out=True)

clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    metric_period=100,
)

pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])

pipeline.fit(X_train, y_train)

0:	learn: 0.6866842	total: 152ms	remaining: 2m 32s
100:	learn: 0.4328268	total: 574ms	remaining: 5.11s
200:	learn: 0.3930714	total: 1.03s	remaining: 4.09s
300:	learn: 0.3796081	total: 1.47s	remaining: 3.42s
400:	learn: 0.3724673	total: 1.91s	remaining: 2.86s
500:	learn: 0.3668861	total: 2.35s	remaining: 2.34s
600:	learn: 0.3618578	total: 2.79s	remaining: 1.85s
700:	learn: 0.3576876	total: 3.21s	remaining: 1.37s
800:	learn: 0.3532125	total: 3.67s	remaining: 912ms
900:	learn: 0.3489426	total: 4.13s	remaining: 454ms
999:	learn: 0.3439124	total: 4.59s	remaining: 0us


In [8]:
# Apply only the fitted preprocessing step to the test features so that the
# transformed values can be inspected. The 'preprocess' step of the pipeline is
# the same `mapper` object that was fitted by `pipeline.fit`.
preprocessed_X_test = pipeline.named_steps['preprocess'].transform(X_test)

In [9]:
# First five raw test rows, numerical columns first, for comparison with the preprocessed view.
X_test.loc[:, numerical_features + categorical_features].head(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,946.0,200.0,,-4.0,str_6.0,str_7.0,,str_1.0
9001,,,98.0,,str_6.0,str_6.0,str_6.0,
9002,1028.0,,82.0,11.0,str_2.0,str_3.0,,str_1.0
9003,996.0,237.0,80.0,-6.0,str_3.0,,str_1.0,str_3.0
9004,895.0,377.0,83.0,,str_4.0,str_0.0,str_4.0,str_7.0


In [10]:
# The same five rows after the mapper has been applied, in the same column order.
preprocessed_X_test.loc[:, numerical_features + categorical_features].head(5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8
9000,946.0,200.0,85.322524,-4.0,7.0,8.0,0.0,2.0
9001,948.576575,289.930306,98.0,9.525075,7.0,7.0,7.0,0.0
9002,1028.0,289.930306,82.0,11.0,3.0,4.0,0.0,2.0
9003,996.0,237.0,80.0,-6.0,4.0,0.0,2.0,4.0
9004,895.0,377.0,83.0,9.525075,5.0,1.0,5.0,8.0


In [11]:
# Persist the fitted pipeline and the held-out rows under params/.
# Create the directory first: on a fresh checkout it may not exist, and both
# writes below would otherwise fail with FileNotFoundError.
Path('params').mkdir(parents=True, exist_ok=True)

dump(pipeline, 'params/pipeline.joblib')
test_df.to_csv('params/test_df.csv')

In [12]:
# ROC AUC of the current pipeline on the training split.
evaluation(pipeline=pipeline, X=X_train, y=y_train)

{'auc': 0.9294635100905286}

In [13]:
# ROC AUC of the current pipeline on the held-out test split.
evaluation(pipeline=pipeline, X=X_test, y=y_test)

{'auc': 0.9001306529440998}

## Alternative method

In [14]:
# Alternative preprocessing for a linear model.
# NOTE: this cell rebinds `cat`, `num`, `mapper`, `clf` and `pipeline`; from here
# on those names refer to the logistic-regression setup, not the CatBoost one.

# Categorical columns: fill missing values with the literal 'UNK', then one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit encode as
# an all-zero row instead of raising at transform time, in line with the
# unknown-value handling of the OrdinalEncoder used in the first pipeline.
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]

# Numerical columns: impute missing values, then standardise.
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]

mapper = DataFrameMapper(num + cat, df_out=True)
clf = LogisticRegression()

pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

pipeline.fit(X_train, y_train)



In [15]:
# Apply only the fitted preprocessing step of the current pipeline to the test
# features. The 'preprocess' step is the same `mapper` object fitted by `pipeline.fit`.
preprocessed_X_test = pipeline.named_steps['preprocess'].transform(X_test)



In [16]:
# First five raw test rows, transposed so that each feature is a row.
X_test.loc[:, numerical_features + categorical_features].head(5).transpose()

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,946.0,,1028.0,996.0,895.0
feat_2,200.0,,,237.0,377.0
feat_3,,98.0,82.0,80.0,83.0
feat_4,-4.0,,11.0,-6.0,
feat_5,str_6.0,str_6.0,str_2.0,str_3.0,str_4.0
feat_6,str_7.0,str_6.0,str_3.0,,str_0.0
feat_7,,str_6.0,,str_1.0,str_4.0
feat_8,str_1.0,,str_1.0,str_3.0,str_7.0


In [17]:
# The same five rows after preprocessing, transposed so that each output column is a row.
preprocessed_X_test.head(5).transpose()

Unnamed: 0,9000,9001,9002,9003,9004
feat_1,-0.05884285,-2.596337e-15,1.813842,1.083038,-1.223562
feat_2,-1.088125,0.0,0.0,-0.640438,1.053513
feat_3,-2.136224e-15,1.905722,-0.499453,-0.8001,-0.3491298
feat_4,-0.9585782,1.258978e-16,0.104534,-1.100326,1.258978e-16
feat_5_x0_UNK,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_0.0,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_1.0,0.0,0.0,0.0,0.0,0.0
feat_5_x0_str_2.0,0.0,0.0,1.0,0.0,0.0
feat_5_x0_str_3.0,0.0,0.0,0.0,1.0,0.0
feat_5_x0_str_4.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# ROC AUC of the current pipeline on the training split.
evaluation(pipeline=pipeline, X=X_train, y=y_train)



{'auc': 0.8813342689131578}

In [19]:
# ROC AUC of the current pipeline on the held-out test split.
evaluation(pipeline=pipeline, X=X_test, y=y_test)



{'auc': 0.86734317638949}