In [1]:
# Core libraries
import numpy as np
import pandas as pd

# ML libraries
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper


In [None]:
X, y = make_classification(n_samples=10000, 
                           n_features=4, 
                           n_redundant=0, 
                           random_state=42, 
                           weights=[0.5])

X

In [2]:
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']

# Generate a dataset with 4 classes using 10k samples
# Target variable y has approximately 50% 0 and 50% 1
X, y = make_classification(n_samples=10000, 
                           n_features=4, 
                           n_redundant=0, 
                           random_state=42, 
                           weights=[0.5])

# Add categorical columns
for col in range(4):
    # Number of classes = randomly pick between 2 and 10
    num_classes = np.random.randint(2, 10)
    # Numpy reshape(-1, 1)
    #   number of rows = -1 (unknown, Numpy figures it out)
    #   number of columns = 1
    # cat_col is an array of 10000 rows by 1 column
    #   values are between 0 and (num_features-1)
    cat_col = np.random.randint(num_classes, size=X.shape[0]).reshape(-1,1)
    # Concatenate columns of X and cat_col
    X = np.hstack((X, cat_col))

# To DataFrame
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

# Scale regressors, modify categoricals
for col in numerical_features:
    mean = np.random.randint(10, 1000)
    std = np.random.randint(1, 100)
    X[col] = X[col].apply(lambda x: mean + std * x).astype(int)

for col in categorical_features:
    X[col] = X[col].apply(lambda x: f'str_{x}' if np.isnan(x)==False else x)

# Create Nans in dataset
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7)
    
df = X.merge(y,left_index=True, right_index=True)

In [28]:
# Numerical columns 1 to 4: mean (between 10 and 1000) + standard deviation (between 1 to 100) * x (float 0 to 1)
# Categorical columns 5 to 8: 
df.sample(100)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,label
3256,854.0,905.0,826.0,,str_1.0,str_4.0,str_5.0,,1
8445,872.0,,792.0,627.0,,str_2.0,str_0.0,str_5.0,0
9915,879.0,1010.0,885.0,,str_0.0,str_5.0,str_3.0,str_6.0,1
8907,874.0,882.0,915.0,642.0,,str_3.0,str_5.0,str_3.0,0
5661,869.0,929.0,678.0,475.0,str_1.0,str_2.0,str_2.0,str_4.0,1
...,...,...,...,...,...,...,...,...,...
8295,,816.0,905.0,357.0,str_1.0,,,str_3.0,0
1254,846.0,,845.0,,str_1.0,,str_4.0,str_0.0,1
9488,,779.0,,547.0,,str_6.0,str_4.0,,0
403,866.0,,,649.0,str_1.0,str_0.0,,str_6.0,0


In [4]:
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)
print(train_df.shape)
print(test_df.shape)

(9000, 9)
(1000, 9)


## Train-test split

In [5]:
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)
X_train, y_train = train_df[categorical_features + numerical_features], train_df['label']
X_test, y_test = test_df[categorical_features + numerical_features], test_df['label']

## Preprocessing and training

In [6]:
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)]) for c in categorical_features]
              
num = [([n], [SimpleImputer()]) for n in numerical_features]
mapper = DataFrameMapper(num + cat, df_out=True)
clf = CatBoostClassifier(iterations=1000,
                         learning_rate=0.01,
                         metric_period=100)

pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
])

pipeline.fit(X_train, y_train)

0:	learn: 0.6862724	total: 148ms	remaining: 2m 27s
100:	learn: 0.4302458	total: 573ms	remaining: 5.1s
200:	learn: 0.3920143	total: 1.02s	remaining: 4.06s
300:	learn: 0.3771545	total: 1.46s	remaining: 3.4s
400:	learn: 0.3692853	total: 1.89s	remaining: 2.83s
500:	learn: 0.3633671	total: 2.39s	remaining: 2.38s
600:	learn: 0.3580785	total: 2.86s	remaining: 1.9s
700:	learn: 0.3532000	total: 3.29s	remaining: 1.41s
800:	learn: 0.3482113	total: 3.74s	remaining: 929ms
900:	learn: 0.3433872	total: 4.15s	remaining: 456ms
999:	learn: 0.3380664	total: 4.58s	remaining: 0us
