<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Student-Model" data-toc-modified-id="Student-Model-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Student Model</a></span><ul class="toc-item"><li><span><a href="#Data-processing" data-toc-modified-id="Data-processing-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Data processing</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Model</a></span><ul class="toc-item"><li><span><a href="#Define-Model" data-toc-modified-id="Define-Model-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Define Model</a></span></li><li><span><a href="#Train" data-toc-modified-id="Train-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Train</a></span></li></ul></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Evaluation</a></span><ul class="toc-item"><li><span><a href="#ROC-AUC" data-toc-modified-id="ROC-AUC-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>ROC AUC</a></span></li><li><span><a href="#Compression-rate" data-toc-modified-id="Compression-rate-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Compression rate</a></span></li></ul></li></ul></li></ul></div>

# Student Model


Нужно обучить небольшую модель на [soft-таргетах](https://drive.google.com/file/d/1tBbPOUT-Ow9f3zTDApykGXYwt-KslYle/view?usp=sharing) модели учителя так, чтобы она не сильно уступала учителю в качестве.

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr.models.dcn import DCN

from collections import defaultdict

Please check the latest version manually on https://pypi.org/project/deepctr/#history


In [2]:
# Root directory of the Criteo data. The original location stays the default;
# set the CRITEO_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get('CRITEO_DATA_PATH', '/workspace/data/criteo')

# Training table read by the data-processing section.
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')
# Teacher soft targets; a relative path, resolved against the working directory.
SOFT_PATH = 'soft_targets_full.csv'

## Data processing

Данные на Train/Validation/Test нужно разбить как 80/10/10

In [3]:
# Column numbering: indices 1..13 form the dense group, 14..39 the sparse group.
dense_features_indices = list(range(1, 14))
sparse_features_indices = list(range(14, 40))

# Column names are the letter 'c' followed by the index.
dense_features = ['c' + str(idx) for idx in dense_features_indices]
sparse_features = ['c' + str(idx) for idx in sparse_features_indices]

len(dense_features_indices), len(sparse_features_indices)

(13, 26)

In [4]:
# Load the labelled rows, indexed by 'id'; a leading underscore in a column
# name is dropped so that the columns match the 'c<N>' feature names.
data = pd.read_csv(TRAIN_PATH, index_col='id')
data = data.rename(columns=lambda col: col[1:] if col[0] == '_' else col)
soft_data = pd.read_csv(SOFT_PATH, index_col='id')

# Missing values: dense columns become 0, sparse columns the string '-1'.
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')

# Attach the teacher's soft targets; both frames are indexed by 'id'.
data['soft_c0'] = soft_data

hard_target = 'c0'
soft_target = 'soft_c0'
targets = [hard_target, soft_target]

  mask |= (ar1 == a)


In [5]:
# Rescale every dense column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the complete frame, i.e. before the
# train/validation/test split performed later — confirm that this is intended.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [6]:
# Unshuffled 80/10/10 split: cut off the last 20 % of the rows, then halve
# that remainder into validation and test.
train, holdout = train_test_split(data, test_size=0.2, shuffle=False)
val, test = train_test_split(holdout, test_size=0.5, shuffle=False)
del holdout  # only needed as an intermediate

for part in (train, val, test):
    print(len(part))

2931944
366493
366494


In [7]:
# Vocabulary size assigned to every sparse column; used when the embedding
# columns of the models are declared.
sparse_features_dims = {
    'c14': 1445,
    'c15': 556,
    'c16': 1130758,
    'c17': 360209,
    'c18': 304,
    'c19': 21,
    'c20': 11845,
    'c21': 631,
    'c22': 3,
    'c23': 49223,
    'c24': 5194,
    'c25': 985420,
    'c26': 3157,
    'c27': 26,
    'c28': 11588,
    'c29': 715441,
    'c30': 10,
    'c31': 4681,
    'c32': 2029,
    'c33': 4,
    'c34': 870796,
    'c35': 17,
    'c36': 15,
    'c37': 87605,
    'c38': 84,
    'c39': 58187,
}

In [8]:
def gen_model_input(df):
    """Turn a data frame into the feature-name -> values mapping fed to the model.

    Dense columns are passed as numpy arrays, sparse columns as pandas Series.
    The mapping lists the dense features first, then the sparse ones.
    """
    model_input = {}
    for name in dense_features:
        model_input[name] = np.array(df[name])
    for name in sparse_features:
        model_input[name] = pd.Series(df[name])
    return model_input

# One input mapping per data part, built in train / validation / test order.
train_input, val_input, test_input = map(gen_model_input, (train, val, test))

## Model

Можно также использовать Pruning и/или Quantization.

### Define Model

In [9]:
# Mini-batch size passed as `batch_size` to every `fit` and `predict` call in this notebook.
BATCH = 1024

In [12]:
import tensorflow as tf

def hyb_loss(q, g):
    """Distillation loss that mixes soft-target and hard-target cross-entropy.

    Keras invokes a custom loss as ``loss(y_true, y_pred)``, therefore:

    Args:
        q: ground truth with two columns in the order of `targets`
            (column 0 = hard label `c0`, column 1 = teacher soft target `soft_c0`).
        g: model prediction; the `task='binary'` model emits a single
            probability column, so it must not be indexed past column 0.

    Returns:
        Element-wise ``0.9 * BCE(soft, g) + 0.1 * BCE(hard, g)``.
    """
    w = 0.9  # weight of the soft-target term

    # Labels arrive as a numpy array; bring them to the prediction's dtype.
    q = tf.cast(q, g.dtype)
    hard = tf.gather(q, [0], axis=1)
    soft = tf.gather(q, [1], axis=1)

    # backend.binary_crossentropy expects (target, output) in that order.
    l1 = tf.keras.backend.binary_crossentropy(soft, g, from_logits=False)
    l2 = tf.keras.backend.binary_crossentropy(hard, g, from_logits=False)
    return w * l1 + (1 - w) * l2

In [11]:
import tensorflow as tf

def make_dcn_model(mode='mse', max_voc_size=50000, max_emb_dim=100, dnn_hidden=(128, 128)):
    """Build a DCN student, train it on the training part and return it.

    Args:
        mode: 'mse' fits the soft target with mean squared error;
            'hyb' fits both targets with `hyb_loss`.
        max_voc_size: upper bound on the hashed vocabulary size of a sparse feature.
        max_emb_dim: upper bound on the embedding dimension of a sparse feature.
        dnn_hidden: hidden-layer sizes of the deep part.

    Returns:
        The fitted model, or None (after printing "No such mode") when
        `mode` is not recognised.
    """
    # Resolve the mode before anything else, so that a mistyped mode does not
    # pay for building the embedding tables only to throw the model away.
    if mode == "mse":
        loss = tf.keras.losses.MeanSquaredError()
        label_columns = soft_target
    elif mode == "hyb":
        loss = hyb_loss
        label_columns = targets
    else:
        print("No such mode")
        return None

    # Sparse features are hashed strings; vocabulary size and embedding
    # dimension (6 * vocab ** 0.25) are both capped by the arguments.
    sparse_columns = [SparseFeat(feat,
                                 vocabulary_size=min(vocab_size, max_voc_size),
                                 embedding_dim=min(int(6 * (vocab_size) ** (0.25)), max_emb_dim),
                                 use_hash=True, dtype='string')
                      for feat, vocab_size in sparse_features_dims.items()]
    dense_columns = [DenseFeat(feat, 1,) for feat in dense_features]
    fixlen_feature_columns = sparse_columns + dense_columns

    # The same column list feeds both the linear and the deep part.
    model = DCN(fixlen_feature_columns, fixlen_feature_columns, cross_num=2,
                dnn_hidden_units=dnn_hidden, l2_reg_linear=0, l2_reg_embedding=0,
                l2_reg_cross=0, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                dnn_use_bn=True, dnn_activation='relu', task='binary')

    model.compile("adam", loss, )
    model.fit(train_input, train[label_columns].values,
              batch_size=BATCH, use_multiprocessing=True,
              validation_data=(val_input, val[label_columns].values))
    return model

### Train

In [20]:
# MSE student with a narrow deep part: two hidden layers of 32 units.
mse_hid32_model = make_dcn_model(mode='mse', dnn_hidden=(32, 32))

Train on 2931944 samples, validate on 366493 samples


In [21]:
# MSE student with embeddings capped at 16 dimensions and vocabularies at 30000.
mse_emb16_model = make_dcn_model(mode='mse', max_voc_size=30000, max_emb_dim=16)

Train on 2931944 samples, validate on 366493 samples


In [22]:
# MSE student with embeddings capped at 32 dimensions.
mse_emb32_model = make_dcn_model(mode='mse', max_emb_dim=32)

Train on 2931944 samples, validate on 366493 samples


In [27]:
# MSE student with a narrow deep part and vocabularies capped at 10000.
mse_hid32_voc10k_model = make_dcn_model(mode='mse', max_voc_size=10000, dnn_hidden=(32, 32))

Train on 2931944 samples, validate on 366493 samples


In [None]:
# Hybrid-loss student with a narrow deep part: two hidden layers of 32 units.
hyb_hid32_model = make_dcn_model(mode='hyb', dnn_hidden=(32, 32))

In [None]:
# Hybrid-loss student with embeddings capped at 32 dimensions. The mode must
# be 'hyb' to match the variable name; with 'mse' this cell merely retrained
# an exact duplicate of `mse_emb32_model`.
hyb_emb32_model = make_dcn_model('hyb', max_emb_dim=32)

## Evaluation

Наша основная задача получить модель, которая 
* в терминах ROC AUC не намного хуже модели учителя, и в то же время 
* сильно меньше по размеру

### ROC AUC

Сравним ROC AUC модели ученика с показателем для учителя.

ROC AUC учителя: 0.802

In [28]:
# Teacher reference numbers used to normalise the student metrics.
TEACHER_AUC = 0.802   # ROC AUC of the teacher
TEACHER_SIZE = 168    # size of the teacher model, in MB

# Per-model metrics are collected here, keyed by model name.
results = dict()

In [29]:
# Predict on the test part, serialise the model and record ROC AUC together
# with the teacher-size / file-size ratio (file size converted to MB).
mse_hid32_ans = mse_hid32_model.predict(test_input, batch_size=BATCH)
tf.keras.models.save_model(mse_hid32_model, "tmp.h5", include_optimizer=False)
results['mse_hid32'] = dict(
    auc=roc_auc_score(test[hard_target].values, mse_hid32_ans),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [30]:
# Predict on the test part, serialise the model and record the rounded ROC AUC
# together with the teacher-size / file-size ratio (file size converted to MB).
mse_emb32_ans = mse_emb32_model.predict(test_input, batch_size=BATCH)
tf.keras.models.save_model(mse_emb32_model, "tmp.h5", include_optimizer=False)
results['mse_emb32'] = dict(
    auc=round(roc_auc_score(test[hard_target].values, mse_emb32_ans), 4),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [31]:
# Predict on the test part, store the weights and record the rounded ROC AUC
# together with the teacher-size / file-size ratio (file size converted to MB).
# Note that this cell saves weights only, via `save_weights`.
mse_emb16_ans = mse_emb16_model.predict(test_input, batch_size=BATCH)
mse_emb16_model.save_weights("tmp.h5")
results['mse_emb16'] = dict(
    auc=round(roc_auc_score(test[hard_target].values, mse_emb16_ans), 4),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [32]:
# Predict on the test part, store the weights and record the rounded ROC AUC
# together with the teacher-size / file-size ratio (file size converted to MB).
# Note that this cell saves weights only, via `save_weights`.
mse_hid32_voc10k_ans = mse_hid32_voc10k_model.predict(test_input, batch_size=BATCH)
mse_hid32_voc10k_model.save_weights("tmp.h5")
results['mse_hid32_voc10k'] = dict(
    auc=round(roc_auc_score(test[hard_target].values, mse_hid32_voc10k_ans), 4),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [33]:
# Replace every raw entry by its summary: AUC relative to the teacher plus the
# compression rate. A summarised entry no longer carries 'auc', so it is
# skipped; this keeps the cell safe to re-run (previously a second run raised
# KeyError: 'auc').
for key, entry in list(results.items()):
    if 'auc' not in entry:
        continue
    results[key] = {'auc_rate': round(entry['auc'] / TEACHER_AUC, 4),
                    'comprassion_rate': entry['comprassion_rate']}

### Compression rate

Пусть 
* $a$ - \# of the parameters in the original model $M$
* $a^{*}$ - \# of the parameters in compressed model $M^{*}$

тогда compression rate равен $$\alpha(M,M^{*}) = \frac{a}{a^{*}}$$

Можно также посчитать compression rate просто как отношение фактических размеров моделей.

Размер модели учителя - 168MB


In [34]:
# Show the collected metrics of the unpruned students.
results

{'mse_emb16': {'auc_rate': 0.9814, 'comprassion_rate': 8.9473},
 'mse_emb32': {'auc_rate': 0.9852, 'comprassion_rate': 2.9907},
 'mse_hid32': {'auc_rate': 0.9836, 'comprassion_rate': 1.0521},
 'mse_hid32_voc10k': {'auc_rate': 0.9778, 'comprassion_rate': 4.3605}}

## Pruning

[TensorFlow Model Optimization: pruning with Keras](https://www.tensorflow.org/model_optimization/guide/pruning/pruning_with_keras)

In [10]:
import tensorflow_model_optimization
from tensorflow_model_optimization.python.core.api.sparsity import keras as sparsity

# Last step of the pruning schedule: ten times the number of batches needed
# for one pass over the training part.
batches_per_pass = np.ceil(1.0 * len(train) / BATCH).astype(np.int32)
end_step = batches_per_pass * 10

def make_pruned_model(max_voc_size=50000, max_emb_dim=100, dnn_hidden=(128, 128), steps=3):
    """Build a DCN student and train it on the soft target with magnitude pruning.

    Args:
        max_voc_size: upper bound on the hashed vocabulary size of a sparse feature.
        max_emb_dim: upper bound on the embedding dimension of a sparse feature.
        dnn_hidden: hidden-layer sizes of the deep part.
        steps: number of wrap -> compile -> fit -> strip rounds to run.

    Returns:
        The model returned by `strip_pruning` after the last round.
    """
    # Sparse features are hashed strings; vocabulary size and embedding
    # dimension (6 * vocab ** 0.25) are both capped by the arguments.
    fixlen_feature_columns = [SparseFeat(feat, 
                                     vocabulary_size=min(vocab_size, max_voc_size),
                                     embedding_dim=min(int(6 * (vocab_size) ** (0.25)), max_emb_dim), 
                                     use_hash=True, dtype='string') 
                          for feat, vocab_size in sparse_features_dims.items()] + \
                        [DenseFeat(feat, 1,) for feat in dense_features]
    # The same column list feeds both the linear and the deep part.
    model = DCN(fixlen_feature_columns, fixlen_feature_columns, cross_num=2,
            dnn_hidden_units=dnn_hidden, l2_reg_linear=0, l2_reg_embedding=0,
            l2_reg_cross=0, l2_reg_dnn=0, init_std=0.0001, seed=1024, 
            dnn_use_bn=True, dnn_activation='relu', task='binary')
    # Compiled here and once more inside the loop, after the model has been wrapped.
    model.compile("adam", tf.keras.losses.MeanSquaredError(), )
    
    for i in range(steps):
        print("Step", i)
        # Sparsity schedule: from 50 % to 90 % between step 2000 and the
        # module-level `end_step`, updated every 100 steps.
        # NOTE(review): `end_step` is ten times the batches of one pass, while
        # `fit` below is called without `epochs` and the model is re-wrapped in
        # every round — verify that the schedule really reaches final_sparsity.
        pruning_params = {
              'pruning_schedule': sparsity.PolynomialDecay(initial_sparsity=0.50,
                                                   final_sparsity=0.90,
                                                   begin_step=2000,
                                                   end_step=end_step,
                                                   frequency=100)
        }
        # Wrap the current model for pruning, then recompile the wrapped model.
        model = sparsity.prune_low_magnitude(model, **pruning_params)
        model.compile("adam", tf.keras.losses.MeanSquaredError(), )
        # UpdatePruningStep is passed as a fit callback for the wrapped model.
        callbacks = [sparsity.UpdatePruningStep()]
        model.fit(train_input, train[soft_target].values,
                  batch_size=BATCH, use_multiprocessing=True,
                  validation_data = (val_input, val[soft_target].values),
                  callbacks = callbacks)
        
        # Remove the pruning wrappers again before the next round / the return.
        model = sparsity.strip_pruning(model)
    return model

In [13]:
# Pruned student: embeddings capped at 32 dimensions, default number of rounds.
mse_emb32_pruned_model = make_pruned_model(
    max_voc_size=30000,
    max_emb_dim=32,
    dnn_hidden=(128, 128),
)

Step 0
Instructions for updating:
Please use `layer.add_weight` method instead.
Train on 2931944 samples, validate on 366493 samples
Step 1
Train on 2931944 samples, validate on 366493 samples
Step 2
Train on 2931944 samples, validate on 366493 samples


In [14]:
# Pruned student: narrow deep part (32, 32), five pruning rounds.
mse_hid32_pruned_model = make_pruned_model(
    max_voc_size=30000,
    max_emb_dim=100,
    dnn_hidden=(32, 32),
    steps=5,
)

Step 0
Train on 2931944 samples, validate on 366493 samples
Step 1
Train on 2931944 samples, validate on 366493 samples
Step 2
Train on 2931944 samples, validate on 366493 samples
Step 3
Train on 2931944 samples, validate on 366493 samples
Step 4
Train on 2931944 samples, validate on 366493 samples


In [18]:
# Teacher reference numbers used to normalise the pruned students' metrics.
TEACHER_AUC = 0.802   # ROC AUC of the teacher
TEACHER_SIZE = 168    # size of the teacher model, in MB

# Start a fresh metrics collection for the pruned models.
results = dict()

In [15]:
# Predict on the test part, serialise the pruned model and record ROC AUC
# together with the teacher-size / file-size ratio (file size converted to MB).
mse_emb32_pruned_ans = mse_emb32_pruned_model.predict(test_input, batch_size=BATCH)
tf.keras.models.save_model(mse_emb32_pruned_model, "tmp.h5", include_optimizer=False)
results['mse_emb32_pruned'] = dict(
    auc=roc_auc_score(test[hard_target].values, mse_emb32_pruned_ans),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [16]:
# Predict on the test part, serialise the pruned model and record ROC AUC
# together with the teacher-size / file-size ratio (file size converted to MB).
mse_hid32_pruned_ans = mse_hid32_pruned_model.predict(test_input, batch_size=BATCH)
tf.keras.models.save_model(mse_hid32_pruned_model, "tmp.h5", include_optimizer=False)
results['mse_hid32_pruned'] = dict(
    auc=roc_auc_score(test[hard_target].values, mse_hid32_pruned_ans),
    comprassion_rate=round(TEACHER_SIZE / (os.path.getsize("tmp.h5") / float(2**20)), 4),
)

In [25]:
# Round each stored AUC to four digits and add its ratio to the teacher's AUC.
# The entries are updated in place, so nothing has to be written back.
for entry in results.values():
    entry['auc'] = round(entry['auc'], 4)
    entry['auc_rate'] = round(entry['auc'] / TEACHER_AUC, 4)

In [26]:
# Show the collected metrics of the pruned students.
results

{'mse_emb32_pruned': {'auc': 0.7908,
  'auc_rate': 0.986,
  'comprassion_rate': 59.6502},
 'mse_hid32_pruned': {'auc': 0.7893,
  'auc_rate': 0.9842,
  'comprassion_rate': 41.6854}}