In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import os, time
os.environ["CUDA_VISIBLE_DEVICES"]='3'
os.environ["CUDA_LAUNCH_BLOCKING"]='1'

import glob
import pandas as pd
import numpy as np
import cudf
import cupy
import gc
from datetime import datetime

from util import compute_rce_fast

DP = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))>1
DP

False

In [2]:
import cupy as cp
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import nvtabular as nvt
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.framework_utils.torch.models import Model
from nvtabular.framework_utils.torch.utils import process_epoch

import torch
from torch import nn
torch.__version__

'1.7.1+cu101'

## model

In [3]:
class ConcatenatedEmbeddings(torch.nn.Module):
    """Map multiple categorical variables to concatenated embeddings.
    Args:
        embedding_table_shapes: A dictionary mapping column names to
            (cardinality, embedding_size) tuples.
        dropout: A float.
    Inputs:
        x: An int64 Tensor with shape [batch_size, num_variables].
    Outputs:
        A Float Tensor with shape [batch_size, embedding_size_after_concat].
    """

    def __init__(self, embedding_table_shapes, dropout=0.0):
        super().__init__()
        self.embedding_layers = torch.nn.ModuleList(
            [
                torch.nn.Embedding(cat_size, emb_size) #, sparse=(cat_size > 1e5))
                for cat_size, emb_size in embedding_table_shapes.values()
            ]
        )
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x):
        if len(x.shape) == 1:
            x = x.unsqueeze(0)
            
        # first two cat columns (a_user and b_user) share same emb table            
        x = [self.embedding_layers[0](x[:,0])] + [layer(x[:, i+1]) for i, layer in enumerate(self.embedding_layers)] 
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return x

In [4]:
import torch.nn as nn

sigmoid = nn.Sigmoid()

class Swish(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * sigmoid(i)
        ctx.save_for_backward(i)
        return result
    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))

class Swish_Module(nn.Module):
    def forward(self, x):
        return Swish.apply(x)


from transformers import AutoTokenizer, AutoModel

bert_type = 'distilbert-base-multilingual-cased'

tokenizer = AutoTokenizer.from_pretrained(bert_type)

class Net(nn.Module):
    def __init__(self, num_features, layers, embedding_table_shapes, dropout=0.2, bert_type=None, gru_dim=128, emb_dim=768):
        super(Net, self).__init__()
        self.dropout = dropout
        self.initial_cat_layer = ConcatenatedEmbeddings(embedding_table_shapes, dropout=dropout)
        embedding_size = sum(emb_size for _, emb_size in embedding_table_shapes.values())
        layers = [layers] if type(layers) is int else layers
        layers = [num_features + gru_dim + embedding_size + 128 + 128] + layers
        self.use_bert = True
        self.embed = AutoModel.from_pretrained(bert_type).embeddings.word_embeddings  
        assert emb_dim == self.embed.embedding_dim
#             self.reduce_dim = nn.Linear(self.embed.embedding_dim, 256)
#             self.embed = nn.Embedding(119547, emb_dim)
#         layers[0] += gru_dim
        self.lstm = nn.GRU(emb_dim, gru_dim, batch_first=True, bidirectional=False)    
#             self.lstm = nn.Linear(self.embed.embedding_dim, gru_dim)

        self.fn_layers = nn.ModuleList(
                            nn.Sequential(
                                nn.Dropout(p=dropout),
                                nn.Linear(layers[i], layers[i+1]),
                                nn.BatchNorm1d(layers[i+1]),
                                Swish_Module(),
                            )  for i in range(len(layers) -1)
                         )        
        self.fn_last = nn.Linear(layers[-1],4)
        
    def forward(self, x_cat, x_cont, bert_tok):
        a_emb = self.initial_cat_layer.embedding_layers[0](x_cat[:,0])
        b_emb = self.initial_cat_layer.embedding_layers[0](x_cat[:,1])
        mf = a_emb * b_emb        
        
        x_cat = self.initial_cat_layer(x_cat)
        bert_tok = self.embed(bert_tok)#.mean(dim=1)
#             bert_tok = self.reduce_dim(bert_tok)
        lstm_out = self.lstm(bert_tok)[0][:,-1]
        output = torch.cat([x_cont, lstm_out, x_cat, mf],dim=1)
        for layer in self.fn_layers:
            output = layer(output)
        logit = self.fn_last(output)
        return logit

## scheduler

In [5]:
from warmup_scheduler import GradualWarmupScheduler
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import warnings; warnings.simplefilter('ignore')

def set_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    
class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        super(GradualWarmupSchedulerV2, self).__init__(optimizer, multiplier, total_epoch, after_scheduler)
    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]
        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs] 

## train loop

In [6]:
criterion = nn.BCEWithLogitsLoss()

def train_epoch(model, loader, optimizer, scaler):

    model.train()
    train_loss = []
    bar = tqdm(loader)
    for batch in bar:
        x_cat, x_cont, text_tok, targets = batch
        
        x_cat = x_cat.cuda()
        x_cont = x_cont.cuda()
        text_tok = text_tok.cuda()
        targets = targets.cuda()

        optimizer.zero_grad()
#         optimizer2.zero_grad()

        if use_torch_amp:
            with amp.autocast():
                logits = model(x_cat, x_cont, text_tok)
#                 logits = model(data)
            loss = criterion(logits, targets)       
            
            scaler.scale(loss).backward()

            # You can choose which optimizers receive explicit unscaling, if you
            # want to inspect or modify the gradients of the params they own.
            scaler.unscale_(optimizer)
#             scaler.unscale_(optimizer2)

            scaler.step(optimizer)
#             scaler.step(optimizer2)

            scaler.update()            
            
        elif use_amp:
            logits = model(x_cat, x_cont, text_tok)
#             logits = model(data)
            loss = criterion(logits, targets)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
        else:
            logits = model(x_cat, x_cont, text_tok)
#             logits = model(data)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()

        loss_np = loss.item()
        train_loss.append(loss_np)
        smooth_loss = sum(train_loss[-50:]) / min(len(train_loss), 50)
        bar.set_description('loss: %.4f, smth: %.4f' % (loss_np, smooth_loss))

    return np.mean(train_loss)

def valid_epoch(model, loader):

    model.eval()
    val_loss = []
    LOGITS = []
    TARGETS = []
    with torch.no_grad():
        for batch in tqdm(loader):
            x_cat, x_cont, text_tok, targets = batch

            x_cat = x_cat.cuda()
            x_cont = x_cont.cuda()
            text_tok = text_tok.cuda()
            targets = targets.cuda()
        
            logits = model(x_cat, x_cont, text_tok)
#             logits = model(data)
            loss = criterion(logits, targets)
            val_loss.append(loss.item())
            LOGITS.append(logits.cpu())
            TARGETS.append(targets.cpu())
            
    LOGITS = torch.cat(LOGITS)
    TARGETS = torch.cat(TARGETS)
    rce = {}
    for i in range(4):
        rce[label_names[i]] = compute_rce_fast(cp.asarray(LOGITS[:,i].sigmoid()),cp.asarray(TARGETS[:,i])).get()            
    mean_rce = np.mean([v for k,v in rce.items()])
            
    val_loss = np.mean(val_loss)

    return val_loss, rce, mean_rce

# NVT loader

In [7]:
label_names = sorted(['reply', 'retweet', 'retweet_comment', 'like'])
CAT_COLUMNS = ['a_user_id','b_user_id','language','media','tweet_type']
NUMERIC_COLUMNS = ['a_follower_count',
                     'a_following_count',
                     'a_is_verified',
                     'b_follower_count',
                     'b_following_count',
                     'b_is_verified',
                     'b_follows_a',
                     'tw_len_media',
                     'tw_len_photo',
                     'tw_len_video',
                     'tw_len_gif',
                     'tw_len_quest',
                     'tw_len_token',
                     'tw_count_capital_words',
                     'tw_count_excl_quest_marks',
                     'tw_count_special1',
                     'tw_count_hash',
                     'tw_last_quest',
                     'tw_len_retweet',
                     'tw_len_rt',
                     'tw_count_at',
                     'tw_count_words',
                     'tw_count_char',
                     'tw_rt_count_words',
                     'tw_rt_count_char',
                     'len_hashtags',
                     'len_links',
                     'len_domains',
                     'a_ff_rate',
                     'b_ff_rate',
                     'ab_fing_rate',
                     'ab_fer_rate',
                     'a_age',
                     'b_age',
                     'ab_age_dff',
                     'ab_age_rate']
len(NUMERIC_COLUMNS)

36

In [8]:
def read_norm_merge(path, split='train'):
    ddf = pd.read_parquet(path)

    ddf['quantile'] = 0
    quantiles = [92, 216, 442, 1064]
    for i, quant in enumerate(quantiles):
        ddf['quantile'] = (ddf['quantile']+(ddf['a_follower_count']>quant).astype('int8')).astype('int8')

    ddf['date'] = pd.to_datetime(ddf['timestamp'], unit='s')
    
    VALID_DOW = '2021-02-18'
    if split=='train':
        ddf = ddf[ddf['date']<pd.to_datetime(VALID_DOW)].reset_index(drop=True)
    elif split=='valid':
        ddf = ddf[ddf['date']>=pd.to_datetime(VALID_DOW)].reset_index(drop=True)    
    else:
        pass
    
    ddf['a_ff_rate'] = (ddf['a_following_count'] / ddf['a_follower_count']).astype('float32')
    ddf['b_ff_rate'] = (ddf['b_follower_count']  / ddf['b_following_count']).astype('float32')
    ddf['ab_fing_rate'] = (ddf['a_following_count'] / ddf['b_following_count']).astype('float32')
    ddf['ab_fer_rate'] = (ddf['a_follower_count'] / (1+ddf['b_follower_count'])).astype('float32')
    ddf['a_age'] = ddf['a_account_creation'].astype('int16') + 128
    ddf['b_age'] = ddf['b_account_creation'].astype('int16') + 128
    ddf['ab_age_dff'] = ddf['b_age'] - ddf['a_age']
    ddf['ab_age_rate'] = ddf['a_age']/(1+ddf['b_age'])

    ## Normalize
    for col in NUMERIC_COLUMNS:
        if col == 'tw_len_quest':
            ddf[col] = np.clip(ddf[col].values,0,None)
        if ddf[col].dtype == 'uint16':
            ddf[col].astype('int32')

        if col == 'ab_age_dff':
            ddf[col] = ddf[col] / 256.            
        elif 'int' in str(ddf[col].dtype) or 'float' in str(ddf[col].dtype):    
            ddf[col] = np.log1p(ddf[col])

        if ddf[col].dtype == 'float64':
            ddf[col] = ddf[col].astype('float32') 

    ## get categorical embedding id        
    for col in CAT_COLUMNS:
        ddf[col] = ddf[col].astype('float')
        if col in ['a_user_id','b_user_id']:
            mapping_col = 'a_user_id_b_user_id'
        else:
            mapping_col = col
        mapping = pd.read_parquet(f'/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr25/categories/unique.{mapping_col}.parquet').reset_index()
        mapping.columns = ['index',col]
        ddf = ddf.merge(mapping, how='left', on=col).drop(columns=[col]).rename(columns={'index':col})
        ddf[col] = ddf[col].fillna(0).astype('int')        

    label_names = ['reply', 'retweet', 'retweet_comment', 'like']
    DONT_USE = ['timestamp','a_account_creation','b_account_creation','engage_time',
                'fold', 'dt_dow', 'a_account_creation', 
                'b_account_creation', 'elapsed_time', 'links','domains','hashtags','id', 'date', 'is_train', 
                'tw_hash0', 'tw_hash1', 'tw_hash2', 'tw_http0', 'tw_uhash', 'tw_hash', 'tw_word0', 
                'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4', 'dt_minute', 'dt_second',
               'dt_day', 'group', 'text', 'tweet_id', 'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
                'tw_rt_user0', 'tw_original_http0', 'tw_tweet',]
    DONT_USE = [c for c in ddf.columns if c in DONT_USE]
    gc.collect(); gc.collect()
    
    return ddf.drop(columns=DONT_USE)

In [16]:
PATHS = sorted(glob.glob('/raid/recsys/train_proc3/*.parquet'))
len(PATHS)

232

In [10]:
# for col in NUMERIC_COLUMNS:
#     print(col)
#     plt.hist(train[col].values, bins=50)
#     plt.title(col)
# #     print(ddf[col].describe())
#     plt.show()

In [8]:
import torch
from torch.utils.data import Dataset,DataLoader

class AllDataset(Dataset):
    def __init__(self, df, max_len_txt, NUMERIC_COLUMNS, CAT_COLUMNS):
        self.X = df[NUMERIC_COLUMNS].values
        self.X_cat = df[CAT_COLUMNS].values
        self.labels = df[label_names].values
        self.text_tokens = df.text_tokens.values
        self.max_len_txt = max_len_txt
    def __len__(self):
        return self.labels.shape[0]
    def __getitem__(self, index):        
#         text = tokenizer.decode([int(token_id) for token_id in self.text_tokens[index][4:-4].split('\t')]) # [4:-4] is to remove [CLS] and [SEP]
#         inputs = tokenizer(text, truncation=True, padding='max_length', max_length=max_len_txt, return_tensors='pt')['input_ids'].squeeze()
        inputs = [int(token_id) for token_id in self.text_tokens[index].split('\t')][:self.max_len_txt]
        if len(inputs) < self.max_len_txt:
            inputs += [0]*(self.max_len_txt-len(inputs))
        return self.X_cat[index], self.X[index].astype(np.float32), torch.tensor(inputs), self.labels[index].astype(np.float32)

In [9]:
gru_dim=128
max_len_txt=48
emb_dim=768
lr = 1e-4 
ep = 46   
BATCH_SIZE = 1024
num_workers = 16
use_torch_amp = True
import torch.cuda.amp as amp
use_amp = False

model_name = 'MF_len48_joint_thr25_3weeks'

In [10]:
len(NUMERIC_COLUMNS)

36

In [14]:
%%time
train_lst = []
for path in PATHS[:10]:
    train_lst.append(read_norm_merge(path, 'valid'))
valid = pd.concat(train_lst)
gc.collect()

valid_dataset = AllDataset(valid, max_len_txt, NUMERIC_COLUMNS, CAT_COLUMNS)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers) 
valid.shape, len(valid_loader)

CPU times: user 4min 58s, sys: 49.7 s, total: 5min 47s
Wall time: 5min 35s


((10324907, 47), 10083)

In [15]:
# train_parts_order = np.concatenate([np.random.permutation(232)])
train_parts_order = np.array([ 46, 111, 208, 230,   3,  22, 227, 153,  78,  52,  20, 185,   6,
        130, 177,  83,  97, 194,  24, 187,  93,  59, 217, 180, 129,  62,
          1,  43, 229, 102, 196,  50,   4,  12, 114,  70,  18,  91,  71,
        190, 174,  23,  63,  89, 188,  16, 104,  67,  39, 225, 176,  28,
        198,   2,  76, 166, 216, 116, 199, 113, 107, 201,  64, 115,   8,
        171,  44, 218, 158, 181,  79,  47, 155, 159, 164, 109,  56, 106,
        122, 203, 144,  14, 163, 124, 110, 126,  80,  77,  94, 135,  33,
        134, 224, 145, 172, 191,  60, 148, 215, 212, 219,  35, 167,  37,
        132, 182, 228,  75,  87, 156, 137,  74,  29,  95, 118,  90, 222,
         19,  57, 162, 105, 223, 210, 140,  10,  72, 152, 183, 170,  51,
         82, 117,  13, 211, 120,  81, 160,  27, 200, 128, 169, 213, 179,
         42,  11, 143,  15, 209, 151,  48, 207, 112, 119, 231, 175,   0,
        146, 154,  68, 197,  21, 206, 125, 192,  31,  86, 138,  36, 108,
        103,  58, 142,  54,  98,  99, 127, 214,   7,  92, 121, 202, 141,
        150,  88,  53,  38, 139, 147, 131,  66,  40,  26, 123,  73, 100,
        165, 186, 149, 205,   5, 189,  25,  32, 133, 101, 204, 178, 193,
        136,  84, 161,  30, 221,  65,  85,  41,  17,  61,  45, 173, 195,
          9, 184,  55,  49, 168,  69,  34,  96, 157, 226, 220])
train_parts_order, train_parts_order.shape

(array([ 46, 111, 208, 230,   3,  22, 227, 153,  78,  52,  20, 185,   6,
        130, 177,  83,  97, 194,  24, 187,  93,  59, 217, 180, 129,  62,
          1,  43, 229, 102, 196,  50,   4,  12, 114,  70,  18,  91,  71,
        190, 174,  23,  63,  89, 188,  16, 104,  67,  39, 225, 176,  28,
        198,   2,  76, 166, 216, 116, 199, 113, 107, 201,  64, 115,   8,
        171,  44, 218, 158, 181,  79,  47, 155, 159, 164, 109,  56, 106,
        122, 203, 144,  14, 163, 124, 110, 126,  80,  77,  94, 135,  33,
        134, 224, 145, 172, 191,  60, 148, 215, 212, 219,  35, 167,  37,
        132, 182, 228,  75,  87, 156, 137,  74,  29,  95, 118,  90, 222,
         19,  57, 162, 105, 223, 210, 140,  10,  72, 152, 183, 170,  51,
         82, 117,  13, 211, 120,  81, 160,  27, 200, 128, 169, 213, 179,
         42,  11, 143,  15, 209, 151,  48, 207, 112, 119, 231, 175,   0,
        146, 154,  68, 197,  21, 206, 125, 192,  31,  86, 138,  36, 108,
        103,  58, 142,  54,  98,  99, 127, 214,   7

In [11]:
model = Net(len(NUMERIC_COLUMNS), layers=[1024,256,64], 
            embedding_table_shapes={'a_user_id_b_user_id': (8244536, 128), 'language': (67, 16), 'media': (15, 16), 'tweet_type': (4, 16)},
            bert_type=bert_type).cuda()

for param in model.embed.parameters():
    param.requires_grad = False

model    

Net(
  (initial_cat_layer): ConcatenatedEmbeddings(
    (embedding_layers): ModuleList(
      (0): Embedding(8244536, 128)
      (1): Embedding(67, 16)
      (2): Embedding(15, 16)
      (3): Embedding(4, 16)
    )
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (embed): Embedding(119547, 768, padding_idx=0)
  (lstm): GRU(768, 128, batch_first=True)
  (fn_layers): ModuleList(
    (0): Sequential(
      (0): Dropout(p=0.2, inplace=False)
      (1): Linear(in_features=596, out_features=1024, bias=True)
      (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Swish_Module()
    )
    (1): Sequential(
      (0): Dropout(p=0.2, inplace=False)
      (1): Linear(in_features=1024, out_features=256, bias=True)
      (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Swish_Module()
    )
    (2): Sequential(
      (0): Dropout(p=0.2, inplace=False)
      (1): Linear(in_features=256, out_features=64, b

In [17]:
optimizer = optim.AdamW(model.parameters(), lr=lr)
scaler = amp.GradScaler() if use_torch_amp else None

scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, ep-1)
scheduler_warmup = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

rce_best = 0    

## start training

In [17]:
print(model_name)

MF_len48_joint_thr25_3weeks


In [19]:
for epoch in range(1, ep+1):
    print(time.ctime(), 'Epoch:', epoch)
    scheduler_warmup.step(epoch-1) 
    
    # 5 parts per epoch
    idx_this_ep = train_parts_order[(epoch*5-5):epoch*5]
    
    train_lst = []
    for idx in idx_this_ep:
        train_lst.append(read_norm_merge(PATHS[idx], 'train' if idx<10 else 'both').to_pandas())
    train = pd.concat(train_lst)
 
    gc.collect();gc.collect();
    
    train_dataset = AllDataset(train, max_len_txt, NUMERIC_COLUMNS, CAT_COLUMNS)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers, drop_last=True) 
    
    train_loss = train_epoch(model, train_loader, optimizer, scaler)
    valid_loss,rce,mean_rce = valid_epoch(model, valid_loader)
   
    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.4f}, valid loss: {valid_loss:.4f}, mean_rce: {mean_rce:.2f}'
    for col in ['retweet', 'reply',  'like', 'retweet_comment']:
        content += f', {col}: {rce[col]:.2f}'
        
    print(content)
    
    if mean_rce > rce_best:
        print('rce_best increased ({:.6f} --> {:.6f}).  Saving model ...'.format(rce_best, mean_rce))
        rce_best = mean_rce
                
        torch.save(model.state_dict(), f'../models/{model_name}_best.pth')
        
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scaler_state_dict': scaler.state_dict() if scaler else None,      
            'rce_best': rce_best,
        },
        f'../models/{model_name}_last.pth'
    )            
        
torch.save(model.state_dict(), f'../models/{model_name}_final.pth')        

Sun May 30 00:24:52 2021 Epoch: 1


loss: 0.2464, smth: 0.2570: 100%|██████████| 12298/12298 [51:57<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:00<00:00, 83.97it/s]


Sun May 30 01:19:18 2021 Epoch 1, lr: 0.0001000, train loss: 0.2837, valid loss: 0.2565, mean_rce: 8.31, retweet: 9.04, reply: 11.18, like: 10.27, retweet_comment: 2.74
rce_best increased (0.000000 --> 8.305332).  Saving model ...
Sun May 30 01:21:23 2021 Epoch: 2


MemoryError: std::bad_alloc: CUDA error at: /home/bo/anaconda3/envs/rapids19/include/rmm/mr/device/cuda_memory_resource.hpp:69: cudaErrorMemoryAllocation out of memory

### change to pd loading

In [18]:
        sd = torch.load(f'../models/{model_name}_last.pth', map_location='cpu')
        from_epoch = sd['epoch']
        sd['model_state_dict'] = {k[7:] if k.startswith('module.') else k: sd['model_state_dict'][k] for k in sd['model_state_dict'].keys()}
        model.load_state_dict(sd['model_state_dict'], strict=True)
        optimizer.load_state_dict(sd['optimizer_state_dict'])
        scaler.load_state_dict(sd['scaler_state_dict'])

In [19]:
for epoch in range(1, ep+1):
    print(time.ctime(), 'Epoch:', epoch)
    scheduler_warmup.step(epoch-1) 
    
    if epoch<=1: continue
    
    # 5 parts per epoch
    idx_this_ep = train_parts_order[(epoch*5-5):epoch*5]
    
    train_lst = []
    for idx in tqdm(idx_this_ep):
        train_lst.append(read_norm_merge(PATHS[idx], 'train' if idx<10 else 'both'))
    train = pd.concat(train_lst)
 
    gc.collect();gc.collect();
    
    train_dataset = AllDataset(train, max_len_txt, NUMERIC_COLUMNS, CAT_COLUMNS)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers, drop_last=True) 
    
    train_loss = train_epoch(model, train_loader, optimizer, scaler)
    valid_loss,rce,mean_rce = valid_epoch(model, valid_loader)
   
    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.4f}, valid loss: {valid_loss:.4f}, mean_rce: {mean_rce:.2f}'
    for col in ['retweet', 'reply',  'like', 'retweet_comment']:
        content += f', {col}: {rce[col]:.2f}'
        
    print(content)
    
    if mean_rce > rce_best:
        print('rce_best increased ({:.6f} --> {:.6f}).  Saving model ...'.format(rce_best, mean_rce))
        rce_best = mean_rce
                
        torch.save(model.state_dict(), f'../models/{model_name}_best.pth')
        
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scaler_state_dict': scaler.state_dict() if scaler else None,      
            'rce_best': rce_best,
        },
        f'../models/{model_name}_last.pth'
    )            
        
torch.save(model.state_dict(), f'../models/{model_name}_final.pth')        

  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 01:38:04 2021 Epoch: 1
Sun May 30 01:38:04 2021 Epoch: 2


100%|██████████| 5/5 [03:59<00:00, 48.00s/it]
loss: 0.2467, smth: 0.2446: 100%|██████████| 13575/13575 [57:17<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:05<00:00, 80.60it/s]


Sun May 30 02:41:33 2021 Epoch 2, lr: 0.0010000, train loss: 0.2498, valid loss: 0.2447, mean_rce: 11.53, retweet: 13.58, reply: 13.88, like: 14.72, retweet_comment: 3.94
rce_best increased (0.000000 --> 11.530009).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 02:43:40 2021 Epoch: 3


100%|██████████| 5/5 [04:00<00:00, 48.18s/it]
loss: 0.2450, smth: 0.2370: 100%|██████████| 13671/13671 [57:44<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:07<00:00, 79.02it/s]


Sun May 30 03:47:40 2021 Epoch 3, lr: 0.0010000, train loss: 0.2397, valid loss: 0.2374, mean_rce: 13.49, retweet: 16.71, reply: 15.07, like: 17.41, retweet_comment: 4.76
rce_best increased (11.530009 --> 13.488159).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 03:49:47 2021 Epoch: 4


100%|██████████| 5/5 [04:11<00:00, 50.28s/it]
loss: 0.2195, smth: 0.2332: 100%|██████████| 14636/14636 [1:01:50<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:07<00:00, 78.99it/s]


Sun May 30 04:58:04 2021 Epoch 4, lr: 0.0009951, train loss: 0.2337, valid loss: 0.2334, mean_rce: 14.70, retweet: 18.69, reply: 15.56, like: 18.77, retweet_comment: 5.75
rce_best increased (13.488159 --> 14.695095).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 05:00:11 2021 Epoch: 5


100%|██████████| 5/5 [04:08<00:00, 49.60s/it]
loss: 0.2316, smth: 0.2293: 100%|██████████| 14331/14331 [1:00:32<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.63it/s]


Sun May 30 06:07:08 2021 Epoch 5, lr: 0.0009891, train loss: 0.2298, valid loss: 0.2298, mean_rce: 15.78, retweet: 20.46, reply: 15.97, like: 19.98, retweet_comment: 6.70
rce_best increased (14.695095 --> 15.775962).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 06:09:17 2021 Epoch: 6


100%|██████████| 5/5 [03:44<00:00, 44.92s/it]
loss: 0.2366, smth: 0.2271: 100%|██████████| 12543/12543 [53:00<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.67it/s]


Sun May 30 07:08:17 2021 Epoch 6, lr: 0.0009806, train loss: 0.2270, valid loss: 0.2261, mean_rce: 16.87, retweet: 21.38, reply: 17.27, like: 21.50, retweet_comment: 7.32
rce_best increased (15.775962 --> 16.870298).  Saving model ...
Sun May 30 07:10:23 2021 Epoch: 7


100%|██████████| 5/5 [04:02<00:00, 48.49s/it]
loss: 0.2181, smth: 0.2245: 100%|██████████| 13903/13903 [58:43<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.15it/s]


Sun May 30 08:15:26 2021 Epoch 7, lr: 0.0009698, train loss: 0.2251, valid loss: 0.2243, mean_rce: 17.45, retweet: 22.12, reply: 17.86, like: 22.15, retweet_comment: 7.68
rce_best increased (16.870298 --> 17.450060).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 08:17:33 2021 Epoch: 8


100%|██████████| 5/5 [04:09<00:00, 49.85s/it]
loss: 0.2306, smth: 0.2205: 100%|██████████| 14638/14638 [1:01:57<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:10<00:00, 77.29it/s]


Sun May 30 09:25:58 2021 Epoch 8, lr: 0.0009568, train loss: 0.2228, valid loss: 0.2225, mean_rce: 18.17, retweet: 23.15, reply: 18.67, like: 22.55, retweet_comment: 8.31
rce_best increased (17.450060 --> 18.168774).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 09:28:07 2021 Epoch: 9


100%|██████████| 5/5 [04:15<00:00, 51.07s/it]
loss: 0.2264, smth: 0.2212: 100%|██████████| 14635/14635 [1:01:54<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.46it/s]


Sun May 30 10:36:34 2021 Epoch 9, lr: 0.0009415, train loss: 0.2211, valid loss: 0.2204, mean_rce: 18.77, retweet: 23.59, reply: 19.33, like: 23.43, retweet_comment: 8.74
rce_best increased (18.168774 --> 18.771233).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 10:38:41 2021 Epoch: 10


100%|██████████| 5/5 [03:55<00:00, 47.01s/it]
loss: 0.2101, smth: 0.2173: 100%|██████████| 13865/13865 [58:22<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.77it/s]


Sun May 30 11:43:15 2021 Epoch 10, lr: 0.0009240, train loss: 0.2196, valid loss: 0.2197, mean_rce: 19.19, retweet: 24.07, reply: 19.90, like: 23.50, retweet_comment: 9.28
rce_best increased (18.771233 --> 19.188765).  Saving model ...
Sun May 30 11:45:23 2021 Epoch: 11


100%|██████████| 5/5 [03:58<00:00, 47.63s/it]
loss: 0.2078, smth: 0.2190: 100%|██████████| 13673/13673 [57:36<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.24it/s]


Sun May 30 12:49:13 2021 Epoch 11, lr: 0.0009045, train loss: 0.2186, valid loss: 0.2181, mean_rce: 19.71, retweet: 24.38, reply: 20.46, like: 24.22, retweet_comment: 9.76
rce_best increased (19.188765 --> 19.705486).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 12:51:21 2021 Epoch: 12


100%|██████████| 5/5 [04:03<00:00, 48.77s/it]
loss: 0.2238, smth: 0.2156: 100%|██████████| 14359/14359 [1:00:30<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.67it/s]


Sun May 30 13:58:11 2021 Epoch 12, lr: 0.0008830, train loss: 0.2173, valid loss: 0.2163, mean_rce: 20.25, retweet: 24.93, reply: 20.96, like: 24.93, retweet_comment: 10.17
rce_best increased (19.705486 --> 20.247375).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 14:00:19 2021 Epoch: 13


100%|██████████| 5/5 [03:56<00:00, 47.24s/it]
loss: 0.2033, smth: 0.2154: 100%|██████████| 13714/13714 [57:46<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.75it/s]


Sun May 30 15:04:17 2021 Epoch 13, lr: 0.0008597, train loss: 0.2162, valid loss: 0.2156, mean_rce: 20.51, retweet: 25.19, reply: 21.17, like: 25.14, retweet_comment: 10.53
rce_best increased (20.247375 --> 20.510010).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 15:06:25 2021 Epoch: 14


100%|██████████| 5/5 [04:01<00:00, 48.24s/it]
loss: 0.2018, smth: 0.2147: 100%|██████████| 14312/14312 [1:00:17<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:07<00:00, 78.86it/s]


Sun May 30 16:12:59 2021 Epoch 14, lr: 0.0008346, train loss: 0.2153, valid loss: 0.2135, mean_rce: 21.08, retweet: 25.93, reply: 21.43, like: 26.00, retweet_comment: 10.98
rce_best increased (20.510010 --> 21.083277).  Saving model ...
Sun May 30 16:15:06 2021 Epoch: 15


100%|██████████| 5/5 [04:05<00:00, 49.03s/it]
loss: 0.2253, smth: 0.2138: 100%|██████████| 14636/14636 [1:01:39<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.16it/s]


Sun May 30 17:23:07 2021 Epoch 15, lr: 0.0008078, train loss: 0.2143, valid loss: 0.2129, mean_rce: 21.25, retweet: 25.85, reply: 21.68, like: 26.31, retweet_comment: 11.16
rce_best increased (21.083277 --> 21.248219).  Saving model ...
Sun May 30 17:25:13 2021 Epoch: 16


100%|██████████| 5/5 [04:00<00:00, 48.18s/it]
loss: 0.2282, smth: 0.2123: 100%|██████████| 14522/14522 [1:01:22<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:11<00:00, 76.78it/s]


Sun May 30 18:32:55 2021 Epoch 16, lr: 0.0007796, train loss: 0.2136, valid loss: 0.2115, mean_rce: 21.69, retweet: 26.39, reply: 22.15, like: 26.83, retweet_comment: 11.41
rce_best increased (21.248219 --> 21.692936).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 18:35:01 2021 Epoch: 17


100%|██████████| 5/5 [04:16<00:00, 51.36s/it]
loss: 0.2185, smth: 0.2144: 100%|██████████| 14856/14856 [1:03:05<00:00,  3.92it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.79it/s]


Sun May 30 19:44:40 2021 Epoch 17, lr: 0.0007500, train loss: 0.2128, valid loss: 0.2108, mean_rce: 21.96, retweet: 26.63, reply: 22.46, like: 27.02, retweet_comment: 11.72
rce_best increased (21.692936 --> 21.956787).  Saving model ...
Sun May 30 19:46:46 2021 Epoch: 18


100%|██████████| 5/5 [04:13<00:00, 50.62s/it]
loss: 0.2067, smth: 0.2110: 100%|██████████| 14637/14637 [1:02:06<00:00,  3.93it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.79it/s]


Sun May 30 20:55:23 2021 Epoch 18, lr: 0.0007192, train loss: 0.2121, valid loss: 0.2099, mean_rce: 22.15, retweet: 26.96, reply: 22.52, like: 27.44, retweet_comment: 11.70
rce_best increased (21.956787 --> 22.152912).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 20:57:32 2021 Epoch: 19


100%|██████████| 5/5 [03:59<00:00, 47.85s/it]
loss: 0.2001, smth: 0.2134: 100%|██████████| 13821/13821 [58:35<00:00,  3.93it/s]
100%|██████████| 10083/10083 [02:11<00:00, 76.44it/s]


Sun May 30 22:02:29 2021 Epoch 19, lr: 0.0006873, train loss: 0.2113, valid loss: 0.2093, mean_rce: 22.38, retweet: 27.16, reply: 22.54, like: 27.66, retweet_comment: 12.16
rce_best increased (22.152912 --> 22.378281).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Sun May 30 22:04:37 2021 Epoch: 20


100%|██████████| 5/5 [04:07<00:00, 49.48s/it]
loss: 0.2110, smth: 0.2111: 100%|██████████| 14182/14182 [1:00:08<00:00,  3.93it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.93it/s]


Sun May 30 23:11:10 2021 Epoch 20, lr: 0.0006545, train loss: 0.2108, valid loss: 0.2083, mean_rce: 22.74, retweet: 27.20, reply: 23.19, like: 28.12, retweet_comment: 12.44
rce_best increased (22.378281 --> 22.737848).  Saving model ...
Sun May 30 23:13:16 2021 Epoch: 21


100%|██████████| 5/5 [04:03<00:00, 48.63s/it]
loss: 0.2188, smth: 0.2090: 100%|██████████| 14296/14296 [1:00:38<00:00,  3.93it/s]
100%|██████████| 10083/10083 [02:11<00:00, 76.96it/s]


Mon May 31 00:20:17 2021 Epoch 21, lr: 0.0006210, train loss: 0.2102, valid loss: 0.2076, mean_rce: 22.94, retweet: 27.44, reply: 23.39, like: 28.34, retweet_comment: 12.60
rce_best increased (22.737848 --> 22.941046).  Saving model ...
Mon May 31 00:22:25 2021 Epoch: 22


100%|██████████| 5/5 [04:00<00:00, 48.10s/it]
loss: 0.2082, smth: 0.2096: 100%|██████████| 13550/13550 [57:11<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:07<00:00, 79.06it/s]


Mon May 31 01:25:53 2021 Epoch 22, lr: 0.0005868, train loss: 0.2097, valid loss: 0.2067, mean_rce: 23.27, retweet: 27.88, reply: 23.63, like: 28.64, retweet_comment: 12.93
rce_best increased (22.941046 --> 23.271320).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 01:28:00 2021 Epoch: 23


100%|██████████| 5/5 [04:06<00:00, 49.26s/it]
loss: 0.2005, smth: 0.2070: 100%|██████████| 14635/14635 [1:01:38<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:07<00:00, 78.94it/s]


Mon May 31 02:36:00 2021 Epoch 23, lr: 0.0005523, train loss: 0.2092, valid loss: 0.2065, mean_rce: 23.28, retweet: 27.69, reply: 23.64, like: 28.83, retweet_comment: 12.95
rce_best increased (23.271320 --> 23.277933).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 02:38:08 2021 Epoch: 24


100%|██████████| 5/5 [04:00<00:00, 48.00s/it]
loss: 0.2226, smth: 0.2096: 100%|██████████| 14180/14180 [59:42<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:06<00:00, 79.54it/s]


Mon May 31 03:44:05 2021 Epoch 24, lr: 0.0005174, train loss: 0.2086, valid loss: 0.2054, mean_rce: 23.65, retweet: 28.07, reply: 24.08, like: 29.24, retweet_comment: 13.23
rce_best increased (23.277933 --> 23.654835).  Saving model ...
Mon May 31 03:46:12 2021 Epoch: 25


100%|██████████| 5/5 [03:55<00:00, 47.10s/it]
loss: 0.2100, smth: 0.2069: 100%|██████████| 13862/13862 [58:22<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.37it/s]


Mon May 31 04:50:47 2021 Epoch 25, lr: 0.0004826, train loss: 0.2082, valid loss: 0.2049, mean_rce: 23.77, retweet: 28.39, reply: 24.13, like: 29.35, retweet_comment: 13.23
rce_best increased (23.654835 --> 23.773729).  Saving model ...
Mon May 31 04:52:54 2021 Epoch: 26


100%|██████████| 5/5 [04:08<00:00, 49.63s/it]
loss: 0.2144, smth: 0.2080: 100%|██████████| 14634/14634 [1:01:37<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.30it/s]


Mon May 31 06:00:56 2021 Epoch 26, lr: 0.0004477, train loss: 0.2078, valid loss: 0.2046, mean_rce: 23.84, retweet: 28.24, reply: 24.27, like: 29.61, retweet_comment: 13.22
rce_best increased (23.773729 --> 23.835400).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 06:03:02 2021 Epoch: 27


100%|██████████| 5/5 [04:04<00:00, 48.81s/it]
loss: 0.2163, smth: 0.2081: 100%|██████████| 14671/14671 [1:01:46<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.04it/s]


Mon May 31 07:11:10 2021 Epoch 27, lr: 0.0004132, train loss: 0.2073, valid loss: 0.2039, mean_rce: 24.16, retweet: 28.69, reply: 24.47, like: 29.77, retweet_comment: 13.71
rce_best increased (23.835400 --> 24.157763).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 07:13:18 2021 Epoch: 28


100%|██████████| 5/5 [04:04<00:00, 48.90s/it]
loss: 0.2163, smth: 0.2063: 100%|██████████| 14626/14626 [1:01:34<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.18it/s]


Mon May 31 08:21:14 2021 Epoch 28, lr: 0.0003790, train loss: 0.2067, valid loss: 0.2036, mean_rce: 24.27, retweet: 28.76, reply: 24.67, like: 29.84, retweet_comment: 13.80
rce_best increased (24.157763 --> 24.268223).  Saving model ...
Mon May 31 08:23:20 2021 Epoch: 29


100%|██████████| 5/5 [04:07<00:00, 49.46s/it]
loss: 0.2039, smth: 0.2065: 100%|██████████| 14668/14668 [1:01:44<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.92it/s]


Mon May 31 09:31:29 2021 Epoch 29, lr: 0.0003455, train loss: 0.2065, valid loss: 0.2026, mean_rce: 24.54, retweet: 29.02, reply: 24.85, like: 30.26, retweet_comment: 14.03
rce_best increased (24.268223 --> 24.540142).  Saving model ...
Mon May 31 09:33:34 2021 Epoch: 30


100%|██████████| 5/5 [04:05<00:00, 49.18s/it]
loss: 0.2161, smth: 0.2057: 100%|██████████| 14645/14645 [1:01:39<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.27it/s]


Mon May 31 10:41:36 2021 Epoch 30, lr: 0.0003127, train loss: 0.2061, valid loss: 0.2021, mean_rce: 24.70, retweet: 29.19, reply: 25.12, like: 30.45, retweet_comment: 14.01
rce_best increased (24.540142 --> 24.696213).  Saving model ...
Mon May 31 10:43:41 2021 Epoch: 31


100%|██████████| 5/5 [03:43<00:00, 44.74s/it]
loss: 0.2006, smth: 0.2047: 100%|██████████| 13018/13018 [54:50<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.56it/s]


Mon May 31 11:44:31 2021 Epoch 31, lr: 0.0002808, train loss: 0.2056, valid loss: 0.2020, mean_rce: 24.67, retweet: 29.15, reply: 24.98, like: 30.54, retweet_comment: 14.00
Mon May 31 11:46:03 2021 Epoch: 32


100%|██████████| 5/5 [03:57<00:00, 47.51s/it]
loss: 0.2135, smth: 0.2046: 100%|██████████| 13865/13865 [58:26<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.90it/s]


Mon May 31 12:50:44 2021 Epoch 32, lr: 0.0002500, train loss: 0.2049, valid loss: 0.2013, mean_rce: 24.95, retweet: 29.52, reply: 25.32, like: 30.75, retweet_comment: 14.21
rce_best increased (24.696213 --> 24.949640).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 12:52:49 2021 Epoch: 33


100%|██████████| 5/5 [04:01<00:00, 48.32s/it]
loss: 0.1938, smth: 0.2051: 100%|██████████| 14470/14470 [1:00:59<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.02it/s]


Mon May 31 14:00:07 2021 Epoch 33, lr: 0.0002204, train loss: 0.2048, valid loss: 0.2007, mean_rce: 25.12, retweet: 29.59, reply: 25.49, like: 31.02, retweet_comment: 14.39
rce_best increased (24.949640 --> 25.124504).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 14:02:13 2021 Epoch: 34


100%|██████████| 5/5 [04:04<00:00, 49.00s/it]
loss: 0.2127, smth: 0.2047: 100%|██████████| 14637/14637 [1:01:38<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.31it/s]


Mon May 31 15:10:13 2021 Epoch 34, lr: 0.0001922, train loss: 0.2046, valid loss: 0.2004, mean_rce: 25.20, retweet: 29.78, reply: 25.55, like: 31.07, retweet_comment: 14.38
rce_best increased (25.124504 --> 25.196066).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 15:12:20 2021 Epoch: 35


100%|██████████| 5/5 [04:04<00:00, 48.84s/it]
loss: 0.1989, smth: 0.2023: 100%|██████████| 14635/14635 [1:01:35<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.40it/s]


Mon May 31 16:20:17 2021 Epoch 35, lr: 0.0001654, train loss: 0.2042, valid loss: 0.1998, mean_rce: 25.45, retweet: 29.85, reply: 25.68, like: 31.38, retweet_comment: 14.86
rce_best increased (25.196066 --> 25.445557).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 16:22:21 2021 Epoch: 36


100%|██████████| 5/5 [03:59<00:00, 47.83s/it]
loss: 0.2033, smth: 0.2039: 100%|██████████| 13586/13586 [57:15<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:08<00:00, 78.47it/s]


Mon May 31 17:25:52 2021 Epoch 36, lr: 0.0001403, train loss: 0.2039, valid loss: 0.1996, mean_rce: 25.48, retweet: 29.87, reply: 25.80, like: 31.44, retweet_comment: 14.81
rce_best increased (25.445557 --> 25.481163).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 17:27:57 2021 Epoch: 37


100%|██████████| 5/5 [04:03<00:00, 48.66s/it]
loss: 0.2083, smth: 0.2044: 100%|██████████| 14526/14526 [1:01:14<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.89it/s]


Mon May 31 18:35:31 2021 Epoch 37, lr: 0.0001170, train loss: 0.2035, valid loss: 0.1990, mean_rce: 25.66, retweet: 30.13, reply: 25.85, like: 31.66, retweet_comment: 15.02
rce_best increased (25.481163 --> 25.664833).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 18:37:38 2021 Epoch: 38


100%|██████████| 5/5 [04:06<00:00, 49.27s/it]
loss: 0.1996, smth: 0.2025: 100%|██████████| 14636/14636 [1:01:42<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:10<00:00, 77.21it/s]


Mon May 31 19:45:46 2021 Epoch 38, lr: 0.0000955, train loss: 0.2034, valid loss: 0.1991, mean_rce: 25.70, retweet: 30.18, reply: 26.05, like: 31.56, retweet_comment: 15.01
rce_best increased (25.664833 --> 25.700359).  Saving model ...
Mon May 31 19:47:52 2021 Epoch: 39


100%|██████████| 5/5 [04:05<00:00, 49.05s/it]
loss: 0.2044, smth: 0.2017: 100%|██████████| 14637/14637 [1:01:39<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.14it/s]


Mon May 31 20:55:54 2021 Epoch 39, lr: 0.0000760, train loss: 0.2034, valid loss: 0.1989, mean_rce: 25.72, retweet: 30.13, reply: 26.03, like: 31.71, retweet_comment: 15.02
rce_best increased (25.700359 --> 25.722523).  Saving model ...
Mon May 31 20:57:59 2021 Epoch: 40


100%|██████████| 5/5 [03:55<00:00, 47.15s/it]
loss: 0.2048, smth: 0.2020: 100%|██████████| 13508/13508 [56:55<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.15it/s]


Mon May 31 22:01:06 2021 Epoch 40, lr: 0.0000585, train loss: 0.2026, valid loss: 0.1983, mean_rce: 25.92, retweet: 30.35, reply: 26.28, like: 31.94, retweet_comment: 15.10
rce_best increased (25.722523 --> 25.916914).  Saving model ...
Mon May 31 22:03:15 2021 Epoch: 41


100%|██████████| 5/5 [04:07<00:00, 49.55s/it]
loss: 0.1972, smth: 0.2034: 100%|██████████| 14637/14637 [1:01:40<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.15it/s]


Mon May 31 23:11:20 2021 Epoch 41, lr: 0.0000432, train loss: 0.2029, valid loss: 0.1983, mean_rce: 25.95, retweet: 30.32, reply: 26.33, like: 31.92, retweet_comment: 15.24
rce_best increased (25.916914 --> 25.952074).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Mon May 31 23:13:27 2021 Epoch: 42


100%|██████████| 5/5 [04:00<00:00, 48.07s/it]
loss: 0.1962, smth: 0.2038: 100%|██████████| 14473/14473 [1:00:59<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:09<00:00, 78.11it/s]


Tue Jun  1 00:20:43 2021 Epoch 42, lr: 0.0000302, train loss: 0.2027, valid loss: 0.1980, mean_rce: 26.01, retweet: 30.44, reply: 26.35, like: 32.02, retweet_comment: 15.21
rce_best increased (25.952074 --> 26.005121).  Saving model ...
Tue Jun  1 00:22:49 2021 Epoch: 43


100%|██████████| 5/5 [03:53<00:00, 46.71s/it]
loss: 0.1964, smth: 0.2036: 100%|██████████| 14187/14187 [59:45<00:00,  3.96it/s]
100%|██████████| 10083/10083 [02:09<00:00, 77.72it/s]


Tue Jun  1 01:28:46 2021 Epoch 43, lr: 0.0000194, train loss: 0.2026, valid loss: 0.1980, mean_rce: 26.05, retweet: 30.51, reply: 26.33, like: 32.02, retweet_comment: 15.32
rce_best increased (26.005121 --> 26.048130).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Tue Jun  1 01:30:52 2021 Epoch: 44


100%|██████████| 5/5 [04:00<00:00, 48.14s/it]
loss: 0.2028, smth: 0.2035: 100%|██████████| 14784/14784 [1:02:35<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:10<00:00, 77.47it/s]


Tue Jun  1 02:39:47 2021 Epoch 44, lr: 0.0000109, train loss: 0.2026, valid loss: 0.1979, mean_rce: 26.06, retweet: 30.47, reply: 26.38, like: 32.05, retweet_comment: 15.31
rce_best increased (26.048130 --> 26.055038).  Saving model ...


  0%|          | 0/5 [00:00<?, ?it/s]

Tue Jun  1 02:41:56 2021 Epoch: 45


100%|██████████| 5/5 [03:54<00:00, 46.84s/it]
loss: 0.2097, smth: 0.2018: 100%|██████████| 13823/13823 [58:26<00:00,  3.94it/s]
100%|██████████| 10083/10083 [02:10<00:00, 77.24it/s]


Tue Jun  1 03:46:35 2021 Epoch 45, lr: 0.0000049, train loss: 0.2024, valid loss: 0.1980, mean_rce: 26.05, retweet: 30.50, reply: 26.39, like: 32.02, retweet_comment: 15.30
Tue Jun  1 03:48:07 2021 Epoch: 46


100%|██████████| 5/5 [04:02<00:00, 48.46s/it]
loss: 0.2023, smth: 0.2006: 100%|██████████| 14635/14635 [1:01:45<00:00,  3.95it/s]
100%|██████████| 10083/10083 [02:10<00:00, 77.51it/s]


Tue Jun  1 04:56:12 2021 Epoch 46, lr: 0.0000012, train loss: 0.2025, valid loss: 0.1979, mean_rce: 26.04, retweet: 30.50, reply: 26.37, like: 32.05, retweet_comment: 15.24


In [None]:
# 20 parts, 5 epochs
mean_rce: 16.51, retweet: 20.27, reply: 18.26, like: 20.17, retweet_comment: 7.35
                    
Epoch 20, lr: 0.0000794, train loss: 0.2142, valid loss: 0.2233, 
mean_rce: 18.66, retweet: 23.00, reply: 20.36, like: 21.72, retweet_comment: 9.56     
                    
Epoch 35, lr: 0.0001654, train loss: 0.2105, valid loss: 0.2220, 
mean_rce: 19.25, retweet: 23.58, reply: 21.08, like: 22.09, retweet_comment: 10.27                    

# xgb feat NN                    
mean_rce: 20.25, retweet: 23.39, reply: 19.07, like: 13.02, retweet_comment: 25.54                    

## load best ep and inference

In [27]:
sd = torch.load(f'../models/{model_name}_best.pth')
sd = {k[7:] if k.startswith('module.') else k: sd[k] for k in sd.keys()}
model.load_state_dict(sd, strict=True)

<All keys matched successfully>

In [28]:
label_names = sorted(label_names)
label_names

['like', 'reply', 'retweet', 'retweet_comment']

In [29]:
model.eval()
val_loss = []
LOGITS = []
TARGETS = []
with torch.no_grad():
    for batch in tqdm(valid_loader):
        x_cat, x_cont, text_tok, targets = batch
        x_cat = x_cat.cuda()     
        x_cont = x_cont.cuda()
        text_tok = text_tok.cuda()
        targets = targets.cuda()            
        logits = model(x_cat, x_cont, text_tok)
        loss = criterion(logits, targets)
        val_loss.append(loss.item())
        LOGITS.append(logits.cpu())
        TARGETS.append(targets.cpu())

LOGITS = torch.cat(LOGITS)
TARGETS = torch.cat(TARGETS)
rce = {}
for i in range(4):
    rce[label_names[i]] = compute_rce_fast(cp.asarray(LOGITS[:,i].sigmoid()),cp.asarray(TARGETS[:,i])).get()            
mean_rce = np.mean([v for k,v in rce.items()])
mean_rce

100%|██████████| 10083/10083 [01:43<00:00, 97.81it/s] 


20.13601

In [30]:
# df_quantile = pd.concat([pd.read_parquet(path)[['quantile']] for path in VALID_PATHS]).reset_index(drop=True)
# df_quantile = df_quantile.apply(np.expm1).round().astype(int)
df_quantile = valid[['quantile']].copy().reset_index(drop=True)
df_quantile.shape

yquantile = cupy.asarray(df_quantile.values)
oof = cupy.asarray(LOGITS.sigmoid())
yvalid = cupy.asarray(TARGETS)

In [31]:
from util import compute_prauc, average_precision_score,display_score

rce_output = {}
ap_output = {}
for i in range(4):
    prauc_out = []
    rce_out = []
    ap_out = []
    for j in range(5):
        this_quantile_idx = (df_quantile == j)['quantile'].values
        yvalid_tmp = yvalid[this_quantile_idx][:, i]
        oof_tmp = oof[this_quantile_idx][:, i]
        prauc = compute_prauc(oof_tmp, yvalid_tmp)
        rce   = compute_rce_fast(oof_tmp, yvalid_tmp).item()
        ap    = average_precision_score(cupy.asnumpy(yvalid_tmp),cupy.asnumpy(oof_tmp))
        prauc_out.append(prauc)
        rce_out.append(rce)
        ap_out.append(ap)
    rce_output[label_names[i]] = rce_out
    ap_output[label_names[i]] = ap_out

In [32]:
print(model_name)
display_score(rce_output, ap_output)

gru_cat5_cont36_frzemb768_gru128_len64_thr50_3weeks
Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.4827     25.3136     0.2257     18.1147     0.7328     17.5110     0.0545      9.7572
        1          0.4643     24.6773     0.2055     18.0764     0.7295     17.6294     0.0554     10.1372
        2          0.4435     23.6253     0.2155     18.7955     0.7326     18.1288     0.0489      8.8695
        3          0.4352     23.1426     0.2309     19.5610     0.7339     18.9700     0.0433      8.8682
        4          0.4320     24.3085     0.2189     21.0982     0.7610     27.1258     0.0494     11.0005
     Average       0.4515     24.2134     0.2193     19.1291     0.7379     19.8730     0.0503      9.7265


In [28]:
# XGB
%%time
display_score(rce_output, ap_output)

Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.4767     23.8403     0.2466     19.7694     0.7496     18.7935     0.0803     11.3031
        1          0.4608     24.3840     0.2391     20.9897     0.7444     20.0392     0.0680     10.6829
        2          0.4501     24.8955     0.2513     22.2341     0.7393     20.2754     0.0682     11.2699
        3          0.4384     24.5673     0.2677     23.6996     0.7334     20.9961     0.0662     11.5356
        4          0.4124     24.7017     0.2411     24.6334     0.7059     20.6678     0.0710     15.1545
     Average       0.4477     24.4778     0.2492     22.2652     0.7345     20.1544     0.0707     11.9892
CPU times: user 0 ns, sys: 1 ms, total: 1 ms
Wall time: 950 µs


## load best ep and inference LB valid

In [12]:
def read_norm_merge(ddf):

    ddf['quantile'] = 0
    quantiles = [ 240,  588, 1331, 3996]
    for i, quant in enumerate(quantiles):
        ddf['quantile'] = (ddf['quantile']+(ddf['a_follower_count']>quant).astype('int8')).astype('int8')

    ddf['date'] = cudf.to_datetime(ddf['timestamp'], unit='s')
       
    ddf['a_ff_rate'] = (ddf['a_following_count'] / ddf['a_follower_count']).astype('float32')
    ddf['b_ff_rate'] = (ddf['b_follower_count']  / ddf['b_following_count']).astype('float32')
    ddf['ab_fing_rate'] = (ddf['a_following_count'] / ddf['b_following_count']).astype('float32')
    ddf['ab_fer_rate'] = (ddf['a_follower_count'] / (1+ddf['b_follower_count'])).astype('float32')
    ddf['a_age'] = ddf['a_account_creation'].astype('int16') + 128
    ddf['b_age'] = ddf['b_account_creation'].astype('int16') + 128
    ddf['ab_age_dff'] = ddf['b_age'] - ddf['a_age']
    ddf['ab_age_rate'] = ddf['a_age']/(1+ddf['b_age'])

    ## Normalize
    for col in NUMERIC_COLUMNS:
        if col == 'tw_len_quest':
            ddf[col] = np.clip(ddf[col].values.get(),0,None)
        if ddf[col].dtype == 'uint16':
            ddf[col].astype('int32')

        if col == 'ab_age_dff':
            ddf[col] = ddf[col] / 256.            
        elif 'int' in str(ddf[col].dtype) or 'float' in str(ddf[col].dtype):    
            ddf[col] = np.log1p(ddf[col])

        if ddf[col].dtype == 'float64':
            ddf[col] = ddf[col].astype('float32') 

    ## get categorical embedding id        
    for col in CAT_COLUMNS:
        ddf[col] = ddf[col].astype('float')
        if col in ['a_user_id','b_user_id']:
            mapping_col = 'a_user_id_b_user_id'
        else:
            mapping_col = col
        mapping = cudf.read_parquet(f'/raid/recsys_pre_TE_w_tok/workflow_232parts_joint_thr25/categories/unique.{mapping_col}.parquet').reset_index()
        mapping.columns = ['index',col]
        ddf = ddf.merge(mapping, how='left', on=col).drop(columns=[col]).rename(columns={'index':col})
        ddf[col] = ddf[col].fillna(0).astype('int')        

    label_names = ['reply', 'retweet', 'retweet_comment', 'like']
    DONT_USE = ['timestamp','a_account_creation','b_account_creation','engage_time',
                'fold', 'dt_dow', 'a_account_creation', 
                'b_account_creation', 'elapsed_time', 'links','domains','hashtags','id', 'date', 'is_train', 
                'tw_hash0', 'tw_hash1', 'tw_hash2', 'tw_http0', 'tw_uhash', 'tw_hash', 'tw_word0', 
                'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4', 'dt_minute', 'dt_second',
               'dt_day', 'group', 'text', 'tweet_id', 'tw_original_user0', 'tw_original_user1', 'tw_original_user2',
                'tw_rt_user0', 'tw_original_http0', 'tw_tweet',]
    DONT_USE = [c for c in ddf.columns if c in DONT_USE]
    gc.collect(); gc.collect()
    
    return ddf.drop(columns=DONT_USE)

In [13]:
%%time
df = cudf.read_parquet('/raid/recsys_valid/valid_proc.parquet',num_rows=7_000_000)
df = read_norm_merge(df).to_pandas()

df2 = cudf.read_parquet('/raid/recsys_valid/valid_proc.parquet',skiprows=7_000_000)
df2 = read_norm_merge(df2).to_pandas()

valid = pd.concat([df,df2])
del df,df2
gc.collect()

valid_dataset = AllDataset(valid, max_len_txt, NUMERIC_COLUMNS, CAT_COLUMNS)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers) 
valid.shape, len(valid_loader)

CPU times: user 17.4 s, sys: 10.5 s, total: 27.9 s
Wall time: 30 s


((14461760, 47), 14123)

In [14]:
# sd = torch.load(f'../models/{model_name}_best.pth')
sd = torch.load('/home/bo/kaggle/recsys/recsysChallenge2021/bo/sub/v11_len48_thr25_joint_MF/MF_len48_joint_thr25_3weeks_best.pth')
sd = {k[7:] if k.startswith('module.') else k: sd[k] for k in sd.keys()}
model.load_state_dict(sd, strict=True)

<All keys matched successfully>

In [15]:
label_names = sorted(label_names)
label_names

['like', 'reply', 'retweet', 'retweet_comment']

In [16]:
model.eval()
val_loss = []
LOGITS = []
TARGETS = []
with torch.no_grad():
    for batch in tqdm(valid_loader):
        x_cat, x_cont, text_tok, targets = batch
        x_cat = x_cat.cuda()     
        x_cont = x_cont.cuda()
        text_tok = text_tok.cuda()
        targets = targets.cuda()            
        logits = model(x_cat, x_cont, text_tok)
        loss = criterion(logits, targets)
        val_loss.append(loss.item())
        LOGITS.append(logits.cpu())
        TARGETS.append(targets.cpu())

LOGITS = torch.cat(LOGITS)
TARGETS = torch.cat(TARGETS)
rce = {}
for i in range(4):
    rce[label_names[i]] = compute_rce_fast(cp.asarray(LOGITS[:,i].sigmoid()),cp.asarray(TARGETS[:,i])).get()            
mean_rce = np.mean([v for k,v in rce.items()])
mean_rce

100%|██████████| 14123/14123 [02:46<00:00, 84.59it/s]


13.316917

In [17]:
# df_quantile = pd.concat([pd.read_parquet(path)[['quantile']] for path in VALID_PATHS]).reset_index(drop=True)
# df_quantile = df_quantile.apply(np.expm1).round().astype(int)
df_quantile = valid[['quantile']].copy().reset_index(drop=True)
df_quantile.shape

yquantile = cupy.asarray(df_quantile.values)
oof = cupy.asarray(LOGITS.sigmoid())
yvalid = cupy.asarray(TARGETS)

In [18]:
from util import compute_prauc, average_precision_score,display_score

rce_output = {}
ap_output = {}
for i in range(4):
    prauc_out = []
    rce_out = []
    ap_out = []
    for j in range(5):
        this_quantile_idx = (df_quantile == j)['quantile'].values
        yvalid_tmp = yvalid[this_quantile_idx][:, i]
        oof_tmp = oof[this_quantile_idx][:, i]
        prauc = compute_prauc(oof_tmp, yvalid_tmp)
        rce   = compute_rce_fast(oof_tmp, yvalid_tmp).item()
        ap    = average_precision_score(cupy.asnumpy(yvalid_tmp),cupy.asnumpy(oof_tmp))
        prauc_out.append(prauc)
        rce_out.append(rce)
        ap_out.append(ap)
    rce_output[label_names[i]] = rce_out
    ap_output[label_names[i]] = ap_out

In [19]:
# public test
print(model_name)
display_score(rce_output, ap_output)

MF_len48_joint_thr25_3weeks
Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.3648     19.1185     0.1768     17.4406     0.5983      8.8159     0.0343      8.4536
        1          0.3457     18.4016     0.1811     17.8598     0.5753      6.9555     0.0309      8.3305
        2          0.3388     17.9507     0.2000     18.7393     0.5646      6.6237     0.0306      8.0775
        3          0.3504     17.6180     0.2199     19.9633     0.5772      7.1854     0.0310      8.4171
        4          0.3247     16.5028     0.1267     14.6986     0.6501     10.9945     0.0286      8.7294
     Average       0.3449     17.9183     0.1809     17.7403     0.5931      8.1150     0.0311      8.4016


In [None]:
boliu0	sub_0601   0.3446	17.9344     	0.1829	17.8545	    0.5926	    8.0979	    0.0314	   8.4308	9 hours	258

In [31]:
print(model_name)
display_score(rce_output, ap_output)

MF_len48_joint_thr25_3weeks
Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.5608     32.5540     0.2911     23.4060     0.8070     28.8335     0.0951     15.3339
        1          0.5250     30.9199     0.2968     24.7680     0.8037     29.8295     0.0780     14.2486
        2          0.5138     30.4446     0.3222     26.5348     0.7998     30.7027     0.0694     14.0940
        3          0.5267     31.0953     0.3355     27.6502     0.8038     31.8786     0.0766     15.1034
        4          0.4854     29.5893     0.2521     24.8843     0.7972     33.1890     0.0790     15.7639
     Average       0.5223     30.9206     0.2995     25.4487     0.8023     30.8866     0.0797     14.9087


In [25]:
print(model_name)
display_score(rce_output, ap_output)

MF_len48_joint_thr25_3weeks
Quantile Group|AP Retweet|RCE Retweet|  AP Reply|  RCE Reply|   AP Like|   RCE Like|AP RT comment|RCE RT comment
        0          0.5756     33.0314     0.2987     22.8756     0.8081     28.1401     0.0998     15.3117
        1          0.5540     32.3156     0.2850     23.5546     0.8063     29.1117     0.0939     15.5322
        2          0.5309     31.2906     0.2963     24.6103     0.8050     29.6993     0.0826     14.4356
        3          0.5168     30.4555     0.3086     25.7036     0.8003     30.3232     0.0723     14.2019
        4          0.4954     29.9846     0.2819     26.2819     0.7986     32.8644     0.0774     15.5282
     Average       0.5345     31.4155     0.2941     24.6052     0.8037     30.0277     0.0852     15.0019
