In [1]:
import os
import gc
import time
import psutil
from pathlib import Path

import pandas as pd
import numpy as np
import random
import math

pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
%env TOKENIZERS_PARALLELISM=false
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

env: TOKENIZERS_PARALLELISM=false


In [2]:
class CFG:
    mode = 'all'
    input_dir = '/kaggle/input/learning-equality-curriculum-recommendations'
    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    fold_dir = os.path.join('/kaggle/input/k12-utils/cv_data')
    
    seed = 17
    n_fold = 4
    fold = 0
    
    special_tokens = [
        '[TITLE]',
        '[DESCRIPTION]',
        '[video]',
        '[document]',
        '[html5]',
        '[exercise]',
        '[audio]',
    ]
    
    model_dir = '/kaggle/input/lecr-ensemble-data1/sentence-transformers-all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir, 'tokenizer'), padding=True)
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
    config = AutoConfig.from_pretrained(os.path.join(model_dir, 'config'), output_hidden_states = True)
    model = AutoModel.from_pretrained(os.path.join(model_dir, 'model'), config=config)
    model.resize_token_embeddings(len(tokenizer))
    
    max_len = 256
    n_nearest = 100
    
    print_freq = 200
    num_workers = 2
    model_name = 'all-MiniLM-L6-v2'
    gradient_checkpointing = False
    num_cycles = 0.5
    warmup_ratio = 0.1
    epochs = 6
    encoder_lr = 2e-5
    decoder_lr = 2e-4
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 200
    weight_decay = 0.01
    max_grad_norm = 0.012
    seed = 11
    no_of_gpu = 1

In [3]:
def prepare_input(topic, content, cfg):
    topic = topic_mapping[topic]
    content = content_mapping[content]
#     print(topic, content)
    inputs = cfg.tokenizer.encode_plus(
        topic, content,
        return_tensors = None, 
        add_special_tokens = True, 
        max_length = cfg.max_len,
        pad_to_max_length = True,
        padding = 'max_length',
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [4]:
class RetrievalDataset(Dataset):
    def __init__(self, df, cfg):
        self.cfg = cfg
        self.topic = df['topic_id'].values
        self.content = df['content_id'].values
        self.labels = df['target'].values
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, item):
        inputs = prepare_input(self.topic[item], self.content[item], self.cfg)
        label = torch.tensor(self.labels[item], dtype = torch.float)
        return inputs, label

In [5]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class RetrivalModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = cfg.config
        self.config.hidden_dropout = 0.0
        self.config.hidden_dropout_prob = 0.0
        self.config.attention_dropout = 0.0
        self.config.attention_probs_dropout_prob = 0.0
        self.model = cfg.model
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        feature = self.pool(last_hidden_state, inputs['attention_mask'])
        return feature
    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [6]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [7]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, cfg, accelerator):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, target) in enumerate(train_loader):
        inputs = collate(inputs)
        # for k, v in inputs.items():
        #     inputs[k] = v.to(accelerator.device)
        # target = target.to(accelerator.device)
        batch_size = target.size(0)
        
#         print(batch_size)
#         print(inputs['input_ids'].shape)
#         print(target.shape)
        
        # forward pass
        y_preds = model(inputs)
        
        # loss
        loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)

        # backward pass
        accelerator.backward(loss)
        grad_norm = accelerator.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()
        end = time.time()
        if step % cfg.print_freq == 0 or step == (len(train_loader) - 1):
            accelerator.print('Epoch: [{0}][{1}/{2}] '
                              'Elapsed {remain:s} '
                              'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                              'Grad: {grad_norm:.4f}  '
                              'LR: {lr:.8f}  '
                              .format(epoch + 1, 
                                      step, 
                                      len(train_loader), 
                                      remain = timeSince(start, float(step + 1) / len(train_loader)),
                                      loss = losses,
                                      grad_norm = grad_norm if grad_norm is not None else 0,
                                      lr = scheduler.get_lr()[0]))
    return losses.avg

In [8]:
def valid_fn(valid_loader, model, criterion, cfg, accelerator):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, target) in enumerate(valid_loader):
        # pass data to GPU
        inputs = collate(inputs)
        # for k, v in inputs.items():
        #     inputs[k] = v.to(accelerator.device)
        # target = target.to(accelerator.device)
        batch_size = target.size(0)
        
        # forward pass
        with torch.no_grad():
            y_preds = model(inputs)
            
        # losses
        loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        
        preds.append(y_preds.sigmoid().squeeze().to('cpu').numpy().reshape(-1))
        end = time.time()
        if step % cfg.print_freq == 0 or step == (len(valid_loader) - 1):
            accelerator.print('EVAL: [{0}/{1}] '
                              'Elapsed {remain:s} '
                              'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                              .format(step, 
                                      len(valid_loader),
                                      loss = losses,
                                      remain = timeSince(start, float(step + 1) / len(valid_loader))))
    predictions = np.concatenate(preds, axis = 0)
    return losses.avg, predictions

In [9]:
def init_model(accelerator, cfg):
    with accelerator.main_process_first():
        model = RetrivalModel(cfg)
    return model

In [10]:
cfg = CFG

In [11]:
def get_train_test_fold(fold, mapping, cfg):
    train_topic = set(pd.read_csv(os.path.join(cfg.fold_dir, f'fold_{fold}', 'train_topic.csv'))['topic_id'])
    test_topic = set(pd.read_csv(os.path.join(cfg.fold_dir, f'fold_{fold}', 'test_topic.csv'))['topic_id'])
    train_mapping = mapping[mapping['topic_id'].isin(train_topic)].reset_index(drop=True)
    test_mapping = mapping[mapping['topic_id'].isin(test_topic)].reset_index(drop=True)
    return train_mapping, test_mapping

In [12]:
def seed_everything(cfg):
    random.seed(cfg.seed)
    os.environ['PYTHONHASHSEED'] = str(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    torch.cuda.manual_seed(cfg.seed)
    # torch.backends.cudnn.deterministic = True
    set_seed(cfg.seed)

In [13]:
seed_everything(CFG)

In [14]:
topic_field = pd.read_csv('/kaggle/input/finetune-dataset/topic_field.csv', index_col=0)
content_field = pd.read_csv('/kaggle/input/finetune-dataset/content_field.csv', index_col=0)
mapping = pd.read_csv('/kaggle/input/finetune-dataset/train_mapping.csv', index_col=0)
topic_mapping = dict(zip(topic_field['id'], topic_field['field']))
content_mapping = dict(zip(content_field['id'], content_field['field']))
train_mapping, test_mapping = get_train_test_fold(0, mapping, CFG)

In [15]:
accelerator = Accelerator(mixed_precision='fp16')
accelerator.print(' ')
train, valid = get_train_test_fold(0, mapping, CFG)
valid_labels = valid['target'].values
train_dataset = RetrievalDataset(train, cfg)
valid_dataset = RetrievalDataset(valid, cfg)
train_loader = DataLoader(
    train_dataset, 
    batch_size = cfg.batch_size, 
    shuffle = True, 
    num_workers = cfg.num_workers, 
    pin_memory = True, 
    drop_last = True
)
valid_loader = DataLoader(
    valid_dataset, 
    batch_size = cfg.batch_size, 
    shuffle = False, 
    num_workers = cfg.num_workers, 
    pin_memory = True, 
    drop_last = False
)
model = init_model(accelerator, cfg)

def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
        'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
        'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters
optimizer_parameters = get_optimizer_params(
    model, 
    encoder_lr = cfg.encoder_lr, 
    decoder_lr = cfg.decoder_lr,
    weight_decay = cfg.weight_decay
)
optimizer = AdamW(
    optimizer_parameters, 
    lr = cfg.encoder_lr, 
    eps = cfg.eps, 
    betas = cfg.betas
)
num_train_steps = int(len(train) / cfg.batch_size * cfg.epochs)
num_warmup_steps = num_train_steps * cfg.warmup_ratio
scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = num_warmup_steps, 
    num_training_steps = num_train_steps, 
    num_cycles = cfg.num_cycles
    )
criterion = nn.BCEWithLogitsLoss(reduction = "mean")
model, optimizer, train_loader, valid_loader = accelerator.prepare(
    model, optimizer, train_loader, valid_loader
)
accelerator.print(f"Training on {len(train)} samples, Validating on {len(valid)} samples")

 
Training on 1002270 samples, Validating on 79032 samples


In [16]:
def f2_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

def get_best_threshold(x_val, val_predictions, correlations):
    best_score = 0
    best_threshold = None
    for thres in np.arange(0.00001, 0.5, 0.001):
        x_val['predictions'] = np.where(val_predictions > thres, 1, 0)
        x_val1 = x_val[x_val['predictions'] == 1]
        x_val1 = x_val1.groupby(['topic_id'])['content_id'].unique().reset_index()
        x_val1['content_id'] = x_val1['content_id'].apply(lambda x: ' '.join(x))
        x_val1.columns = ['topic_id', 'predictions']
        x_val0 = pd.Series(x_val['topic_id'].unique())
        x_val0 = x_val0[~x_val0.isin(x_val1['topic_id'])]
        x_val0 = pd.DataFrame({'topic_id': x_val0.values, 'predictions': ""})
        x_val_r = pd.concat([x_val1, x_val0], axis = 0, ignore_index = True)
        x_val_r = x_val_r.merge(correlations, how = 'left', on = 'topic_id')
#         print(x_val_r)
        score = f2_score(x_val_r['content_ids'], x_val_r['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold

In [17]:
correlations = pd.read_csv(cfg.correlation_dir)
best_score = 0
for epoch in range(cfg.epochs):
    start_time = time.time()
    ################
    #     Train    #
    ################
    avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, cfg, accelerator)
    
    ################
    #  Validation  #
    ################
    avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, cfg, accelerator)

    # Save the model
    accelerator.wait_for_everyone() 
    model = accelerator.unwrap_model(model)

    # Compute f2_score
    score, threshold = get_best_threshold(valid, predictions, correlations)
    elapsed = time.time() - start_time
    accelerator.print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
    accelerator.print(f'Epoch {epoch+1} - Score: {score:.6f} - Threshold: {threshold:.6f}')
    if score > best_score:
        best_score = score
        accelerator.print(f'Epoch {epoch+1} - Save Best Score: {best_score:.6f} Model')

        val_predictions = predictions
        
    torch.save(
        {'model': model.state_dict(), 'predictions': predictions}, 
        f"{cfg.model_name.replace('/', '-')}_fold{cfg.fold}_{cfg.seed}_{score}.pth"
        )

Epoch: [1][0/5011] Elapsed 0m 2s (remain 191m 52s) Loss: 0.6913(0.6913) Grad: 0.0000  LR: 0.00000001  
Epoch: [1][200/5011] Elapsed 2m 58s (remain 71m 17s) Loss: 0.6362(0.6698) Grad: 0.0000  LR: 0.00000134  
Epoch: [1][400/5011] Elapsed 5m 55s (remain 68m 3s) Loss: 0.5889(0.6258) Grad: 0.0000  LR: 0.00000267  
Epoch: [1][600/5011] Elapsed 8m 51s (remain 65m 0s) Loss: 0.5739(0.6061) Grad: 0.0000  LR: 0.00000400  
Epoch: [1][800/5011] Elapsed 11m 47s (remain 62m 0s) Loss: 0.5684(0.5954) Grad: 0.0000  LR: 0.00000533  
Epoch: [1][1000/5011] Elapsed 14m 44s (remain 59m 2s) Loss: 0.5578(0.5855) Grad: 0.0000  LR: 0.00000666  
Epoch: [1][1200/5011] Elapsed 17m 40s (remain 56m 5s) Loss: 0.4824(0.5768) Grad: 0.0000  LR: 0.00000799  
Epoch: [1][1400/5011] Elapsed 20m 37s (remain 53m 7s) Loss: 0.5504(0.5684) Grad: 0.0000  LR: 0.00000932  
Epoch: [1][1600/5011] Elapsed 23m 33s (remain 50m 10s) Loss: 0.4613(0.5603) Grad: 0.0000  LR: 0.00001065  
Epoch: [1][1800/5011] Elapsed 26m 29s (remain 47m 13s)

  import sys


Epoch 1 - avg_train_loss: 0.4488  avg_val_loss: 0.4690  time: 4682s
Epoch 1 - Score: 0.725300 - Threshold: 0.027010
Epoch 1 - Save Best Score: 0.725300 Model
Epoch: [2][0/5011] Elapsed 0m 1s (remain 104m 36s) Loss: 0.3004(0.3004) Grad: 0.0000  LR: 0.00001973  
Epoch: [2][200/5011] Elapsed 2m 57s (remain 70m 49s) Loss: 0.2598(0.3244) Grad: 0.0000  LR: 0.00001967  
Epoch: [2][400/5011] Elapsed 5m 54s (remain 67m 50s) Loss: 0.3537(0.3215) Grad: 0.0000  LR: 0.00001961  
Epoch: [2][600/5011] Elapsed 8m 50s (remain 64m 52s) Loss: 0.2887(0.3196) Grad: 0.0000  LR: 0.00001955  
Epoch: [2][800/5011] Elapsed 11m 46s (remain 61m 54s) Loss: 0.3740(0.3186) Grad: 0.0000  LR: 0.00001947  
Epoch: [2][1000/5011] Elapsed 14m 43s (remain 58m 57s) Loss: 0.2756(0.3168) Grad: 0.0000  LR: 0.00001940  
Epoch: [2][1200/5011] Elapsed 17m 39s (remain 56m 0s) Loss: 0.3351(0.3148) Grad: 0.0000  LR: 0.00001932  
Epoch: [2][1400/5011] Elapsed 20m 35s (remain 53m 3s) Loss: 0.2839(0.3129) Grad: 0.0000  LR: 0.00001923  

In [18]:
torch.cuda.empty_cache()
gc.collect()
# Get best threshold
best_score, best_threshold = get_best_threshold(valid, val_predictions, correlations)
accelerator.print(f'Our CV score is {best_score} using a threshold of {best_threshold}')

  import sys


Our CV score is 0.7661 using a threshold of 0.00701


In [19]:
# topic_field = pd.read_csv('/kaggle/input/finetune-dataset/topic_field.csv', index_col=0)
# content_field = pd.read_csv('/kaggle/input/finetune-dataset/content_field.csv', index_col=0)
# mapping = pd.read_csv('/kaggle/input/finetune-dataset/train_mapping.csv', index_col=0)
# topic_mapping = dict(zip(topic_field['id'], topic_field['field']))
# content_mapping = dict(zip(content_field['id'], content_field['field']))
# train_mapping, test_mapping = get_train_test_fold(0, mapping, CFG)