In [1]:
import os
import gc
import psutil
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

pd.set_option('display.max_rows', 100)
%env TOKENIZERS_PARALLELISM=false
RE_EXTRACT_EMBEDDINGS = True

env: TOKENIZERS_PARALLELISM=false


In [2]:
def print_memory_usage():
    # Getting % usage of virtual_memory ( 3rd field)
    print('RAM memory % used:', psutil.virtual_memory()[2])
    # Getting usage of virtual_memory in GB ( 4th field)
    print('RAM Used (GB):', psutil.virtual_memory()[3]/1000000000)

In [3]:
class CFG:
    input_dir = '/kaggle/input/k12-utils/cv_data/fold_0'

    content_dir = os.path.join(input_dir, 'content.csv')
    correlation_dir = os.path.join(input_dir, 'correlations.csv')
    submission_dir = os.path.join(input_dir, 'sample_submission.csv')
    topic_dir = os.path.join(input_dir, 'topics.csv')
    
    seed = 17
    n_fold = 4
    
    device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [4]:
print_memory_usage()

RAM memory % used: 6.3
RAM Used (GB): 0.70688768


In [5]:
# model to get embeddings
class CFG1:
    model_dir = '/kaggle/input/lecr-ensemble-data1/sentence-transformers-all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_dir, 'tokenizer'))
    model = AutoModel.from_pretrained(os.path.join(model_dir, 'model'))
    max_len = 512
    n_nearest = 50
    device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [6]:
def prepare_topic(cfg):
    topic = pd.read_csv(cfg.topic_dir)
    topic['title'] = topic['title'].fillna(topic["description"]).fillna('')
    return topic
    
def prepare_content(cfg):
    content = pd.read_csv(cfg.content_dir)
    content['title'] = content['title'].fillna(content["description"]).fillna(content['text']).fillna('')
    return content

def prepare_correlation(cfg):
    correlation = pd.read_csv(cfg.correlation_dir)
    correlation['content_id']=correlation['content_ids'].apply(lambda x: x.split())
    correlation = correlation.explode('content_id').drop(columns='content_ids')
    return correlation

def merge_train_data(cfg, content_cols=['id', 'title'], topic_cols=['id', 'title']):
    dup_cols = ['id', 'title', 'description', 'language']
    topic = prepare_topic(cfg)
    content = prepare_content(cfg)
    correlation = prepare_correlation(cfg)
    correlation = (
        correlation
        .merge(content[content_cols], left_on='content_id', right_on='id', how='left')
        .drop(columns='id')
        .rename(columns=dict(zip(dup_cols, ['content_'+x for x in dup_cols])))
        .merge(topic[topic_cols], left_on='topic_id', right_on='id', how='left')
        .drop(columns='id')
        .rename(columns=dict(zip(dup_cols, ['topic_'+x for x in dup_cols])))
    )
    return correlation

In [7]:
print_memory_usage()

RAM memory % used: 7.4
RAM Used (GB): 0.903380992


## Language Pair

In [8]:
def prepare_language_match(cfg, mode='train'):
    topic = pd.read_csv(cfg.topic_dir)[['id', 'language']]
    content = pd.read_csv(cfg.content_dir)[['id', 'language']]
    if mode == 'train':
        corr = pd.read_csv(cfg.correlation_dir)
    elif mode == 'valid':
        corr = pd.read_csv(cfg.submission_dir)
    topic = topic.merge(corr, left_on='id', right_on='topic_id')[['id', 'language']]
    match_dict = {}
    for language in topic['language'].unique():
        match_dict[language] = (topic.query('language==@language')[['id']], content.query('language==@language')[['id']])
    return match_dict

In [9]:
def prepare_match_features(topic, content, cfg):
    topic = topic[['id']].merge(prepare_topic(cfg)[['id', 'title']], on='id', how='left')
    content = content[['id']].merge(prepare_content(cfg)[['id', 'title']], on='id', how='left')
    return topic, content

In [10]:
%%time
topic_content_match = prepare_language_match(CFG)

CPU times: user 10.3 s, sys: 1.56 s, total: 11.8 s
Wall time: 20.8 s


In [11]:
print_memory_usage()

RAM memory % used: 16.3
RAM Used (GB): 2.386935808


## Get Embeddings from Model

In [12]:
# tokenizer = CFG1.tokenizer
# model = CFG1.model
# model.to(CFG1.device)
# print('models paramters:', sum(p.numel() for p in model.parameters()))
# params_count = sum(1 for x in model.parameters())
# for i, (name, param) in enumerate(model.named_parameters()):
#     if i > params_count - 10:
#         print(name, param.requires_grad, param.shape)

In [13]:
def get_embeddings(data:pd.Series, cfg):
    tokenizer = cfg.tokenizer
    model = cfg.model
    model= nn.DataParallel(model)
    model.to(cfg.device)
    data = list(data.fillna(''))
    gap = 500
    
    token_outs = []
    # uniform dynamic padding
    for i in tqdm(range(0, len(data), gap), desc='tokenization'):
        batch_tokens=tokenizer(data[i:i+gap], truncation=True, padding=True, return_tensors='pt')
        token_outs.append(batch_tokens)
        
    outs = []
    model.to(cfg.device)
    # ??
    model.eval()
    
    with torch.no_grad():
        for batch_tokens in tqdm(token_outs, total=len(token_outs), desc='model output'):
            inputs = batch_tokens['input_ids'].to(cfg.device)
            attention_mask = batch_tokens['attention_mask'].to(cfg.device)
            out = model(inputs, attention_mask=attention_mask).last_hidden_state.mean(1)
            outs.append(out.cpu().numpy())
    return np.concatenate(outs)

In [14]:
print_memory_usage()

RAM memory % used: 16.3
RAM Used (GB): 2.387038208


**Calculate and Save Embeddings**

## TODO: Add translation for non-en languages

In [15]:
%%time
mode = 'train'
if RE_EXTRACT_EMBEDDINGS:
    for k, (topic, content) in topic_content_match.items():
        
        topic, content = prepare_match_features(topic, content, CFG)
        p = f'/kaggle/working/embeddings/{mode}/{k}/'
        path = Path(p).expanduser()
        path.mkdir(parents=True, exist_ok=True)
        
        print(f'language: {k}')
        topic_embeddings = get_embeddings(topic['title'], CFG1)
        print(f'topic_embeddings shape: {topic_embeddings.shape}')
        np.save(path/'topic_embeddings.npy', topic_embeddings)
        del topic_embeddings, topic
        gc.collect()
        torch.cuda.empty_cache()
        !nvidia-smi
        
        content_embeddings = get_embeddings(content['title'], CFG1)
        print(f'content_embeddings shape: {content_embeddings.shape}')
        np.save(path/'content_embeddings.npy', content_embeddings)
        del content_embeddings, content
        gc.collect()
        torch.cuda.empty_cache()
        !nvidia-smi
        
        !ls /kaggle/embeddings/train/

language: bg


tokenization:   0%|          | 0/5 [00:00<?, ?it/s]

model output:   0%|          | 0/5 [00:00<?, ?it/s]

topic_embeddings shape: (2420, 384)
Tue Feb  7 18:05:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    33W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   43C    P0    33W /  70W |   1258

tokenization:   0%|          | 0/13 [00:00<?, ?it/s]

model output:   0%|          | 0/13 [00:00<?, ?it/s]

content_embeddings shape: (6050, 384)
Tue Feb  7 18:05:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    34W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   45C    P0    33W /  70W |   12

tokenization:   0%|          | 0/7 [00:00<?, ?it/s]

model output:   0%|          | 0/7 [00:00<?, ?it/s]

topic_embeddings shape: (3166, 384)
Tue Feb  7 18:05:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    34W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   46C    P0    33W /  70W |   1258

tokenization:   0%|          | 0/21 [00:00<?, ?it/s]

model output:   0%|          | 0/21 [00:00<?, ?it/s]

content_embeddings shape: (10435, 384)
Tue Feb  7 18:05:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    34W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   48C    P0    34W /  70W |   1

tokenization:   0%|          | 0/48 [00:00<?, ?it/s]

model output:   0%|          | 0/48 [00:00<?, ?it/s]

topic_embeddings shape: (23828, 384)
Tue Feb  7 18:05:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    35W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   51C    P0    35W /  70W |   125

tokenization:   0%|          | 0/132 [00:00<?, ?it/s]

model output:   0%|          | 0/132 [00:00<?, ?it/s]

content_embeddings shape: (65939, 384)
Tue Feb  7 18:06:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    36W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   56C    P0    36W /  70W |   1

tokenization:   0%|          | 0/4 [00:00<?, ?it/s]

model output:   0%|          | 0/4 [00:00<?, ?it/s]

topic_embeddings shape: (1809, 384)
Tue Feb  7 18:06:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    31W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   55C    P0    33W /  70W |   1258

tokenization:   0%|          | 0/8 [00:00<?, ?it/s]

model output:   0%|          | 0/8 [00:00<?, ?it/s]

content_embeddings shape: (3677, 384)
Tue Feb  7 18:06:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    36W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   56C    P0    36W /  70W |   12

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

topic_embeddings shape: (672, 384)
Tue Feb  7 18:06:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    29W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   55C    P0    28W /  70W |   1258M

tokenization:   0%|          | 0/8 [00:00<?, ?it/s]

model output:   0%|          | 0/8 [00:00<?, ?it/s]

content_embeddings shape: (3849, 384)
Tue Feb  7 18:06:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P0    36W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   57C    P0    36W /  70W |   12

tokenization:   0%|          | 0/7 [00:00<?, ?it/s]

model output:   0%|          | 0/7 [00:00<?, ?it/s]

topic_embeddings shape: (3173, 384)
Tue Feb  7 18:07:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    37W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   58C    P0    36W /  70W |   1258

tokenization:   0%|          | 0/15 [00:00<?, ?it/s]

model output:   0%|          | 0/15 [00:00<?, ?it/s]

content_embeddings shape: (7418, 384)
Tue Feb  7 18:07:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    37W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   60C    P0    37W /  70W |   12

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (66, 384)
Tue Feb  7 18:07:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    29W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (285, 384)
Tue Feb  7 18:07:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    29W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |   125

tokenization:   0%|          | 0/7 [00:00<?, ?it/s]

model output:   0%|          | 0/7 [00:00<?, ?it/s]

topic_embeddings shape: (3034, 384)
Tue Feb  7 18:07:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    37W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   60C    P0    37W /  70W |   1258

tokenization:   0%|          | 0/22 [00:00<?, ?it/s]

model output:   0%|          | 0/22 [00:00<?, ?it/s]

content_embeddings shape: (10682, 384)
Tue Feb  7 18:07:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    38W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   62C    P0    36W /  70W |   1

tokenization:   0%|          | 0/20 [00:00<?, ?it/s]

model output:   0%|          | 0/20 [00:00<?, ?it/s]

topic_embeddings shape: (9949, 384)
Tue Feb  7 18:07:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0    38W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   63C    P0    37W /  70W |   1258

tokenization:   0%|          | 0/62 [00:00<?, ?it/s]

model output:   0%|          | 0/62 [00:00<?, ?it/s]

content_embeddings shape: (30844, 384)
Tue Feb  7 18:08:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    41W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    39W /  70W |   1

tokenization:   0%|          | 0/5 [00:00<?, ?it/s]

model output:   0%|          | 0/5 [00:00<?, ?it/s]

topic_embeddings shape: (2082, 384)
Tue Feb  7 18:08:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    35W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   68C    P0    39W /  70W |   1258

tokenization:   0%|          | 0/3 [00:00<?, ?it/s]

model output:   0%|          | 0/3 [00:00<?, ?it/s]

content_embeddings shape: (1447, 384)
Tue Feb  7 18:08:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    41W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    39W /  70W |   12

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (239, 384)
Tue Feb  7 18:08:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    31W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   67C    P0    30W /  70W |   1258M

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

content_embeddings shape: (999, 384)
Tue Feb  7 18:08:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    34W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   68C    P0    35W /  70W |   125

tokenization:   0%|          | 0/3 [00:00<?, ?it/s]

model output:   0%|          | 0/3 [00:00<?, ?it/s]

topic_embeddings shape: (1373, 384)
Tue Feb  7 18:09:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    34W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   68C    P0    39W /  70W |   1258

tokenization:   0%|          | 0/9 [00:00<?, ?it/s]

model output:   0%|          | 0/9 [00:00<?, ?it/s]

content_embeddings shape: (4042, 384)
Tue Feb  7 18:09:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    41W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    40W /  70W |   12

tokenization:   0%|          | 0/4 [00:00<?, ?it/s]

model output:   0%|          | 0/4 [00:00<?, ?it/s]

topic_embeddings shape: (1731, 384)
Tue Feb  7 18:09:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    41W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    40W /  70W |   1258

tokenization:   0%|          | 0/6 [00:00<?, ?it/s]

model output:   0%|          | 0/6 [00:00<?, ?it/s]

content_embeddings shape: (2513, 384)
Tue Feb  7 18:09:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    41W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    40W /  70W |   12

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (110, 384)
Tue Feb  7 18:09:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    30W /  70W |   1258M

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (206, 384)
Tue Feb  7 18:09:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    30W /  70W |   125

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

topic_embeddings shape: (722, 384)
Tue Feb  7 18:09:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   68C    P0    31W /  70W |   1258M

tokenization:   0%|          | 0/3 [00:00<?, ?it/s]

model output:   0%|          | 0/3 [00:00<?, ?it/s]

content_embeddings shape: (1300, 384)
Tue Feb  7 18:09:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    35W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    40W /  70W |   12

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (21, 384)
Tue Feb  7 18:10:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    30W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (188, 384)
Tue Feb  7 18:10:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    30W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (51, 384)
Tue Feb  7 18:10:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (326, 384)
Tue Feb  7 18:10:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (40, 384)
Tue Feb  7 18:10:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (184, 384)
Tue Feb  7 18:10:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (104, 384)
Tue Feb  7 18:10:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258M

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

content_embeddings shape: (505, 384)
Tue Feb  7 18:10:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (126, 384)
Tue Feb  7 18:11:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258M

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

content_embeddings shape: (641, 384)
Tue Feb  7 18:11:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    35W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (88, 384)
Tue Feb  7 18:11:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/2 [00:00<?, ?it/s]

model output:   0%|          | 0/2 [00:00<?, ?it/s]

content_embeddings shape: (501, 384)
Tue Feb  7 18:11:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1724MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (28, 384)
Tue Feb  7 18:11:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   69C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (319, 384)
Tue Feb  7 18:11:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (44, 384)
Tue Feb  7 18:11:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (216, 384)
Tue Feb  7 18:11:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (33, 384)
Tue Feb  7 18:12:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (495, 384)
Tue Feb  7 18:12:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (54, 384)
Tue Feb  7 18:12:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (245, 384)
Tue Feb  7 18:12:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   125

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

topic_embeddings shape: (26, 384)
Tue Feb  7 18:12:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   1258Mi

tokenization:   0%|          | 0/1 [00:00<?, ?it/s]

model output:   0%|          | 0/1 [00:00<?, ?it/s]

content_embeddings shape: (225, 384)
Tue Feb  7 18:12:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   125

In [16]:
!nvidia-smi

Tue Feb  7 18:12:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    32W /  70W |   1732MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   70C    P0    31W /  70W |   1258MiB / 15109MiB |      0%      Defaul

```
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<timed exec> in <module>

/tmp/ipykernel_23/1363623706.py in get_embeddings(data, cfg)
     21             inputs = batch_tokens['input_ids'].to(cfg.device)
     22             attention_mask = batch_tokens['attention_mask'].to(cfg.device)
---> 23             out = model(inputs, attention_mask=attention_mask).last_hidden_state.mean(1)
     24             outs.append(out.cpu().numpy())
     25     return np.concatenate(outs)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
   1026             output_attentions=output_attentions,
   1027             output_hidden_states=output_hidden_states,
-> 1028             return_dict=return_dict,
   1029         )
   1030         sequence_output = encoder_outputs[0]

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
    612                     encoder_attention_mask,
    613                     past_key_value,
--> 614                     output_attentions,
    615                 )
    616 

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
    496             head_mask,
    497             output_attentions=output_attentions,
--> 498             past_key_value=self_attn_past_key_value,
    499         )
    500         attention_output = self_attention_outputs[0]

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
    428             encoder_attention_mask,
    429             past_key_value,
--> 430             output_attentions,
    431         )
    432         attention_output = self.output(self_outputs[0], hidden_states)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
    325 
    326         # Take the dot product between "query" and "key" to get the raw attention scores.
--> 327         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    328 
    329         if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":

RuntimeError: CUDA out of memory. Tried to allocate 11.72 GiB (GPU 0; 15.90 GiB total capacity; 4.49 GiB already allocated; 10.63 GiB free; 4.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```

In [17]:
print_memory_usage()

RAM memory % used: 35.1
RAM Used (GB): 5.43301632


In [18]:
!tree /kaggle/working/

[01;34m/kaggle/working/[00m
├── __notebook__.ipynb
└── [01;34membeddings[00m
    └── [01;34mtrain[00m
        ├── [01;34mar[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mas[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mbg[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mbn[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34men[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mes[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mfr[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mgu[00m
        │   ├── content_embeddings.npy
        │   └── topic_embeddings.npy
        ├── [01;34mhi[00m
   

## TODOs

### Annoy

In [19]:
# from annoy import AnnoyIndex
# import random

# f = 40  # Length of item vector that will be indexed

# t = AnnoyIndex(f, 'angular')
# for i in range(1000):
#     v = [random.gauss(0, 1) for z in range(f)]
#     t.add_item(i, v)

# t.build(10) # 10 trees
# t.save('test.ann')

# # ...

# u = AnnoyIndex(f, 'angular')
# u.load('test.ann') # super fast, will just mmap the file
# print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors

In [20]:
# from annoy import AnnoyIndex
# content_forest = AnnoyIndex(content_embeddings.shape[1], metric='angular')
# for i, item in tqdm(enumerate(content_embeddings), total=len(content_embeddings)):
#     content_forest.add_item(i, item)
# content_forest.build(100)

## Fuzzy?

### KNN?

## More

In [21]:
# df_content = pd.read_csv(CFG.content_dir)
# df_corr = pd.read_csv(CFG.correlation_dir)
# df_sub = pd.read_csv(CFG.submission_dir)
# df_topic = pd.read_csv(CFG.topic_dir)

In [22]:
# %%time
# content_cols = ['id', 'title', 'description', 'language']
# topic_cols = ['id', 'title', 'description', 'channel', 'category', 'language']
# df_train = merge_train_data(CFG, content_cols, topic_cols)
# df_train