In [13]:
import warnings
import pandas as pd
import numpy as np
import torch
import transformers

from tqdm.notebook import tqdm

# Register tqdm's pandas integration so that `.progress_apply` (used below
# when tokenizing the `name` column) is available and shows a progress bar.
tqdm.pandas()

In [14]:
# Load the train and test splits from the Kaggle input directory.
etl = pd.read_parquet(
    "/kaggle/input/hackathon-files-for-participants-ozon/train_data.parquet"
)
test_etl = pd.read_parquet(
    "/kaggle/input/hackathon-files-for-participants-ozon/test_data.parquet"
)

In [15]:
# Stack train and test rows into one frame so both are embedded in a single pass.
# NOTE(review): this rebinds `etl`, so re-running the cell appends the test
# rows again; restart from the loading cell before re-executing.
etl = pd.concat((etl, test_etl), axis=0)

In [16]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint loaded later.
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [17]:
# Upper bound on tokens per name; longer inputs are truncated to this length.
MAX_LENGTH = 512


def _encode_name(text):
    """Return the token ids (special tokens included) for one product name."""
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return encoding.input_ids


# One list of token ids per row of `etl`, computed with a progress bar.
tokenized = etl['name'].progress_apply(_encode_name)

  0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
# Length of the longest token sequence; every row is right-padded up to it.
max_len = tokenized.apply(len).max()

# Right-pad each sequence with zeros so all rows share the same length and
# can be stacked into one 2-D array.
padded_rows = []
for token_ids in tqdm(tokenized.values):
    n_missing = max_len - len(token_ids)
    padded_rows.append(token_ids + [0] * n_missing)
padded = np.array(padded_rows)

  0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
# Attach each row's variantid to its padded token ids, then move the id out of
# the index into a regular `variantid` column.
padded = pd.DataFrame(padded, index=etl['variantid']).reset_index()
# Make every column label a string (the token-position columns are integers).
padded = padded.rename(columns=str)

In [20]:
# Keep the variant ids aside; they are re-attached to the embeddings later.
index = padded['variantid']
# Back to a plain (fresh, writable) array holding only the token-id columns.
padded = padded.drop(columns='variantid').to_numpy(copy=True)

In [21]:
# 1 where a real token is present, 0 at the zero-padded positions.
attention_mask = (padded != 0).astype(int)

In [22]:
# CPU handle used to bring results back to host memory for NumPy conversion.
cpu_device = torch.device('cpu')

# Run the model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [23]:
# Load the pretrained BERT encoder and move its weights to the chosen device.
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
batch_size = 256
embeddings = []
n_rows = padded.shape[0]
# Iterate over batch start offsets. Unlike `range(n_rows // batch_size + 1)`,
# this never yields an empty trailing batch when n_rows is an exact multiple
# of batch_size, so the model is never called on a zero-row tensor.
for start in tqdm(range(0, n_rows, batch_size)):
    stop = start + batch_size
    # Build the batch and move it straight to the target device (GPU or CPU).
    batch = torch.LongTensor(padded[start:stop]).to(device)
    # Same for the matching slice of the attention mask.
    attention_mask_batch = torch.LongTensor(
        attention_mask[start:stop]
    ).to(device)
    # Inference only: disable gradient tracking for the forward pass.
    with torch.no_grad():
        batch_embeddings = bert_model(batch,
                                      attention_mask=attention_mask_batch)
    # Take the first model output and keep only sequence position 0 (the
    # [CLS] token added via add_special_tokens=True), then move it to the CPU
    # as a NumPy array for later concatenation.
    embeddings.append(batch_embeddings[0][:, 0, :].to(cpu_device).numpy())

  0%|          | 0/11 [00:00<?, ?it/s]

In [25]:
# Stack the per-batch embedding arrays row-wise into one matrix.
f = np.concatenate(embeddings, axis=0)

In [27]:
# Label each embedding row with its variantid, then turn that index into a
# regular `variantid` column.
to_save = pd.DataFrame(f, index=index)
to_save = to_save.reset_index()

In [28]:
# Collapse the embedding columns into a single `name_bert768` column whose
# cells each hold one row vector, placed next to the variantid column.
vectors = to_save.drop(columns='variantid').to_numpy()
name_vectors = pd.Series(list(vectors), name='name_bert768')
emb = pd.concat([to_save['variantid'], name_vectors], axis=1)

In [29]:
# Write the embeddings to disk with a fresh 0..n-1 index.
emb_out = emb.reset_index(drop=True)
emb_out.to_parquet('name_bert768.parquet')