In [1]:
!pip install -q numpy pandas tqdm pymystem3 tensorflow torch transformers qdrant-client ranx

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.12.2 requires cupy-cuda115, which is not installed.
cudf 21.12.2 requires cupy-cuda115, which is not installed.
xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.20.3 which is incompatible.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.79.0 which is incompatible.
tfx-bsl 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
tensorflow-transform 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
librosa 0.10.0 requires soundfile>=0.12.1, but you have soundfile 0.11.0 which is incompatible.
featuretools 1.11.1 requires numpy>=1.21.0, but you have numpy 1.20.3 which is incompatible.

In [2]:
import shutil

import numpy as np
import pandas as pd

import torch
from torch import nn, optim

from transformers import AutoTokenizer, AutoModel
from transformers import logging
logging.set_verbosity_error()

from functools import partial

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import CollectionStatus

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

from ranx import Qrels, Run, evaluate, compare

from tqdm import tqdm
from copy import deepcopy

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

DIR = '/kaggle/input/aaa-project-search/'

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
cuda:0


In [3]:
df = pd.read_hdf(DIR + 'search_relevance_dataset_v1.hdf', 'table')
df.drop(columns=['query_category_id', 'query_microcat_id', 'query_location_id'], inplace=True)

df.query_id = df.query_id.astype(str)
df.item_id = df.item_id.astype(str)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,Советские бутыли канистры 60-80-х СССР ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,Ваз 2108 СССР цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,Модели советских машин ваз 2102 почта М 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [4]:
# Create lemmatizer and stopwords list
mystem = Mystem() 
russian_stopwords = stopwords.words('russian')

# Preprocess function
def preprocess_text(text):
    text = ' '.join(text.lower().split())

#     tokens = mystem.lemmatize(text)
#     tokens = [token for token in tokens if token not in russian_stopwords\
#               and token != ' ' and token.strip() not in punctuation]

#     text = ' '.join(tokens)

    return text

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [5]:
df.query_text = df.query_text.apply(preprocess_text)
df.title = df.title.apply(preprocess_text)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,советские бутыли канистры 60-80-х ссср ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,ваз 2108 ссср цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,модели советских машин ваз 2102 почта м 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [6]:
client = QdrantClient(':memory:')

In [7]:
def index_dataset(client, df, model, vector_size, collection_name='collection'):
    client.recreate_collection(
        collection_name=collection_name, vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
    df = df.drop_duplicates(subset=['item_id'])

    points = []
    for row in tqdm(df.itertuples(), total=len(df)):
        points.append(
            PointStruct(
                id=row.Index,
                vector=list(map(float, model(row.title))), # list(map(float.. to make proper type
                payload={
                    'title': row.title, 'description': row.description, 'keywords': row.keywords, 'item_id': row.item_id
                },
            )
        )

    operation_info = client.upsert(collection_name=collection_name, wait=True, points=points)

    return operation_info

In [8]:
# random model
random_model = lambda text: np.random.randn(32)

# baseline model 1
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
model = AutoModel.from_pretrained('cointegrated/rubert-tiny2').to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

baseline_model_1 = partial(embed_bert_cls, model=model, tokenizer=tokenizer)

# baseline model 2
tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru').to(device)

def embed_labse_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

baseline_model_2 = partial(embed_labse_cls, model=model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/118M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [9]:
baseline_model_1('hello').shape, baseline_model_2('hello').shape # ((312,), (768,))

((312,), (768,))

In [10]:
class ProjectorModel(nn.Module):
    def __init__(self, model_name: str = 'cointegrated/rubert-tiny2', final_emb_size: int = 32):
        super().__init__()

        self.model_name = model_name
        self.final_emb_size = final_emb_size
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.backbone = AutoModel.from_pretrained(self.model_name, output_hidden_states=True).to(device)

        for n, p in self.backbone.named_parameters():
            p.requires_grad = False

        self.initial_emd_size = 312 if self.model_name == 'cointegrated/rubert-tiny2' else 768

        self.projection_head = nn.Sequential(
            nn.Linear(self.initial_emd_size, self.final_emb_size, device=device),            
        )

    def backbone_forward(self, text):
        t = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        model_output = self.backbone(**{k: v.to(self.backbone.device) for k, v in t.items()})

#         embeddings = model_output.pooler_output # torch.concat([model_output['hidden_states'][-i].mean(dim=1) for i in range(1, 5, 1)], dim=1)

        embeddings = model_output.last_hidden_state[:, 0, :] if self.model_name == 'cointegrated/rubert-tiny2' else model_output.pooler_output
        embeddings = nn.functional.normalize(embeddings)

        return embeddings

    def forward(self, text):
        embeddings = self.backbone_forward(text)

        compressed_embeddings = self.projection_head(embeddings)
        compressed_embeddings = nn.functional.normalize(compressed_embeddings)

        return compressed_embeddings

ProjectorModel()('some text here').shape, ProjectorModel()(['some text here', 'lalala']).shape

(torch.Size([1, 32]), torch.Size([2, 32]))

In [11]:
def train_eval(model, df_train, df_test, batch_size=64, n_epochs=5, lr=1e-5):
    model.train()

    queries = df_train.query_id.unique()
    df_train = df_train.set_index('query_id')

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

    for e in range(n_epochs):
        train_set = df_train.loc[np.random.choice(queries, size=len(queries), replace=False), ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(train_set) // batch_size

        for i in tqdm(range(num_batches)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = train_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), torch.tensor(batch[:, 2].astype(int), device=device)

            loss = 0.5 * ((y - (model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1))**2).mean() +\
                   0.5 * ((y - (model(x1) * model(x2)).sum(dim=1))**2).mean()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            optimizer.zero_grad()

        if e == 0:
            for n, p in model.named_parameters():
                if n[:5] != 'embed':
                    p.requires_grad = True

            optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

        model.eval()

        test_set = df_test.loc[:, ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(test_set) // batch_size

        pred1, pred2, true = [], [], []
        for i in tqdm(range(num_batches + 1)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = test_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), batch[:, 2].astype(int)

            pred1.extend((model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1).abs().cpu().detach().numpy())
            pred2.extend((model(x1) * model(x2)).sum(dim=1).abs().cpu().detach().numpy())
            true.extend(y)

        print(f'Epoch {e}:')
        print(f'Test ROC-AUC backbone: {roc_auc_score(true, pred1):.3f}')
        print(f'Test ROC-AUC full net: {roc_auc_score(true, pred2):.3f}')

        df_test['true'] = true
        df_test['pred1'] = pred1
        df_test['pred2'] = pred2

        print('Test ROC-AUC by query length:')
        display(df_test.groupby('query_len').apply(
            lambda d: [f'{roc_auc_score(d.true, d.pred1):.3f}', f'{roc_auc_score(d.true, d.pred2):.3f}']
        ))

        model.train()

In [12]:
df['query_len'] = df['query_text'].apply(lambda x: len(x.split(' ')))
df.loc[df.query_len > 3, 'query_len'] = 3
df.query_len.value_counts()

3    62688
2    62398
1    36343
Name: query_len, dtype: int64

In [20]:
df.title = 'объявление: ' + df.title
df.query_text = 'запрос: ' + df.query_text
df.head()

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target,query_len
0,274025,запрос: 2108 ссср,964140459,объявление: советские бутыли канистры 60-80-х ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0,2
1,274025,запрос: 2108 ссср,990433426,объявление: ваз 2108 ссср цвет салатовый 1/43 ...,красивый салатовый цвет\nвсе детали в наличии\...,модель,1,2
2,274025,запрос: 2108 ссср,994402610,объявление: модели советских машин ваз 2102 по...,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1,2
3,274025,запрос: 2108 ссср,1069135877,объявление: книга автомобиль ваз-2108 ссср,Спутник СССР ВАЗ устройство автомобиля 2108 19...,учебный литература,1,2
4,274025,запрос: 2108 ссср,1217235061,"объявление: плакаты, ваз 2108, ссср, 1988 год",Продам комплект плакатов по устройству автомоб...,учебный литература,0,2


In [21]:
%%time

kf = GroupKFold(n_splits=2)

for train_indices, test_indices in kf.split(X=df, groups=df.query_id):    
    df_train, df_test = df.loc[train_indices], df.loc[test_indices]
    print('-' * 80)
    print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
          'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
          'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
    print()

    runs = []
    for model_name in ['cointegrated/rubert-tiny2']: # ['cointegrated/rubert-tiny2', 'cointegrated/LaBSE-en-ru']:
        for final_emb_size in [64]:
            print('Processing...', model_name, final_emb_size)

            model = ProjectorModel(model_name=model_name, final_emb_size=final_emb_size)
            train_eval(model, df_train, df_test, batch_size=512, n_epochs=1+15, lr=1e-5)

            model.eval()

            final_model = lambda text: model(text).cpu().detach().numpy()[0]
            _ = index_dataset(client, df, final_model, vector_size=final_emb_size,
                              collection_name=f'train_collection_{model_name}_{final_emb_size}')

            qrels = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
            test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

            run_dict = {}
            for row in tqdm(test_examples.itertuples(), total=len(test_examples)):
                search_result = client.search(
                    collection_name=f'train_collection_{model_name}_{final_emb_size}', 
                    query_vector=list(map(float, final_model(row.query_text))), limit=50
                )
                run_dict[row.query_id] = {i.payload['item_id']: i.score for i in search_result}

            run = Run(run_dict) # print(evaluate(qrels, run, ['map@10', 'map@50', 'ndcg@10', 'ndcg@50']))
            runs.append(run)

            client.delete_collection(collection_name=f'train_collection_{model_name}_{final_emb_size}')
            print('Done')

    report = compare(
        qrels=qrels, runs=runs,
        metrics=['map@10', 'map@50', 'ndcg@10', 'ndcg@50'],
        n_permutations=1000, stat_test='student', max_p=0.01,
    )
    print(); print(report); print()

    break

# 0.752 - base
# 0.788-0.789 8-9 epochs - new loss
# 0.790 8-9 epochs - normalize both

# 0.789 (0.803 backbone) 9-11 epochs - 1/1 loss
# 0.791-0.792 (0.804-0.805 backbone) 10-11 epochs - 0.5 / 0.5 loss (batch 256 as I remember)

# 0.805 (0.806 backbone) - 10 epochs - lowercase preprocessing
# 0.805 (0.813 backbone) - 15 epochs - special tokens prior

--------------------------------------------------------------------------------
Train: (80714, 8) 6172    Test: (80715, 8) 6173    Intersection: set()

Processing... cointegrated/rubert-tiny2 64


100%|██████████| 157/157 [00:24<00:00,  6.53it/s]
100%|██████████| 158/158 [00:27<00:00,  5.80it/s]


Epoch 0:
Test ROC-AUC backbone: 0.667
Test ROC-AUC full net: 0.657
Test ROC-AUC by query length:


query_len
1    [0.687, 0.677]
2    [0.648, 0.633]
3    [0.702, 0.692]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.30it/s]
100%|██████████| 158/158 [00:26<00:00,  5.93it/s]


Epoch 1:
Test ROC-AUC backbone: 0.739
Test ROC-AUC full net: 0.734
Test ROC-AUC by query length:


query_len
1    [0.755, 0.746]
2    [0.705, 0.695]
3    [0.778, 0.772]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.34it/s]
100%|██████████| 158/158 [00:26<00:00,  5.89it/s]


Epoch 2:
Test ROC-AUC backbone: 0.760
Test ROC-AUC full net: 0.759
Test ROC-AUC by query length:


query_len
1    [0.769, 0.760]
2    [0.725, 0.722]
3    [0.796, 0.793]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.36it/s]
100%|██████████| 158/158 [00:26<00:00,  5.99it/s]


Epoch 3:
Test ROC-AUC backbone: 0.771
Test ROC-AUC full net: 0.769
Test ROC-AUC by query length:


query_len
1    [0.773, 0.765]
2    [0.736, 0.735]
3    [0.805, 0.802]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.33it/s]
100%|██████████| 158/158 [00:27<00:00,  5.81it/s]


Epoch 4:
Test ROC-AUC backbone: 0.780
Test ROC-AUC full net: 0.779
Test ROC-AUC by query length:


query_len
1    [0.781, 0.772]
2    [0.747, 0.747]
3    [0.812, 0.809]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.37it/s]
100%|██████████| 158/158 [00:26<00:00,  5.86it/s]


Epoch 5:
Test ROC-AUC backbone: 0.786
Test ROC-AUC full net: 0.784
Test ROC-AUC by query length:


query_len
1    [0.784, 0.775]
2    [0.755, 0.753]
3    [0.818, 0.815]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.37it/s]
100%|██████████| 158/158 [00:26<00:00,  5.88it/s]


Epoch 6:
Test ROC-AUC backbone: 0.789
Test ROC-AUC full net: 0.784
Test ROC-AUC by query length:


query_len
1    [0.782, 0.773]
2    [0.760, 0.755]
3    [0.822, 0.818]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.36it/s]
100%|██████████| 158/158 [00:26<00:00,  5.87it/s]


Epoch 7:
Test ROC-AUC backbone: 0.798
Test ROC-AUC full net: 0.795
Test ROC-AUC by query length:


query_len
1    [0.792, 0.782]
2    [0.768, 0.766]
3    [0.828, 0.824]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.38it/s]
100%|██████████| 158/158 [00:26<00:00,  5.87it/s]


Epoch 8:
Test ROC-AUC backbone: 0.801
Test ROC-AUC full net: 0.796
Test ROC-AUC by query length:


query_len
1    [0.791, 0.779]
2    [0.774, 0.770]
3    [0.831, 0.827]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.40it/s]
100%|██████████| 158/158 [00:26<00:00,  5.91it/s]


Epoch 9:
Test ROC-AUC backbone: 0.800
Test ROC-AUC full net: 0.794
Test ROC-AUC by query length:


query_len
1    [0.787, 0.775]
2    [0.774, 0.767]
3    [0.832, 0.827]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.32it/s]
100%|██████████| 158/158 [00:27<00:00,  5.81it/s]


Epoch 10:
Test ROC-AUC backbone: 0.806
Test ROC-AUC full net: 0.801
Test ROC-AUC by query length:


query_len
1    [0.794, 0.783]
2    [0.779, 0.774]
3    [0.835, 0.831]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.35it/s]
100%|██████████| 158/158 [00:27<00:00,  5.84it/s]


Epoch 11:
Test ROC-AUC backbone: 0.808
Test ROC-AUC full net: 0.803
Test ROC-AUC by query length:


query_len
1    [0.794, 0.783]
2    [0.782, 0.776]
3    [0.838, 0.833]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.32it/s]
100%|██████████| 158/158 [00:26<00:00,  6.02it/s]


Epoch 12:
Test ROC-AUC backbone: 0.809
Test ROC-AUC full net: 0.803
Test ROC-AUC by query length:


query_len
1    [0.794, 0.781]
2    [0.784, 0.775]
3    [0.840, 0.834]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.34it/s]
100%|██████████| 158/158 [00:27<00:00,  5.75it/s]


Epoch 13:
Test ROC-AUC backbone: 0.807
Test ROC-AUC full net: 0.798
Test ROC-AUC by query length:


query_len
1    [0.788, 0.774]
2    [0.783, 0.773]
3    [0.840, 0.833]
dtype: object

100%|██████████| 157/157 [00:46<00:00,  3.34it/s]
100%|██████████| 158/158 [00:26<00:00,  5.92it/s]


Epoch 14:
Test ROC-AUC backbone: 0.808
Test ROC-AUC full net: 0.800
Test ROC-AUC by query length:


query_len
1    [0.788, 0.774]
2    [0.783, 0.774]
3    [0.841, 0.834]
dtype: object

100%|██████████| 157/157 [00:47<00:00,  3.34it/s]
100%|██████████| 158/158 [00:26<00:00,  5.91it/s]


Epoch 15:
Test ROC-AUC backbone: 0.813
Test ROC-AUC full net: 0.805
Test ROC-AUC by query length:


query_len
1    [0.791, 0.779]
2    [0.788, 0.779]
3    [0.844, 0.838]
dtype: object

100%|██████████| 150268/150268 [12:03<00:00, 207.59it/s]
100%|██████████| 6173/6173 [12:41<00:00,  8.10it/s]


Done

#    Model      MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1        0.09      0.11      0.158      0.175

CPU times: user 51min 51s, sys: 8min 57s, total: 1h 49s
Wall time: 45min 24s


In [22]:
model.backbone.save_pretrained('model_tuned/')
model.tokenizer.save_pretrained('model_tuned/')

torch.save(model, 'model_tuned/model_tuned_full.pkl')

shutil.make_archive('model_tuned', 'zip', 'model_tuned/')

'/kaggle/working/model_tuned.zip'

In [23]:
model = torch.load('model_tuned/model_tuned_full.pkl', map_location=device)
model.eval()

ProjectorModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affin