In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:0

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Обработка данных

Здесь ведется предварительная очистка данных, удаление нулевых значений и приведение к требуемому формату датасета.

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.model_selection import train_test_split
def get_contradiction(source, entailment):
  result = entailment
  #ind = random.choice(source.index)
  while result == entailment:
    result = source.sample(n=1)["ANSWER"].item()
    #ind = random.randint(0, len(source))
  return result
def preprocess_data():
  base = pd.read_excel("/content/drive/MyDrive/mfc_bot/base.xlsx")
  #poor = pd.read_excel("/content/drive/MyDrive/mfc_bot/poor.xlsx")
  qa = pd.read_excel("/content/drive/MyDrive/mfc_bot/qa.xlsx")
  qa = qa[qa["ANSWER"].isna() == False]
  qa["ancor"] = qa['QUESTION']
  qa["entailment"] = qa["ANSWER"]
  qa["contradiction"] = qa["entailment"].apply(lambda x: get_contradiction(qa, x))
  qa["neutral"] = qa["entailment"].apply(lambda x: "")
  qa = qa.drop(columns=["QUESTION", "ANSWER"])
  return qa

Базовый класс датасета для трех экспериментов

In [None]:
class MFCDataset(Dataset):
  SPLIT_RANDOM_SEED = 42
  TEST_SIZE = 0.25

  def __init__(self, df, tokenizer, model_version=1, maxlen=512, train=True):
    super().__init__()
    self.df = df
    self.dataset = []
    for index, row in df.iterrows():
      ind = index
      item_pos = {}
      item_pos["first"] = row["ancor"]
      item_pos["second"] = row["entailment"]
      item_pos["label"] = 1
      item_neg = {}
      item_neg["first"] = row["ancor"]
      item_neg["second"] = row["contradiction"]
      item_neg["label"] = 0
      self.dataset.append(item_pos)
      self.dataset.append(item_neg)
    random.shuffle(self.dataset)
    self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.model_version = model_version

    if train:
      self.dataset = train_test_split(self.dataset, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[0]
    else:
      self.dataset = train_test_split(self.dataset, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[1]
  def __len__(self):
    return len(self.dataset)
  def __getitem__(self, item):
    if self.model_version == 1:
      first = self.dataset[item]["first"]
      first_toks = self.tokenizer([first], padding=True, truncation=True, max_length=int(self.maxlen/2), return_tensors='pt')
      second = self.dataset[item]["second"]
      second_toks = self.tokenizer([second], padding=True, truncation=True, max_length=int(self.maxlen/2-1), return_tensors='pt')
      result = {}
      common_len = len(first_toks['input_ids'][0].tolist() + second_toks['input_ids'][0].tolist()[1:])
      result['input_ids'] = first_toks['input_ids'][0].tolist() + second_toks['input_ids'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      result['token_type_ids'] = first_toks['token_type_ids'][0].tolist() + second_toks['token_type_ids'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      result['attention_mask'] = first_toks['attention_mask'][0].tolist() + second_toks['attention_mask'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      label = self.dataset[item]["label"]
      #return result, label
      return {
        'input_ids': torch.tensor(result['input_ids']).flatten(),
        'attention_mask': torch.tensor(result['attention_mask']).flatten(),
        'targets': torch.tensor(label, dtype=torch.long)
      }
    elif self.model_version == 2:
      first = self.dataset[item]["first"]
      first_toks = self.tokenizer([first], truncation=True, max_length=self.maxlen, pad_to_max_length=True,return_tensors='pt')
      second = self.dataset[item]["second"]
      second_toks = self.tokenizer([second], truncation=True, max_length=self.maxlen, pad_to_max_length=True,return_tensors='pt')
      label = self.dataset[item]["label"]
      return {
          "input_ids_1":first_toks["input_ids"].flatten(),
          "input_ids_2":second_toks["input_ids"].flatten(),
          "attention_mask_1":first_toks["attention_mask"].flatten(),
          "attention_mask_2":second_toks["attention_mask"].flatten(),
          "target":torch.tensor(label, dtype=torch.float)
      }

#SBERT

Базовые модели SentenceBERT для первых двух экспериментов.


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn

Эксперимент 1: на вход модели подаются 2 конкатенированных предложения, на выходе решается задача бинарной классификации, похожи предложения или нет.

In [None]:
class SBERT(nn.Module):
  def __init__(self, bert, n_classes):
    super(SBERT, self).__init__()
    self.bert = bert
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(outputs["pooler_output"])
    return self.out(output)

Эксперимент 2: обучение модели на основе оценки косинусного расстояния, минимизируется расстояние между положительными примерами и максимизируется между отрицательными.

In [None]:
class SBERT_v2(nn.Module):
  def __init__(self, bert1):
    super(SBERT_v2, self).__init__()
    self.bert1 = bert1
    #self.bert2 = bert2
    self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

  def forward(self, first, second):
    outputs_1 = self.bert1(
      input_ids=first[0],
      attention_mask=first[1]
    )
    outputs_2 = self.bert1(
      input_ids=second[0],
      attention_mask=second[1]
    )
    return self.cos(outputs_1["pooler_output"], outputs_2["pooler_output"])

#Training

##V.1

Ход первого эксперимента

In [None]:
from tqdm.notebook import tqdm

In [None]:
!pip install numba
from numba import cuda
Device = cuda.get_current_device()
Device.reset()



In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = SBERT(model, 2)

In [None]:
device = "cuda"
model.to(device)

SBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwi

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()
  losses = []
  correct_predictions = 0

  for d in tqdm(data_loader):
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    """input_ids = []
    attention_mask = []
    for i in x:
      input_ids.append(i["input_ids"])
      attention_mask.append(i["attention_mask"])

    input_ids = torch.tensor(input_ids).to(device)
    attention_mask = torch.tensor(attention_mask).to(device)
    targets = y.to(device)"""

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in tqdm(data_loader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
EPOCHS = 10
lr = 2e-5
batch_size=16
max_sequence_length = 64
#Adam учился на стандартных значениях параметров, т.к. они оптимальные

In [None]:
train_dataset = MFCDataset(qa, tokenizer, maxlen=64, train=True)
test_dataset = MFCDataset(qa, tokenizer, maxlen=64, train=False)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [None]:
optimizer = AdamW(model.parameters(), lr=lr)
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
from collections import defaultdict
import numpy as np

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(0, EPOCHS):
  epoch = int(epoch.item())
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(train_dataset)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    test_dataloader,
    loss_fn,
    device,
    len(test_dataset)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc
#123456789

Epoch 1/10
----------




  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.26476300256035756 accuracy 0.9017835328609822


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.1493052834493303 accuracy 0.9501831501831501

Epoch 2/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.17847913180139585 accuracy 0.948204251160518


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.11649775869472949 accuracy 0.9692307692307692

Epoch 3/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.12575533974359132 accuracy 0.9662838993403372


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.10110838402490413 accuracy 0.9743589743589743

Epoch 4/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.08519532059631274 accuracy 0.9767896408502321


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.11127827085180987 accuracy 0.9750915750915751

Epoch 5/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.07550035849362757 accuracy 0.980698753970193


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.1006195496464538 accuracy 0.9787545787545787

Epoch 6/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.0545692383639107 accuracy 0.9848521866601515


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.11687514115887056 accuracy 0.9772893772893773

Epoch 7/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.03939645680100057 accuracy 0.9887612997801124


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.12451928400053497 accuracy 0.9787545787545787

Epoch 8/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.03576912667307397 accuracy 0.9897385780601027


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.11018782233407175 accuracy 0.9802197802197802

Epoch 9/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.018918603001679912 accuracy 0.9929147324700709


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.10621552926445581 accuracy 0.9831501831501831

Epoch 10/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.018887595159014836 accuracy 0.9926704129000733


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.10621685153015532 accuracy 0.9831501831501831

Epoch 11/10
----------


  0%|          | 0/256 [00:00<?, ?it/s]

Train loss 0.01895209270031728 accuracy 0.993403371610066


  0%|          | 0/86 [00:00<?, ?it/s]

Val   loss 0.12021596482880467 accuracy 0.9831501831501831

CPU times: user 34min 27s, sys: 2min 3s, total: 36min 30s
Wall time: 38min 40s


In [None]:
torch.save(model, "model_v1.pt")

In [None]:
bert = model.bert

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

sentences = []
for index, row in qa.iterrows():
  sentences.append(row["entailment"])

"""#Sentences we want sentence embeddings for
sentences = ['Привет! Как твои дела?',
             'А правда, что 42 твое любимое число?']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])"""

'#Sentences we want sentence embeddings for\nsentences = [\'Привет! Как твои дела?\',\n             \'А правда, что 42 твое любимое число?\']\n\n#Load AutoModel from huggingface model repository\ntokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")\nmodel = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")\n\n#Tokenize sentences\nencoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors=\'pt\')\n\n#Compute token embeddings\nwith torch.no_grad():\n    model_output = model(**encoded_input)\n\n#Perform pooling. In this case, mean pooling\nsentence_embeddings = mean_pooling(model_output, encoded_input[\'attention_mask\'])'

In [None]:
cpu = "cpu"

In [None]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors='pt')
encoded_input = encoded_input.to(device)
bert.to(cpu)
with torch.no_grad():
    model_output = bert(**encoded_input)
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

KeyboardInterrupt: ignored

In [None]:
device = "cuda"

In [None]:
len(embedings)

2729

In [None]:
len(sentences)

2729

In [None]:
q = qa.iloc[1]["ancor"]

In [None]:
q

'Какие основания для получения предоставления дополнительной меры социальной поддержки в виде единовременной компенсационной выплаты при рождении ребенка (усыновлении в возрасте до шести месяцев) для приобретения предметов детского ассортимента и продуктов детского питания?'

In [None]:
a = qa.iloc[1]["entailment"]

In [None]:
a

'Заявителем является родитель (усыновитель) ребенка (детей), являющийся гражданином Российской Федерации, имеющим место жительства в Санкт-Петербурге, в случае если ребенок (дети) в отношении которых подается заявление являются гражданами Российской Федерации, имеющими место жительства в Санкт-Петербурге.'

In [None]:
q_enc = tokenizer([q], padding=True, truncation=True, max_length=512, return_tensors='pt')
q_enc = q_enc.to(device)
with torch.no_grad():
    model_output = bert(**q_enc)
q_emb = mean_pooling(model_output, q_enc['attention_mask'])

In [None]:
from numpy import dot
from numpy.linalg import norm

In [None]:
def cosine_similarity(a, b):
  cos_sim = dot(a, b)/(norm(a)*norm(b))
  return cos_sim

In [None]:
def top_3(source, base):
  top_1 = {"max":-10, "ind":None}
  top_2 = {"max":-10, "ind":None}
  top_3 = {"max":-10, "ind":None}
  for ind, s in enumerate(source):
    cos_sim = cosine_similarity(s, base)
    if cos_sim > top_1["max"]:
      top_1["max"] = cos_sim
      top_1["ind"] = ind
      continue
    elif cos_sim > top_2["max"]:
      top_2["max"] = cos_sim
      top_2["ind"] = ind
      continue
    elif cos_sim > top_3["max"]:
      top_3["max"] = cos_sim
      top_3["ind"] = ind
      continue
  return top_1, top_2, top_3

In [None]:
tops = top_3(embedings, q_emb[0].tolist())

In [None]:
sentences[tops[0]["ind"]]

'При заполнении заявления формируется список детских садов, из которых заявитель может выбрать не более трех, расположенных в одном районе Санкт-Петербурга: первое из выбранных является приоритетным, другие - дополнительными.'

In [None]:
embedings = []
size = 128
cur = 0
for ind, sentence in enumerate(sentences):
  subs = sentences[cur:cur+size] if size < len(sentences) else sentences[cur:]
  encoded_input = tokenizer(subs, padding=True, truncation=True, max_length=512, return_tensors='pt')
  encoded_input = encoded_input.to(device)
  bert.to(device)
  with torch.no_grad():
    model_output = bert(**encoded_input)
  emb = mean_pooling(model_output, encoded_input['attention_mask'])
  embedings = embedings + emb.tolist()
  cur += size
  print(cur)
  if cur >= len(sentences):
    break

128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816


In [None]:
!cp /content/best_model_state.bin /content/drive/MyDrive/mfc_bot

In [None]:
model = torch.load("/content/drive/MyDrive/mfc_bot/model_v1.pt")

In [None]:
bert = model.bert

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")

##V.2

In [None]:
model_v2_1 = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")
#model_v2_2 = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model_v2 = SBERT_v2(model_v2_1)

In [None]:
device = "cuda"
model_v2.to(device)

SBERT_v2(
  (bert1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, eleme

In [None]:
qa = preprocess_data()

In [None]:
batch_size = 32
train_dataset = MFCDataset(qa, tokenizer, model_version=2, maxlen=64, train=True)
test_dataset = MFCDataset(qa, tokenizer,  model_version=2,maxlen=64, train=False)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()
  losses = []
  correct_predictions = 0

  for d in tqdm(data_loader):
    input_ids_1 = d["input_ids_1"].to(device)
    attention_mask_1 = d["attention_mask_1"].to(device)
    input_ids_2 = d["input_ids_2"].to(device)
    attention_mask_2 = d["attention_mask_2"].to(device)
    targets = d["target"].to(device)

    outputs = model(
      first=(input_ids_1, attention_mask_1),
      second=(input_ids_2,attention_mask_2)
    )

    loss = loss_fn(outputs, targets)

    losses.append(loss.item()/batch_size)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []

  with torch.no_grad():
    for d in tqdm(data_loader):
      input_ids_1 = d["input_ids_1"].to(device)
      attention_mask_1 = d["attention_mask_1"].to(device)
      input_ids_2 = d["input_ids_2"].to(device)
      attention_mask_2 = d["attention_mask_2"].to(device)
      targets = d["target"].to(device)

      outputs = model(
        first=(input_ids_1, attention_mask_1),
        second=(input_ids_2,attention_mask_2)
      )

      loss = loss_fn(outputs, targets)

      losses.append(loss.item()/batch_size)

  return np.mean(losses)

In [None]:
import numpy as np

In [None]:
from collections import defaultdict

In [None]:
import numpy as np
from collections import defaultdict
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [None]:
from tqdm.notebook import tqdm

In [None]:
torch.cuda.empty_cache()

In [None]:
EPOCHS = 10
optimizer = AdamW(model_v2.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.MSELoss().to(device)

In [None]:
%%time

history = defaultdict(list)
best_loss = 1000000

for epoch in range(0, EPOCHS):
  epoch = int(epoch.item())
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_loss = train_epoch(
    model_v2,
    train_dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(train_dataset)
  )

  print(f'Train loss {train_loss}')

  val_loss = eval_model(
    model_v2,
    test_dataloader,
    loss_fn,
    device,
    len(test_dataset)
  )

  print(f'Val   loss {val_loss}')
  print()

  history['train_loss'].append(train_loss)
  history['val_loss'].append(val_loss)

  if val_loss < best_loss:
    torch.save(model_v2.state_dict(), 'best_model_state.bin')
    best_loss = val_loss
#12345678910)

Epoch 1/10
----------




  0%|          | 0/128 [00:00<?, ?it/s]



Train loss 0.005508909536729334


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0028089098145015713

Epoch 2/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.002591520044916251


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.001731139971863825

Epoch 3/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0015960635560077208


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0013089381100144237

Epoch 4/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0011663294746995234


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0011100917817689045

Epoch 5/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.000924303474221233


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0009597371119821747

Epoch 6/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0007652425441619926


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0008574570779591192

Epoch 7/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0006415386899334408


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0008173729103535067

Epoch 8/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0005590516942675094


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0007531603539043084

Epoch 9/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0005035643213204821


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0007230334913190238

Epoch 10/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.0004755906890068218


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.0006984700739849359

Epoch 11/10
----------


  0%|          | 0/128 [00:00<?, ?it/s]

Train loss 0.00044215814330073044


  0%|          | 0/43 [00:00<?, ?it/s]

Val   loss 0.00069328720361904

CPU times: user 43min 23s, sys: 12min 48s, total: 56min 12s
Wall time: 59min 1s


In [None]:
torch.save(model_v2, "/content/drive/MyDrive/mfc_bot/model_v1.pt")

In [None]:
!cp /content/best_model_state.bin /content/drive/MyDrive/mfc_bot

In [None]:
import json

In [None]:
with open("/content/drive/MyDrive/mfc_bot/model_v2_history.json", "w") as fp:
  json.dump(history, fp)

In [None]:
model_v2 = torch.load("/content/drive/MyDrive/mfc_bot/model_v2.pt", map_location=device)

In [None]:
bert = model_v2.bert1

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

sentences = []
for index, row in qa.iterrows():
  sentences.append(row["entailment"])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")

Downloading (…)okenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
device = "cuda"

In [None]:
embedings = []
size = 128
cur = 0
for ind, sentence in enumerate(sentences):
  subs = sentences[cur:cur+size] if size < len(sentences) else sentences[cur:]
  encoded_input = tokenizer(subs, padding=True, truncation=True, max_length=512, return_tensors='pt')
  encoded_input = encoded_input.to(device)
  bert.to(device)
  with torch.no_grad():
    model_output = bert(**encoded_input)
  emb = mean_pooling(model_output, encoded_input['attention_mask'])
  embedings = embedings + emb.tolist()
  cur += size
  print(cur)
  if cur >= len(sentences):
    break

128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816


In [None]:
import json

In [None]:
with open("/content/drive/MyDrive/mfc_bot/embedings.json", "w") as fp:
  text = json.dumps(embedings)

In [None]:
import pickle
with open("/content/drive/MyDrive/mfc_bot/sentences.pkl", "wb") as fp:
  pickle.dump(embedings, fp)
with open("/content/drive/MyDrive/mfc_bot/embedings.pkl", "rb") as fp:   # Unpickling
  test = pickle.load(fp)

In [None]:
type(test)


NameError: ignored

In [None]:
def top_3(source, base, sentences):
  top_1 = {"max":-10, "ind":None, "sentence":None}
  top_2 = {"max":-10, "ind":None, "sentence":None}
  top_3 = {"max":-10, "ind":None, "sentence":None}
  for ind, s in enumerate(source):
    cos_sim = cosine_similarity(s, base)
    if cos_sim > top_1["max"] and sentences[ind] != top_1["sentence"]:
      top_1["max"] = cos_sim
      top_1["ind"] = ind
      continue
    elif cos_sim > top_2["max"] and sentences[ind] != top_2["sentence"]:
      top_2["max"] = cos_sim
      top_2["ind"] = ind
      continue
    elif cos_sim > top_3["max"] and sentences[ind] != top_3["sentence"]:
      top_3["max"] = cos_sim
      top_3["ind"] = ind
      continue
  return top_1, top_2, top_3

In [None]:
def cosine_similarity(a, b):
  cos_sim = dot(a, b)/(norm(a)*norm(b))
  return cos_sim

In [None]:
q = qa.iloc[400]["ancor"]
q

'Какая сумма выплат по услуге "Предоставлять меру социальной поддержки в виде ежемесячного пособия на ребенка в возрасте от полутора до семи лет на приобретение товаров детского ассортимента и продуктов детского питания, специальных молочных продуктов"?'

In [None]:
q_enc = tokenizer([q], padding=True, truncation=True, max_length=512, return_tensors='pt')
q_enc = q_enc.to(device)
with torch.no_grad():
    model_output = bert(**q_enc)
q_emb = mean_pooling(model_output, q_enc['attention_mask'])

In [None]:
a = qa.iloc[400]["entailment"]
a

'Ежемесячное пособие на ребенка от 1,5 лет до 7 лет в 2023 году: - 1265 руб.; - 1826 руб. – на ребенка из неполной семьи, семьи военнослужащего.'

In [None]:
from numpy import dot
from numpy.linalg import norm

In [None]:
tops = top_3(embedings, q_emb[0].tolist(), sentences)

In [None]:
sentences[tops[2]["ind"]]

'Ежемесячное пособие на ребенка-инвалида с особыми потребностями в 2023 году - 19422 руб.\n'

In [None]:
!pip freeze > requirements.txt

#V.3

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.model_selection import train_test_split
def get_contradiction(source, entailment):
  result = entailment
  #ind = random.choice(source.index)
  while result == entailment:
    result = source.sample(n=1)["ANSWER"].item()
    #ind = random.randint(0, len(source))
  return result
def preprocess_data():
  base = pd.read_excel("/content/drive/MyDrive/mfc_bot/base.xlsx")
  #poor = pd.read_excel("/content/drive/MyDrive/mfc_bot/poor.xlsx")
  qa = pd.read_excel("/content/drive/MyDrive/mfc_bot/qa.xlsx")
  qa = qa[qa["ANSWER"].isna() == False]
  qa["ancor"] = qa['QUESTION'].apply(lambda x: x.strip())
  qa["entailment"] = qa["ANSWER"].apply(lambda x: x.strip())
  qa["contradiction"] = qa["entailment"].apply(lambda x: get_contradiction(qa, x))
  qa["neutral"] = qa["entailment"].apply(lambda x: "")
  qa = qa.drop(columns=["QUESTION", "ANSWER"])
  base["label_1"] = base["label_1"].apply(lambda x: list(base["label_1"].unique()).index(x))
  base["label_2"] = base["label_2"].apply(lambda x: list(base["label_2"].unique()).index(x))
  base["label_3"] = base["label_3"].apply(lambda x: list(base["label_3"].unique()).index(x))
  base = base.drop(columns=["Unnamed: 0", "Теги по услуге", "labels", "processed"])
  base = base[base["Ответ"].isna()==False]
  return qa, base

In [None]:
qa, base = preprocess_data()

##Подготовка данных

In [None]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   № услуги                         879 non-null    object
 1   Полное наименование услуги       881 non-null    object
 2   Сокращенное наименование услуги  881 non-null    object
 3   Вопрос                           881 non-null    object
 4   Ответ                            880 non-null    object
 5   label_1                          881 non-null    int64 
 6   label_2                          881 non-null    int64 
 7   label_3                          881 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 55.2+ KB


In [None]:
import pandas as pd

In [None]:
base = pd.read_excel("/content/drive/MyDrive/mfc_bot/base.xlsx")

In [None]:
base["labels"] = base["Теги по услуге"]
base["processed"] = False

In [None]:
base[base["processed"] == False].groupby("Теги по услуге").count()

Unnamed: 0_level_0,№ услуги,Полное наименование услуги,Сокращенное наименование услуги,Вопрос,Ответ,labels,processed
Теги по услуге,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [None]:
def select(x):
  return str(x).lower().find("ребен") != -1 or str(x).lower().find("род") != -1

In [None]:
buf = base[base["processed"] == False][base["Теги по услуге"].apply(lambda x: select(x))]

  buf = base[base["processed"] == False][base["Теги по услуге"].apply(lambda x: select(x))]


In [None]:
unique = buf["Теги по услуге"].unique()
unique

array(['Рождение ребенка', 'Ребенок инвалид', 'Родитель с инвалидностью',
       'Родитель с инвалидностью, ребенок-инвалид', 'Ребенок-инвалид',
       'Ребенок с целиакия'], dtype=object)

In [None]:
def fill_labels(x, l):
  if x not in unique:
    return l
  if x == 'Рождение ребенка':
    return "выплаты/рождение ребенка/родители"
  elif x == 'Ребенок с целиакия':
    return "выплаты/целиакия/дети"
  else:
    return "выплаты/едв/инвалиды"

In [None]:
base["labels"] = base[["Теги по услуге", "labels",]].apply(lambda x: fill_labels(x["Теги по услуге"], x["labels"]), axis=1)

In [None]:
base["processed"] = base[["Теги по услуге", "processed"]].apply(lambda x: select(x["Теги по услуге"]) or x["processed"], axis=1)

In [None]:
base = base[base["labels"].notna()]

In [None]:
base["labels"] = base['labels'].apply(lambda x: "оформление/свидетельство многодетной семьи/многодетные семьи" if x == "оформление/многодетные семьи" else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base["labels"] = base['labels'].apply(lambda x: "оформление/свидетельство многодетной семьи/многодетные семьи" if x == "оформление/многодетные семьи" else x)


In [None]:
base["label_1"] = base["labels"].apply(lambda x: x.split("/")[0])
base["label_2"] = base["labels"].apply(lambda x: x.split("/")[1])
base["label_3"] = base["labels"].apply(lambda x: x.split("/")[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base["label_1"] = base["labels"].apply(lambda x: x.split("/")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base["label_2"] = base["labels"].apply(lambda x: x.split("/")[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base["label_3"] = base["labels"].apply(lambda x: x.split("/")[2])


In [None]:
base["label_3"] = base["label_3"].apply(lambda x: 'многодетные семьи' if x == 'многодетные' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base["label_3"] = base["label_3"].apply(lambda x: 'многодетные семьи' if x == 'многодетные' else x)


In [None]:
base["label_3"].unique()

array(['родители', 'другие', 'школьники', 'инвалиды', 'ребенок', 'дети',
       'многодетные семьи', 'сироты', 'студенческие семьи',
       'военнослужащие', 'жены военнослужащих', 'женщины', 'пенсионеры',
       'герои', 'малоимущие', 'кризисные'], dtype=object)

In [None]:
base.to_excel("/content/drive/MyDrive/mfc_bot/base.xlsx")

##Обучение

In [None]:
class MFCDataset(Dataset):
  SPLIT_RANDOM_SEED = 42
  TEST_SIZE = 0.25

  def __init__(self, qa, base, tokenizer, model_version=1, maxlen=512, train=True):
    super().__init__()
    self.qa = qa
    self.base = base
    self.dataset = []
    for index, row in qa.iterrows():
      ind = index
      item_pos = {}
      item_pos["first"] = row["ancor"]
      item_pos["second"] = row["entailment"]
      item_pos["label"] = 1
      item_neg = {}
      item_neg["first"] = row["ancor"]
      item_neg["second"] = row["contradiction"]
      item_neg["label"] = 0
      self.dataset.append(item_pos)
      self.dataset.append(item_neg)
    random.shuffle(self.dataset)
    self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.model_version = model_version
    self.dataset_v3 = []
    for ind, row in base.iterrows():
      item_1 = {"sentence": row["Полное наименование услуги"], "label_1":row["label_1"], "label_2":row["label_2"], "label_3":row["label_3"]}
      item_2 = {"sentence": row["Ответ"], "label_1":row["label_1"], "label_2":row["label_2"], "label_3":row["label_3"]}
      self.dataset_v3.append(item_1)
      self.dataset_v3.append(item_2)
    if train:
      self.dataset = train_test_split(self.dataset, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[0]
      self.dataset_v3 = train_test_split(self.dataset_v3, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[0]
    else:
      self.dataset = train_test_split(self.dataset, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[1]
      self.dataset_v3 = train_test_split(self.dataset_v3, random_state=self.SPLIT_RANDOM_SEED, test_size=self.TEST_SIZE)[1]
  def __len__(self):
    if self.model_version == 3:
      return len(self.dataset_v3)
    return len(self.dataset)
  def __getitem__(self, item):
    if self.model_version == 1:
      first = self.dataset[item]["first"]
      first_toks = self.tokenizer([first], padding=True, truncation=True, max_length=int(self.maxlen/2), return_tensors='pt')
      second = self.dataset[item]["second"]
      second_toks = self.tokenizer([second], padding=True, truncation=True, max_length=int(self.maxlen/2-1), return_tensors='pt')
      result = {}
      common_len = len(first_toks['input_ids'][0].tolist() + second_toks['input_ids'][0].tolist()[1:])
      result['input_ids'] = first_toks['input_ids'][0].tolist() + second_toks['input_ids'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      result['token_type_ids'] = first_toks['token_type_ids'][0].tolist() + second_toks['token_type_ids'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      result['attention_mask'] = first_toks['attention_mask'][0].tolist() + second_toks['attention_mask'][0].tolist()[1:] + [0]*(self.maxlen-common_len)
      label = self.dataset[item]["label"]
      #return result, label
      return {
        'input_ids': torch.tensor(result['input_ids']).flatten(),
        'attention_mask': torch.tensor(result['attention_mask']).flatten(),
        'targets': torch.tensor(label, dtype=torch.long)
      }
    elif self.model_version == 2:
      first = self.dataset[item]["first"]
      first_toks = self.tokenizer([first], truncation=True, max_length=self.maxlen, pad_to_max_length=True,return_tensors='pt')
      second = self.dataset[item]["second"]
      second_toks = self.tokenizer([second], truncation=True, max_length=self.maxlen, pad_to_max_length=True,return_tensors='pt')
      label = self.dataset[item]["label"]
      return {
          "input_ids_1":first_toks["input_ids"].flatten(),
          "input_ids_2":second_toks["input_ids"].flatten(),
          "attention_mask_1":first_toks["attention_mask"].flatten(),
          "attention_mask_2":second_toks["attention_mask"].flatten(),
          "target":torch.tensor(label, dtype=torch.float)
      }
    elif self.model_version == 3:
      sentence = self.dataset_v3[item]["sentence"]
      toks = self.tokenizer([sentence], truncation=True, max_length=self.maxlen, pad_to_max_length=True,return_tensors='pt')
      return {
          "input_ids":toks["input_ids"].flatten(),
          "attention_mask":toks["attention_mask"].flatten(),
          "target_1":torch.tensor(self.dataset_v3[item]["label_1"], dtype=torch.long),
          "target_2":torch.tensor(self.dataset_v3[item]["label_2"], dtype=torch.long),
          "target_3":torch.tensor(self.dataset_v3[item]["label_3"], dtype=torch.long)
      }

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")

In [None]:
device = "cuda"
batch_size = 32
train_dataset = MFCDataset(qa, base, tokenizer, model_version=3, maxlen=128, train=True)
test_dataset = MFCDataset(qa, base, tokenizer,  model_version=3,maxlen=128, train=False)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
class SBERT_v2(nn.Module):
  def __init__(self, bert1):
    super(SBERT_v2, self).__init__()
    self.bert1 = bert1
    #self.bert2 = bert2
    self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

  def forward(self, first, second):
    outputs_1 = self.bert1(
      input_ids=first[0],
      attention_mask=first[1]
    )
    outputs_2 = self.bert1(
      input_ids=second[0],
      attention_mask=second[1]
    )
    return self.cos(outputs_1["pooler_output"], outputs_2["pooler_output"])

In [None]:
model = torch.load("/content/drive/MyDrive/mfc_bot/model_v2.pt")

In [None]:
bert = model.bert1
model_v3 = SBERT_v3(bert, len(base["label_1"].unique()), len(base["label_2"].unique()), len(base["label_3"].unique()))
model_v3.to(device)

SBERT_v3(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elemen

In [None]:
class SBERT_v3(nn.Module):
  def __init__(self, bert, n_classes_1, n_classes_2, n_classes_3, ):
    super(SBERT_v3, self).__init__()
    self.bert = bert
    self.drop = nn.Dropout(p=0.3)
    self.out1 = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=self.bert.config.hidden_size, out_features=n_classes_1)
        )
    self.out2 = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=self.bert.config.hidden_size, out_features=n_classes_2)
        )
    self.out3 = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(in_features=self.bert.config.hidden_size, out_features=n_classes_3)
        )

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = {"out1":self.out1(outputs["pooler_output"]),
              "out2":self.out2(outputs["pooler_output"]),
              "out3":self.out3(outputs["pooler_output"])}
    return output

In [None]:
def train_epoch(
  model,
  data_loader,
  optimizer,
  device,
  scheduler,
  n_examples,
  loss_fn_1,
  loss_fn_2,
  loss_fn_3
):
  model = model.train()
  losses = []
  correct_predictions = 0

  for d in tqdm(data_loader):
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets_1 = d["target_1"].to(device)
    targets_2 = d["target_2"].to(device)
    targets_3 = d["target_3"].to(device)

    outputs = model(
      input_ids,
      attention_mask
    )
    loss_1 = loss_fn_1(outputs["out1"], targets_1)
    loss_2 = loss_fn_1(outputs["out2"], targets_1)
    loss_3 = loss_fn_1(outputs["out3"], targets_1)
    loss = loss_1 + loss_2 + loss_3

    out_targets_dict = {"out1":targets_1,"out2":targets_2,"out3":targets_3}
    for key in outputs.keys():
      _, preds = torch.max(outputs[key], dim=1)

      correct_predictions += torch.sum(preds == out_targets_dict[key])
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples/3, np.mean(losses)

In [None]:
def eval_model(model, data_loader, device, n_examples, loss_fn_1, loss_fn_2, loss_fn_3):
  model = model.eval()
  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in tqdm(data_loader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets_1 = d["target_1"].to(device)
      targets_2 = d["target_2"].to(device)
      targets_3 = d["target_3"].to(device)

      outputs = model(
        input_ids,
        attention_mask
      )
      loss_1 = loss_fn_1(outputs["out1"], targets_1)
      loss_2 = loss_fn_1(outputs["out2"], targets_1)
      loss_3 = loss_fn_1(outputs["out3"], targets_1)
      loss = loss_1 + loss_2 + loss_3

      out_targets_dict = {"out1":targets_1,"out2":targets_2,"out3":targets_3}
      for key in outputs.keys():
        _, preds = torch.max(outputs[key], dim=1)

        correct_predictions += torch.sum(preds == out_targets_dict[key])
      losses.append(loss.item())

  return correct_predictions.double() / n_examples/3, np.mean(losses)

In [None]:
import numpy as np
from collections import defaultdict
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [None]:
EPOCHS = 5
optimizer = AdamW(model_v3.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn_1 = nn.CrossEntropyLoss().to(device)
loss_fn_2 = nn.CrossEntropyLoss().to(device)
loss_fn_3 = nn.CrossEntropyLoss().to(device)

In [None]:
%%time

history = defaultdict(list)
best_loss = 1000000

for epoch in range(0, EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model_v3,
    train_dataloader,
    optimizer,
    device,
    scheduler,
    len(train_dataset),
    loss_fn_1,
    loss_fn_2,
    loss_fn_3
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model_v3,
    test_dataloader,
    device,
    len(test_dataset),
    loss_fn_1,
    loss_fn_2,
    loss_fn_3
  )

  print(f'Val loss {val_loss} accuracy {val_acc}')
  print()

  history['train_loss'].append(train_loss)
  history['val_loss'].append(val_loss)

  if val_loss < best_loss:
    torch.save(model_v3.state_dict(), 'best_model_state.bin')
    best_loss = val_loss
#12

Epoch 1/5
----------


  0%|          | 0/42 [00:00<?, ?it/s]



Train loss 0.49566736657704624 accuracy 0.4419191919191919


  0%|          | 0/14 [00:00<?, ?it/s]

Val loss 1.1228699215820857 accuracy 0.41287878787878785



KeyboardInterrupt: ignored

In [None]:
torch.save(model_v3, "/content/drive/MyDrive/mfc_bot/model_v3.pt")

#TF

In [None]:
!pip3 install tensorflow



In [None]:
import re
import os
import sys
import json
import nltk

import logging
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import Callback

from scipy.stats import spearmanr, pearsonr
from glob import glob

nltk.download('punkt')
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
