In [None]:
!pip install transformers

In [24]:
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, AdamW
from model_script import BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler 

from torch.nn import CrossEntropyLoss
import numpy as np
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef, f1_score, precision_recall_fscore_support

from model_script import convert_tsv_to_model_input, convert_list_to_torch

In [None]:
# --- Hyperparameters -------------------------------------------------------
num_labels = 6         # width of the logits; label_map below has six entries
max_seq_length = 128   # forwarded to convert_tsv_to_model_input
batch_size = 32        # batch size of the training loaders
val_batch_size = 64    # batch size of the validation loaders
num_epoch = 3
lr = 3e-5              # learning rate handed to AdamW

# --- Label vocabulary ------------------------------------------------------
# label name -> class id
label_map = {
  'false': 0,
  'CPR:3': 1,
  'CPR:4': 2,
  'CPR:5': 3,
  'CPR:6': 4,
  'CPR:9': 5,
}
# class id -> label name (exact inverse of label_map)
reverse_map = {class_id: name for name, class_id in label_map.items()}

# --- Device, tokenizer and model -------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both the tokenizer and the classifier are loaded from the same checkpoint.
_PRETRAINED_NAME = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(_PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(_PRETRAINED_NAME)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to its target device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is guaranteed to hold
# the parameters that actually live on `device`.
model.to(device)
optimizer = AdamW(model.parameters(), lr=lr)

# Enable training mode (dropout active). `train()` returns the module itself,
# so as the last expression of the cell it also displays the model architecture.
model.train()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [15]:
def eval(dataloader,all_label_ids,rev_dict):
  """Evaluate the global ``model`` on ``dataloader`` and compute classification metrics.

  NOTE: this function shadows Python's built-in ``eval``. The name is kept
  because other cells call it.

  Side effect: the model is switched to evaluation mode (``model.eval()``) and
  left there; the caller has to call ``model.train()`` before training again.

  Args:
    dataloader: iterable yielding batches of nine tensors in the order
      (title_ids, title_mask, title_segment, input_ids, input_mask,
      segment_ids, P_gauss1_list, P_gauss2_list, label_ids).
    all_label_ids: CPU tensor with the gold class ids of the whole dataset, in
      the same order in which ``dataloader`` yields the samples
      (``.numpy()`` is called on it).
    rev_dict: mapping ``class id -> label name``; its keys define the set of
      classes that is reported.

  Returns:
    Tuple ``(s, matr, report, eval_loss)``:
      s: micro-averaged result of ``precision_recall_fscore_support``.
      matr: confusion matrix over the class ids (rows = gold labels).
      report: ``classification_report`` as a dict.
      eval_loss: mean cross-entropy loss per batch.

  Raises:
    ValueError: if ``dataloader`` yields no batches.
  """
  loss_fct = CrossEntropyLoss()  # stateless, so one instance serves all batches
  eval_loss = 0
  n_batches = 0
  batch_logits = []

  model.eval()
  print('###  EVALUATION  ###')
  for step,batch in enumerate(tqdm(dataloader)):
    batch = tuple(t.to(device) for t in batch)

    title_ids, title_mask, title_segment, input_ids, input_mask, segment_ids, \
      P_gauss1_list, P_gauss2_list, label_ids = batch

    with torch.no_grad():
      # The model takes (ids, segment, mask) -- a different order than the batch layout.
      logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, \
                    input_mask, P_gauss1_list, P_gauss2_list, labels=None)
      loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))

    eval_loss += loss.item()
    n_batches += 1
    # Copy the logits to host memory once per batch (not once per row).
    batch_logits.append(logs.detach().cpu().numpy())

  if n_batches == 0:
    raise ValueError('eval() received a dataloader that yielded no batches')

  eval_loss = eval_loss/n_batches

  # Stack the per-batch logits into one array and pick the best-scoring class per row.
  preds = np.concatenate(batch_logits, axis=0).argmax(axis=1)

  # Derive the class inventory from `rev_dict` rather than hard-coding six classes.
  gold_ids = all_label_ids.numpy()
  class_ids = sorted(rev_dict)
  class_names = [rev_dict[i] for i in class_ids]
  gold_names = [rev_dict[i] for i in gold_ids]
  pred_names = [rev_dict[i] for i in preds]

  s = precision_recall_fscore_support(y_pred=pred_names,
                                      y_true=gold_names,
                                      labels=class_names, average="micro")

  matr = metrics.confusion_matrix(gold_ids, preds, labels=class_ids)

  report = metrics.classification_report(gold_names,
                                         pred_names,
                                         labels=class_names,
                                         output_dict=True)

  return s, matr, report, eval_loss


In [25]:
def _make_loader(tsv_path, sampler_cls, loader_batch_size):
  """Turn one TSV file into ``(model_input_list, TensorDataset, DataLoader)``.

  The file is converted with the shared tokenizer and ``max_seq_length``; the
  resulting tensors are wrapped in a TensorDataset (``*`` unpacks the tuple of
  tensors) and served through ``sampler_cls(dataset)``.
  """
  inputs = convert_tsv_to_model_input(tsv_path, tokenizer=tokenizer, max_seq_length=max_seq_length)
  dataset = TensorDataset(*convert_list_to_torch(inputs))
  loader = DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=loader_batch_size)
  return inputs, dataset, loader


# Author's note (translated): prints the share of rejected samples, as a
# proportion of the dataset being processed.
# Training sets: shuffled via RandomSampler. `model_input_list` / `train_data`
# are reused for both languages and end up holding the Russian training data.
model_input_list, train_data, en_train_dataloader = _make_loader('train_en.tsv', RandomSampler, batch_size)
model_input_list, train_data, ru_train_dataloader = _make_loader('train_ru.tsv', RandomSampler, batch_size)

# Validation sets: fixed order via SequentialSampler; the raw input lists and
# datasets are kept under their own names.
en_val_input_list, en_val_data, en_val_dataloader = _make_loader('dev_en.tsv', SequentialSampler, val_batch_size)
ru_val_input_list, ru_val_data, ru_val_dataloader = _make_loader('dev_ru.tsv', SequentialSampler, val_batch_size)


0.03779729022503925
0.11069674933184163
0.029879740980573544
0.1303752931978108


In [26]:
# Single-pass iterators over the two training loaders. Each one yields
# len(dataloader) batches and then raises StopIteration, so this cell has to be
# re-run to obtain fresh iterators for another pass over the data.
en_train_iterator = iter(en_train_dataloader)
ru_train_iterator = iter(ru_train_dataloader)

In [31]:
def _train_step(batch, loss_fct):
  """Run one optimizer update on a single batch and return its loss as a float.

  ``batch`` holds nine tensors in the order (title_ids, title_mask,
  title_segment, input_ids, input_mask, segment_ids, P_gauss1_list,
  P_gauss2_list, label_ids).
  """
  title_ids, title_mask, title_segment, input_ids, input_mask, \
    segment_ids, P_gauss1_list, P_gauss2_list, label_ids = tuple(t.to(device) for t in batch)

  # Clear gradients before EVERY backward pass: each batch is followed by its
  # own optimizer.step(), so leftover gradients would be applied again.
  optimizer.zero_grad()
  # The model takes (ids, segment, mask) -- a different order than the batch layout.
  logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, input_mask,
               P_gauss1_list, P_gauss2_list, labels=None)
  loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))
  loss.backward()
  optimizer.step()
  return loss.item()


loss_fct = CrossEntropyLoss()

# Gold validation labels (element 8 of the converted tensors, i.e. label_ids);
# converted once here instead of once per epoch.
en_val_labels = convert_list_to_torch(en_val_input_list)[8]
ru_val_labels = convert_list_to_torch(ru_val_input_list)[8]

print('### TRAINING  ###')
for i in range(num_epoch):
  print('### TRAINING '+'n_epoch: ' + str(i+1) +' ###')
  model.train()

  # Fresh iterators every epoch: a DataLoader iterator is single-pass, so
  # reusing one across epochs raises StopIteration at the start of epoch 2.
  ru_batches = iter(ru_train_dataloader)
  for batch_en in tqdm(en_train_dataloader):
    _train_step(batch_en, loss_fct)

    # Alternate with a Russian batch; restart the Russian loader if it is
    # shorter than the English one so every English batch gets a partner.
    try:
      batch_ru = next(ru_batches)
    except StopIteration:
      ru_batches = iter(ru_train_dataloader)
      batch_ru = next(ru_batches)
    _train_step(batch_ru, loss_fct)

  _,_,_,l = eval(en_val_dataloader,en_val_labels,reverse_map)
  print('###  EN Loss: ',l)

  _,_,_,l = eval(ru_val_dataloader,ru_val_labels,reverse_map)
  print('###  RU Loss: ',l)

  # eval() leaves the model in evaluation mode; switch back to training mode.
  model.train()


### TRAINING  ###
### TRAINING n_epoch: 0 ###


  0%|          | 1/518 [00:05<45:47,  5.31s/it]


KeyboardInterrupt: ignored

In [14]:
# Sanity check: push a single batch through the model without tracking gradients.
# A DataLoader is an iterable, not an iterator, so it has to be wrapped in
# iter() before next() can be called on it. The English training loader is used
# here; any of the loaders built above can be substituted.
batch = next(iter(en_train_dataloader))
batch = tuple(t.to(device) for t in batch)
# model.to(device)

title_ids, title_mask, title_segment, input_ids, \
input_mask, segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch

with torch.no_grad():
  # The model takes (ids, segment, mask) -- a different order than the batch layout.
  logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, \
                 input_mask, P_gauss1_list, P_gauss2_list, labels=None)
  loss_fct = CrossEntropyLoss()
  loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))

  

In [66]:
# p,r,f,s = precision_recall_fscore_support(y_pred=preds, y_true=labels, 
                                          # labels=[0,1,2,3,4,5], average="micro")
# print({"P": p, "R": r, "F1": f,})


array([2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2])

In [None]:
# torch.save(model.state_dict(), 'model_path')

# model.load_state_dict(torch.load('model_path'))
# model.eval()