In [None]:
!pip install transformers

In [2]:
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer, AdamW
from model_script import BertForSequenceClassification

from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler 

from torch.nn import CrossEntropyLoss

from sklearn import metrics
from sklearn.metrics import matthews_corrcoef, f1_score, precision_recall_fscore_support

from model_script import convert_tsv_to_model_input, convert_list_to_torch

In [None]:
num_labels = 6
max_seq_length = 128
batch_size = 32
num_epoch = 3
lr = 3e-5
val_batch_size = 64

label_map = {'false': 0, 'CPR:3': 1, 'CPR:4': 2, 'CPR:5': 3, 'CPR:6': 4, 'CPR:9': 5}
reverse_map = {0:'false', 1:'CPR:3', 2:'CPR:4', 3:'CPR:5', 4:'CPR:6', 5:'CPR:9'}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

optimizer = AdamW(model.parameters(), lr=lr)
model.train()
model.to(device)

In [5]:
def eval(dataloader,all_label_ids,rev_dict):
  eval_loss = 0
  preds = []

  model.eval()
  print('###  EVALUATION  ###')
  for step,batch in enumerate(tqdm(dataloader)):
    batch = tuple(t.to(device) for t in batch)

    title_ids, title_mask, title_segment, input_ids, input_mask, segment_ids, \
      P_gauss1_list, P_gauss2_list, label_ids = batch

    with torch.no_grad():
      logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, \
                    input_mask, P_gauss1_list, P_gauss2_list, labels=None)
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))

      eval_loss+=loss.item()
      for i in range(len(logs.detach().cpu().numpy())):
        preds.append(logs.detach().cpu().numpy()[i])
    
  eval_loss = eval_loss/len(dataloader)

  preds = np.array(preds)
  preds = preds.argmax(axis=1)

  s = precision_recall_fscore_support(y_pred=[reverse_map[i] for i in preds], 
                                      y_true=[reverse_map[i] for i in all_label_ids.numpy()], 
                                      labels=[reverse_map[i] for i in range(6)], average="micro")
 
  matr = metrics.confusion_matrix(all_label_ids.numpy(), preds,labels=[0,1,2,3,4,5])

  report = metrics.classification_report([reverse_map[i] for i in all_label_ids.numpy()], 
                                      [reverse_map[i] for i in preds],
                                      labels=[reverse_map[i] for i in range(6)],
                                      output_dict=True)


  return(s,matr,report,eval_loss)

In [6]:
model_input_list = convert_tsv_to_model_input('train_en.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
train_data = TensorDataset(*convert_list_to_torch(model_input_list)) # * for turple unboxing 
en_train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
# Выводит количество брака в процентном соотношении от обрабатываемого датасета

model_input_list = convert_tsv_to_model_input('train_ru.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
train_data = TensorDataset(*convert_list_to_torch(model_input_list)) # * for turple unboxing 
ru_train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

en_val_input_list = convert_tsv_to_model_input('dev_en.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
en_val_data = TensorDataset(*convert_list_to_torch(en_val_input_list)) # * for turple unboxing 
en_val_dataloader = DataLoader(en_val_data, sampler=SequentialSampler(en_val_data), batch_size=val_batch_size)

ru_val_input_list = convert_tsv_to_model_input('dev_ru.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
ru_val_data = TensorDataset(*convert_list_to_torch(ru_val_input_list)) # * for turple unboxing 
ru_val_dataloader = DataLoader(ru_val_data, sampler=SequentialSampler(ru_val_data), batch_size=val_batch_size)


0.03779729022503925
0.11069674933184163
0.029879740980573544
0.1303752931978108


In [8]:
print('### TRAINING  ###')
for i in range(num_epoch):
  print('### TRAINING '+'n_epoch: ' + str(i+1) +' ###')
  for step,batch_en in tqdm(enumerate(en_train_dataloader)):
    batch_en = tuple(t.to(device) for t in batch_en)

    if step%2 == 0:
      optimizer.zero_grad()
    title_ids, title_mask, title_segment, input_ids, input_mask, \
      segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch_en

    logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, input_mask, P_gauss1_list, P_gauss2_list, labels=None)
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))
    loss.backward()
    optimizer.step()

  if i != num_epoch-1:
    _,_,_,l = eval(en_val_dataloader,convert_list_to_torch(en_val_input_list)[8],reverse_map)
    print('###  EN Loss: ',l)
  model.train()

  for step,batch_ru in tqdm(enumerate(ru_train_dataloader)):
    batch_ru = tuple(t.to(device) for t in batch_ru)

    if step%2 == 0:
      optimizer.zero_grad()
    title_ids, title_mask, title_segment, input_ids, input_mask, \
      segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch_ru

    logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, input_mask, P_gauss1_list, P_gauss2_list, labels=None)
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))
    loss.backward()
    optimizer.step()
  if i != num_epoch-1:
    _,_,_,l = eval(ru_val_dataloader,convert_list_to_torch(ru_val_input_list)[8],reverse_map)
    print('###  RU Loss: ',l)
  model.train()

### TRAINING  ###
### TRAINING n_epoch: 1 ###


518it [12:37,  1.46s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:45<00:00,  1.01s/it]


###  EN Loss:  0.40857985587336304


448it [10:59,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:20<00:00,  1.01s/it]


###  RU Loss:  0.59231960943767
### TRAINING n_epoch: 2 ###


518it [12:42,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:46<00:00,  1.01s/it]


###  EN Loss:  0.368140086324363


448it [10:59,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:20<00:00,  1.00s/it]


###  RU Loss:  0.5045649231411516
### TRAINING n_epoch: 3 ###


518it [12:42,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:46<00:00,  1.01s/it]


###  EN Loss:  0.3524217285902989


448it [10:59,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:21<00:00,  1.01s/it]

###  RU Loss:  0.5266786667252226





In [None]:
model.eval()
torch.save(model.state_dict(), 'mBert_fine_tuned.pth')
s,matr,report,eval_loss = eval(ru_val_dataloader,convert_list_to_torch(ru_val_input_list)[8],reverse_map)

In [None]:
from google.colab import files
files.download("mBert_fine_tuned.pth")

In [None]:
batch = next(train_dataloader)
batch = tuple(t.to(device) for t in batch)
# model.to(device)

title_ids, title_mask, title_segment, input_ids, \
input_mask, segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch

with torch.no_grad():
  logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, \
                 input_mask, P_gauss1_list, P_gauss2_list, labels=None)
  loss_fct = CrossEntropyLoss()
  loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))

  

In [None]:
# p,r,f,s = precision_recall_fscore_support(y_pred=preds, y_true=labels, 
                                          # labels=[0,1,2,3,4,5], average="micro")
# print({"P": p, "R": r, "F1": f,})


array([2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2])

In [None]:
# torch.save(model.state_dict(), 'model_path')

# model.load_state_dict(torch.load('model_path'))
# model.eval()