In [1]:
!pip install transformers
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [3]:
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer, AdamW
from model_script import BertForSequenceClassification

from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler 

from torch.nn import CrossEntropyLoss

from sklearn import metrics
from sklearn.metrics import matthews_corrcoef, f1_score, precision_recall_fscore_support

from model_script import convert_tsv_to_model_input, convert_list_to_torch

In [4]:
num_labels = 6
max_seq_length = 128
batch_size = 32
num_epoch = 3
lr = 3e-5
val_batch_size = 64

label_map = {'false': 0, 'CPR:3': 1, 'CPR:4': 2, 'CPR:5': 3, 'CPR:6': 4, 'CPR:9': 5}
reverse_map = {0:'false', 1:'CPR:3', 2:'CPR:4', 3:'CPR:5', 4:'CPR:6', 5:'CPR:9'}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

optimizer = AdamW(model.parameters(), lr=lr)
model.train()
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [6]:
def eval(model,dataloader,all_label_ids,rev_dict):
  eval_loss = 0
  preds = []

  model.eval()
  print('###  EVALUATION  ###')
  for step,batch in enumerate(tqdm(dataloader)):
    batch = tuple(t.to(device) for t in batch)

    title_ids, title_mask, title_segment, input_ids, input_mask, segment_ids, \
      P_gauss1_list, P_gauss2_list, label_ids = batch

    with torch.no_grad():
      logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, \
                    input_mask, P_gauss1_list, P_gauss2_list, labels=None)
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))

      eval_loss+=loss.item()
      for i in range(len(logs.detach().cpu().numpy())):
        preds.append(logs.detach().cpu().numpy()[i])
    
  eval_loss = eval_loss/len(dataloader)

  preds = np.array(preds)
  preds = preds.argmax(axis=1)

  s = precision_recall_fscore_support(y_pred=[reverse_map[i] for i in preds], 
                                      y_true=[reverse_map[i] for i in all_label_ids.numpy()], 
                                      labels=[reverse_map[i] for i in range(6)], average="micro")
 
  matr = metrics.confusion_matrix(all_label_ids.numpy(), preds,labels=[0,1,2,3,4,5])

  report = metrics.classification_report([reverse_map[i] for i in all_label_ids.numpy()], 
                                      [reverse_map[i] for i in preds],
                                      labels=[reverse_map[i] for i in range(6)],
                                      output_dict=True)


  return(s,matr,report,eval_loss)

In [7]:
model_input_list = convert_tsv_to_model_input('train_en.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
train_data = TensorDataset(*convert_list_to_torch(model_input_list)) # * for turple unboxing 
en_train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
# Выводит количество брака в процентном соотношении от обрабатываемого датасета

model_input_list = convert_tsv_to_model_input('train_ru.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
train_data = TensorDataset(*convert_list_to_torch(model_input_list)) # * for turple unboxing 
ru_train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

en_val_input_list = convert_tsv_to_model_input('dev_en.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
en_val_data = TensorDataset(*convert_list_to_torch(en_val_input_list)) # * for turple unboxing 
en_val_dataloader = DataLoader(en_val_data, sampler=SequentialSampler(en_val_data), batch_size=val_batch_size)

ru_val_input_list = convert_tsv_to_model_input('dev_ru.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
ru_val_data = TensorDataset(*convert_list_to_torch(ru_val_input_list)) # * for turple unboxing 
ru_val_dataloader = DataLoader(ru_val_data, sampler=SequentialSampler(ru_val_data), batch_size=val_batch_size)


0.03779729022503925
0.11069674933184163
0.029879740980573544
0.1303752931978108


In [8]:
model.train()
print('### TRAINING  ###')
for i in range(num_epoch):
  print('### TRAINING EN '+'n_epoch: ' + str(i+1) +' ###')
  for step,batch_en in tqdm(enumerate(en_train_dataloader)):
    batch_en = tuple(t.to(device) for t in batch_en)

    if step%2==0:
      optimizer.zero_grad()
    title_ids, title_mask, title_segment, input_ids, input_mask, \
      segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch_en

    logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, input_mask, P_gauss1_list, P_gauss2_list, labels=None)
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))
    loss.backward()
    optimizer.step()

  _,_,_,l = eval(model,en_val_dataloader,convert_list_to_torch(en_val_input_list)[8],reverse_map)
  print('###  EN Loss: ',l)
  model.train()


  print('### TRAINING RU '+'n_epoch: ' + str(i+1) +' ###')
  for step,batch_ru in tqdm(enumerate(ru_train_dataloader)):
    batch_ru = tuple(t.to(device) for t in batch_ru)

    if step%2 == 0:
      optimizer.zero_grad()
    title_ids, title_mask, title_segment, input_ids, input_mask, \
      segment_ids, P_gauss1_list, P_gauss2_list, label_ids = batch_ru

    logs = model(title_ids, title_segment, title_mask, input_ids, segment_ids, input_mask, P_gauss1_list, P_gauss2_list, labels=None)
    loss_fct = CrossEntropyLoss()
    loss = loss_fct(logs.view(-1, num_labels), label_ids.view(-1))
    loss.backward()
    optimizer.step()
  
  _,_,_,l = eval(model,ru_val_dataloader,convert_list_to_torch(ru_val_input_list)[8],reverse_map)
  print('###  RU Loss: ',l)
  model.train()

### TRAINING  ###
### TRAINING EN n_epoch: 1 ###


518it [12:41,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:47<00:00,  1.02s/it]


###  EN Loss:  0.4012303423908789
### TRAINING RU n_epoch: 1 ###


448it [11:00,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:20<00:00,  1.01s/it]


###  RU Loss:  0.4732041056134871
### TRAINING EN n_epoch: 2 ###


518it [12:43,  1.47s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:47<00:00,  1.02s/it]


###  EN Loss:  0.3665001688584708
### TRAINING RU n_epoch: 2 ###


448it [11:01,  1.48s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:20<00:00,  1.01s/it]


###  RU Loss:  0.470027253563915
### TRAINING EN n_epoch: 3 ###


518it [12:45,  1.48s/it]


###  EVALUATION  ###


100%|██████████| 164/164 [02:47<00:00,  1.02s/it]


###  EN Loss:  0.4026299099148274
### TRAINING RU n_epoch: 3 ###


448it [11:01,  1.48s/it]


###  EVALUATION  ###


100%|██████████| 140/140 [02:21<00:00,  1.01s/it]

###  RU Loss:  0.5104897882656327





In [10]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [11]:
model.eval()
model.push_to_hub('mBert-relation-extraction-FT')

Cloning https://huggingface.co/Maklygin/mBert-relation-extraction-FT into local empty directory.
Adding files tracked by Git LFS: ['pytorch_model.bin']. This may take a bit of time if the files are large.


Upload file pytorch_model.bin:   0%|          | 3.34k/639M [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/Maklygin/mBert-relation-extraction-FT
   93aebdb..63e8e26  main -> main



'https://huggingface.co/Maklygin/mBert-relation-extraction-FT/commit/63e8e267f4fbcbc4c5c800e33880aac64a3b2807'

In [12]:
en_test_input_list = convert_tsv_to_model_input('test_en.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
en_test_data = TensorDataset(*convert_list_to_torch(en_test_input_list)) # * for turple unboxing 
en_test_dataloader = DataLoader(en_test_data, sampler=SequentialSampler(en_test_data), batch_size=val_batch_size)

0.052379686253655945


In [14]:
en_out = eval(model,en_test_dataloader,convert_list_to_torch(en_test_input_list)[8],reverse_map)

###  EVALUATION  ###


100%|██████████| 223/223 [03:30<00:00,  1.06it/s]


In [16]:
en_out

((0.8770342312008979, 0.8770342312008979, 0.8770342312008979, None),
 array([[10498,   106,   327,    42,    34,   133],
        [  242,   327,    32,     4,     0,     0],
        [  354,    25,  1107,     1,     7,     2],
        [   51,     1,     0,   106,     3,     1],
        [   56,     0,     1,     1,   174,     0],
        [  325,     0,     5,     0,     0,   291]]),
 {'CPR:3': {'f1-score': 0.6146616541353384,
   'precision': 0.7124183006535948,
   'recall': 0.540495867768595,
   'support': 605},
  'CPR:4': {'f1-score': 0.7459568733153639,
   'precision': 0.7520380434782609,
   'recall': 0.7399732620320856,
   'support': 1496},
  'CPR:5': {'f1-score': 0.6708860759493671,
   'precision': 0.6883116883116883,
   'recall': 0.654320987654321,
   'support': 162},
  'CPR:6': {'f1-score': 0.7733333333333334,
   'precision': 0.7981651376146789,
   'recall': 0.75,
   'support': 232},
  'CPR:9': {'f1-score': 0.5553435114503817,
   'precision': 0.6814988290398126,
   'recall': 0.46859

In [28]:
pd.read_csv('test_ru.tsv',skiprows=[0,3231],on_bad_lines='skip').to_csv('test_ru_fix.tsv',index=False)

In [37]:
df_val = pd.read_csv('dev_ru.tsv')
df_test = pd.read_csv('test_ru_fix.tsv',names=['index','title','sentense','label'])
pd.concat([df_val,df_test],ignore_index=True).to_csv('test_ru_plus_val.tsv',index=False)

In [38]:
ru_test_input_list = convert_tsv_to_model_input('test_ru_plus_val.tsv', tokenizer=tokenizer, max_seq_length=max_seq_length)
ru_test_data = TensorDataset(*convert_list_to_torch(ru_test_input_list)) # * for turple unboxing 
ru_test_dataloader = DataLoader(ru_test_data, sampler=SequentialSampler(ru_test_data), batch_size=val_batch_size)

0.12576140246620116


In [39]:
ru_out = eval(model,ru_test_dataloader,convert_list_to_torch(ru_test_input_list)[8],reverse_map)

###  EVALUATION  ###


100%|██████████| 184/184 [02:51<00:00,  1.07it/s]


In [40]:
ru_out

((0.8356699804571331, 0.8356699804571331, 0.8356699804571331, None),
 array([[8173,  109,  388,   39,   67,  111],
        [ 259,  273,   51,    2,    0,    1],
        [ 389,   31,  929,    0,    4,    2],
        [  61,    0,    3,   78,    3,    0],
        [  63,    0,    2,    0,  162,    0],
        [ 341,    1,    7,    0,    0,  220]]),
 {'CPR:3': {'f1-score': 0.546,
   'precision': 0.6594202898550725,
   'recall': 0.4658703071672355,
   'support': 586},
  'CPR:4': {'f1-score': 0.6793418647166362,
   'precision': 0.6731884057971015,
   'recall': 0.6856088560885609,
   'support': 1355},
  'CPR:5': {'f1-score': 0.5909090909090908,
   'precision': 0.6554621848739496,
   'recall': 0.5379310344827586,
   'support': 145},
  'CPR:6': {'f1-score': 0.6997840172786177,
   'precision': 0.6864406779661016,
   'recall': 0.7136563876651982,
   'support': 227},
  'CPR:9': {'f1-score': 0.4872646733111849,
   'precision': 0.6586826347305389,
   'recall': 0.3866432337434095,
   'support': 569},
