In [1]:
from utils import *
from transformers import AutoTokenizer
from transformers import BertForTokenClassification, AdamW, BertModel, BertConfig
import torch
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm, trange
from visual_test import *
from ner_evaluate import *



# 1. Load model

In [2]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices visible to this process (0 on a CPU-only machine).
n_gpu = torch.cuda.device_count()

In [3]:
# Tag names handed to the model set-up and to predict_text below.
# NOTE(review): the order presumably has to match the label indices used when
# the checkpoint was trained — confirm against the training notebook.
IO_tag_values = [
    "PER",
    "LOC",
    "ORG",
    "MISC",
    "O",
]

In [4]:
# Build a token-classification head on top of multilingual cased BERT, then
# overwrite its weights with the fine-tuned checkpoint stored on disk.
# NOTE(review): num_labels is one larger than the tag list — presumably an extra
# padding label was used during training; confirm against the training script.
IO_model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(IO_tag_values)+1,
    output_attentions = False,
    output_hidden_states = False
)
# map_location remaps tensors that were saved from a CUDA device onto `device`,
# so the checkpoint also loads on a CPU-only machine instead of raising.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
# strict=False tolerates missing/unexpected keys; the returned report (echoed as
# this cell's output) lists any keys that did not match.
IO_model.load_state_dict(
    torch.load('IO_BERT_MULTI.pt', map_location=device),
    strict=False,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

<All keys matched successfully>

In [5]:
# Tokenizer for the same checkpoint name as the model: the non-fast
# implementation is requested and lower-casing is disabled.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    do_lower_case=False,
    use_fast=False,
)

In [6]:
# Move the model's parameters and buffers onto the selected device.
# As the cell's last expression, the returned module is echoed as the output.
IO_model.to(device=device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

# 2. Predict

In [7]:

# Sample 1: a short multi-line comment used as input text.
text = """
Ch√∫c anh v√† bi·ªát ƒë·ªôi lu√¥n c√≥ th·∫≠t nhi·ªÅu s·ª©c kh·ªèe.üòö
M√£i sau d·ªãch m·ªõi ƒë∆∞·ª£c xem nh·ªØng k·ª≥ √°n c·ªßa anh üòé
M√† ch·ªã Tr√¢m Anh ƒë√¢u r·ªìi ·∫°? M·∫•y t·∫≠p kia e ko th·∫•y ..
"""
# Run predict_text on the sample and hand its result straight to visualize.
visualize(
    predict_text(
        IO_model,
        tokenizer,
        IO_tag_values,
        text,
    )
)

In [8]:
# Sample 2: a quoted sentence followed by a name-like line.
text = """
"H√†nh vi c·ªßa ch·ªã ƒë√£ b·ªã t√¥i ph√°t hi·ªán"t·∫∑c
Vinh L·ªëc Xo√°yü§£
"""
# Run predict_text on the sample and hand its result straight to visualize.
visualize(
    predict_text(
        IO_model,
        tokenizer,
        IO_tag_values,
        text,
    )
)

In [9]:
# Sample 3: a single-line comment used as input text.
text = """
Duy T√¢n L√™ Ng·ªçc L·ª£i oke anh. Nh·ªõ cho em c√°i v·ªã tr√≠ n√†o ch·∫°y nhi·ªÅu nha üôÇ Em kh√¥ng l√†m th·ªß m√¥n ƒë√¢u :v
"""
# Run predict_text on the sample and hand its result straight to visualize.
visualize(
    predict_text(
        IO_model,
        tokenizer,
        IO_tag_values,
        text,
    )
)

In [10]:
# Sample 4: a line that mixes title-case and upper-case words.
text = """
Ken kh√¥ng ·ªû Trong Team 4 ng∆∞·ªùi n·ªØa √† anh : ANH VINH , TR√ÇM ANH , TLOO , KEN ƒê√≥
"""
# Run predict_text on the sample and hand its result straight to visualize.
visualize(
    predict_text(
        IO_model,
        tokenizer,
        IO_tag_values,
        text,
    )
)

In [11]:
# Sample 5: a longer, multi-paragraph passage (blank lines are part of the input).
text = '''
Gi√° th·ª±c ph·∫©m t∆∞∆°i s·ªëng, s·ªØa, g·∫°o, d·∫ßu ƒÉn tƒÉng 10-30%, d·ª± b√°o tƒÉng ti·∫øp tr∆∞·ªõc s·ª©c √©p c·ªßa nguy√™n li·ªáu ƒë·∫ßu v√†o, chi ph√≠ v·∫≠n chuy·ªÉn l√™n cao.

Ch·ªã Loan, c√¥ng nh√¢n m·ªôt c√¥ng ty may m·∫∑c ·ªü qu·∫≠n B√¨nh T√¢n (TP HCM) cho bi·∫øt, th√°ng 10, gia ƒë√¨nh ch·ªã chi ti√™u tƒÉng 20% so v·ªõi khi ch∆∞a c√≥ d·ªãch v√† tƒÉng 10% so v·ªõi th·ªùi ƒëi·ªÉm b√πng d·ªãch.

"N·∫øu tr∆∞·ªõc d·ªãch, m·ªôt b√¨nh gas 12 kg ch·ªâ 340.000 ƒë·ªìng, nay tƒÉng l√™n 500.000 ƒë·ªìng. Ti·ªÅn xƒÉng xe tƒÉng th√™m 100.000 ƒë·ªìng m·ªôt th√°ng. Gi√° c√°c m·∫∑t h√†ng nh∆∞ s·ªØa, g·∫°o, th·ª±c ph·∫©m c≈©ng tƒÉng cao khi·∫øn chi ph√≠ ƒëi ch·ª£ m·ªói tu·∫ßn tƒÉng g·∫ßn 500.000 ƒë·ªìng....", ch·ªã Loan t√≠nh to√°n v√† cho r·∫±ng v·ªõi t√¨nh h√¨nh n√†y, gia ƒë√¨nh ch·ªã nƒÉm nay l√†m kh√¥ng c√≥ d∆∞.

Ch·ªã H·∫±ng ·ªü qu·∫≠n G√≤ V·∫•p c≈©ng kh√° lo l·∫Øng khi s·ªØa b·ªôt cho em b√© ƒëang tƒÉng cao so v·ªõi tr∆∞·ªõc ƒë√¢y. H·∫ßu h·∫øt c√°c lo·∫°i s·ªØa nh·∫≠p ƒë·ªÅu tƒÉng 10.000-15.000 ƒë·ªìng m·ªôt h·ªôp. M·ªói th√°ng con ch·ªã u·ªëng 3 h·ªôp, chi ph√≠ tƒÉng th√™m 45.000 ƒë·ªìng. "Th·ªãt c√°, rau c·ªß c≈©ng tƒÉng r·∫•t m·∫°nh 10-20% so v·ªõi tr∆∞·ªõc ƒë√¢y", ch·ªã n√≥i.
    '''
# Run predict_text on the sample and hand its result straight to visualize.
visualize(
    predict_text(
        IO_model,
        tokenizer,
        IO_tag_values,
        text,
    )
)

In [12]:
# Call the test_visualize helper, which reaches this notebook through one of the
# star imports in the first cell (presumably visual_test — TODO confirm).
test_visualize()