In [19]:
import re
def split_string(line, split_symbol):
    """
    Split a string on a separator and discard the empty pieces.

    :param line: the string to split
    :param split_symbol: the separator string
    :return: list of the non-empty substrings, in their original order
    """
    # Consecutive, leading or trailing separators produce "" pieces; drop them.
    return [piece for piece in line.split(split_symbol) if piece]


def read_standard_file(data):
    """
    Parse annotated text into sentences, sentence labels and annotation lines.

    A record is one sentence line holding two TAB-separated fields (the
    sentence text and an integer label) followed by zero or more annotation
    lines, each starting with "[[".

    :param data: the full annotated text as one string (file content, not a path)
    :return: sent_col, sent_label_col and final_label_col -- three parallel
        lists holding, per record, the sentence text, its integer label and
        the list of its raw annotation lines
    :raises ValueError: if a sentence line does not yield exactly two
        non-empty TAB-separated fields, or its label is not an integer
    """
    sent_col, sent_label_col, final_label_col = [], [], []
    last_sentence = ""
    # Bound before the loop so an annotation line that precedes every
    # sentence cannot hit an unassigned local; such orphan lines are ignored.
    label_col = []

    def _flush(sentence_line, labels):
        # Store one finished record; an empty pending sentence means there is
        # nothing to store (start of input, or right after a blank line).
        if sentence_line == "":
            return
        cur_sent, cur_sent_label = split_string(sentence_line, "\t")
        sent_col.append(cur_sent)
        sent_label_col.append(int(cur_sent_label))
        final_label_col.append(labels)

    for line in data.split('\n'):
        # Also drop a carriage return so CRLF input parses like LF input.
        line = line.rstrip('\r\n')
        # "[[" denote the begin of sequence label.
        if line[:2] == "[[":
            label_col.append(line)
        else:
            # Any non-annotation line (including a blank one) closes the
            # record that was being collected.
            _flush(last_sentence, label_col)
            last_sentence = clear_string(line, replace_symbol={u'\u3000': u""})
            label_col = []

    # The input may end without a trailing newline/blank line; without this
    # final flush the last record would be silently dropped.
    _flush(last_sentence, label_col)

    return sent_col, sent_label_col, final_label_col

def clear_string(line, strip_symbol=None, replace_symbol=None):
    """
    Strip and/or replace special symbols in a string.

    :param line: a string
    :param strip_symbol: iterable of strings; each one is passed in turn to
        ``str.strip`` (stripping is applied before replacement)
    :param replace_symbol: either an iterable of substrings to delete, or a
        dict ``{old: new}`` whose values are used as the replacement text
    :return: the cleaned string
    """
    if strip_symbol is not None:
        for sym in strip_symbol:
            line = line.strip(sym)

    if replace_symbol is not None:
        if isinstance(replace_symbol, dict):
            # Honour the mapping's values; previously they were ignored and
            # every key was replaced with the empty string.
            for sym, replacement in replace_symbol.items():
                line = line.replace(sym, replacement)
        else:
            for sym in replace_symbol:
                line = line.replace(sym, "")

    return line

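# Example data: each record is a sentence line ("<sentence><TAB><integer label>")
# followed by one or more annotation lines that start with "[[".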
data = """
The lens is n't too great either , but it only shows it 's faults when it 's pushed to it 's extremes . 	0
[[];[];[];[];[]]
The battery life is consistent with all of the Nikon DLSR models - it lasts forever . 	1
[[7&&all 8&&of 9&&the 10&&Nikon 11&&DLSR 12&&models];[7&&all 8&&of 9&&the 10&&Nikon 11&&DLSR 12&&models];[2&&battery 3&&life];[5&&consistent];[0]]
The lens is n't too great either , but it only shows it 's faults when it 's pushed to it 's extremes . 	0
[[];[];[];[];[]]
This camera is ready to go when I am . 	0
[[];[];[];[];[]]
It takes very decent pictures in low light basketball gyms without the flash on . 	0
[[];[];[];[];[]]
Besides this is digital . . 	0
[[];[];[];[];[]]
I finally found the Black body for $ 862 from my search on Shopping.com . 	0
[[];[];[];[];[]]
And sure it 's no Canon XL-2 video camera but with 30fps VGA ( 640x480 ) video with audio moving pictures are smooth . 	0
[[];[];[];[];[]]
Both iXUS 40 and 65 are Made in Japan . 	1
[[2&&iXUS 3&&40];[5&&65];[];[7&&Made 8&&in 9&&Japan];[0]]
In fact it is remarkably similar to manual cameras of old , and in my opinion that is a high complement . 	1
[[3&&it];[8&&manual 9&&cameras 10&&of 11&&old];[];[6&&similar];[0]]
The issues of redeye remain . 	0
[[];[];[];[];[]]
I chose this camera over even those models because it is a perfect fit for me and I think it is a perfect fit for anyone no matter what you are looking for in a camera . 	1
[[3&&this 4&&camera];[6&&even 7&&those 8&&models];[];[5&&over];[1]]
I knew what I wanted - a camera that took great ( clear , focused , lit well , properly white balanced ) photos in all lighting conditions , had a long lasting battery , image stabilization , had enough options that I could fine tune my photography to the extent that I knew how , and took decent videos . 	0
[[];[];[];[];[]]
Since I had a film SLR from years ago and then bought a Sony point-and-shoot , the Rebel was n't hard to learn . 	0
[[];[];[];[];[]]
In truth , the focus system is good : better than most of the systems that I 've seen . 	1
[[5&&focus 6&&system];[12&&most 13&&of 14&&the 15&&systems];[];[10&&better];[1]]
For self-photographers - A note on the wireless remote The wireless remote for the 5D is significantly more expensive and less portable than that of the 350D . 	1
[[11&&wireless 12&&remote 13&&for 14&&the 15&&5D];[24&&that 25&&of 26&&the 27&&350D];[];[21&&less 22&&portable];[-1]]
[[11&&wireless 12&&remote 13&&for 14&&the 15&&5D];[24&&that 25&&of 26&&the 27&&350D];[];[18&&more 19&&expensive];[-1]]
Other significant improvements vs. 20D , can be found in the tech sheet . 	1
[[];[5&&20D];[];[3&&improvements];[-1]]
The features in this camera are exceptional and the buttons are well placed . 	0
[[];[];[];[];[]]
"""

sent_col, sent_label_col, final_label_col = read_standard_file(data)

# Show every parsed record: the sentence text, its integer label and its raw
# annotation lines, with one blank line between consecutive records.
for sent, label, final_label in zip(sent_col, sent_label_col, final_label_col):
    print("Sentence:", sent)
    print("Label:", label)
    print("Final Label:", final_label, end="\n\n")

Sentence: The lens is n't too great either , but it only shows it 's faults when it 's pushed to it 's extremes . 
Label: 0
Final Label: ['[[];[];[];[];[]]']

Sentence: The battery life is consistent with all of the Nikon DLSR models - it lasts forever . 
Label: 1
Final Label: ['[[7&&all 8&&of 9&&the 10&&Nikon 11&&DLSR 12&&models];[7&&all 8&&of 9&&the 10&&Nikon 11&&DLSR 12&&models];[2&&battery 3&&life];[5&&consistent];[0]]']

Sentence: The lens is n't too great either , but it only shows it 's faults when it 's pushed to it 's extremes . 
Label: 0
Final Label: ['[[];[];[];[];[]]']

Sentence: This camera is ready to go when I am . 
Label: 0
Final Label: ['[[];[];[];[];[]]']

Sentence: It takes very decent pictures in low light basketball gyms without the flash on . 
Label: 0
Final Label: ['[[];[];[];[];[]]']

Sentence: Besides this is digital . . 
Label: 0
Final Label: ['[[];[];[];[];[]]']

Sentence: I finally found the Black body for $ 862 from my search on Shopping.com . 
Label: 0
Fin

In [22]:
class LabelParser(object):
    """
    Convert raw "[[...];[...];[...];[...];[...]]" annotation lines into index
    tuples, one group of annotation lines per sentence.
    """

    def __init__(self, label_col, elem_col, intermittent=False):
        """
        :param label_col: list with one entry per sentence; each entry is the
            list of raw annotation lines belonging to that sentence
        :param elem_col: ["entity_1", "entity_2", "aspect", "result"]
        :param intermittent: True denote "result" using intermittent representation
        """
        self.label_col = label_col
        self.elem_col = elem_col
        self.intermittent = intermittent

    def parse_sequence_label(self, split_symbol="&", sent_col=None, file_type="vn"):
        """
        Parse the annotation lines of every sentence.

        :param split_symbol: separator between the position number and the
            token inside one "number<sep>token" item
        :param sent_col: optional list of sentences parallel to ``label_col``;
            only forwarded to ``parse_each_pair_label``
        :param file_type: "cn" selects the "cn" index convention, any other
            value the default one (see ``parse_each_pair_label``)
        :return: (elem_representation_col, tuple_pair_col) -- per sentence, a
            dict of element-name -> set of index tuples, and the list of
            parsed tuples (one per annotation line)
        """
        null_label = "[[];[];[];[];[]]"
        tuple_pair_col, elem_representation_col = [], []

        for index, sequence_labels in enumerate(self.label_col):
            # For non-comparative sentences' label. A sentence without any
            # annotation line is handled the same way instead of failing on [0].
            if len(sequence_labels) == 0 or sequence_labels[0] == null_label:
                tuple_pair_col.append([[(-1, -1)] * 5])
                elem_representation_col.append(self.init_label_representation())
                continue

            # sent_col is optional; do not index into None.
            cur_sent = sent_col[index] if sent_col is not None else None

            global_elem_col = self.init_label_representation()
            sequence_tuple_pair = []
            for pair_label in sequence_labels:
                global_elem_col, cur_tuple_pair = self.parse_each_pair_label(
                    pair_label, global_elem_col, split_symbol, cur_sent, file_type
                )
                sequence_tuple_pair.append(cur_tuple_pair)

            tuple_pair_col.append(sequence_tuple_pair)
            elem_representation_col.append(global_elem_col)

        return elem_representation_col, tuple_pair_col

    def parse_each_pair_label(self, sequence_label, global_elem_col, split_symbol, sent=None, file_type="vn"):
        """
        Parse one annotation line into index tuples.

        :param sequence_label: one raw annotation line, e.g.
            "[[2&&iXUS 3&&40];[5&&65];[];[7&&Made 8&&in 9&&Japan];[0]]"
        :param global_elem_col: dict of element-name -> set, updated in place
            with the tuples found in this line
        :param split_symbol: separator between position number and token
        :param sent: the sentence this line belongs to; currently not read
            (kept so a position/sentence consistency check can be added)
        :param file_type: "cn" keeps the start index and adds 1 to the end
            index; any other value subtracts 1 from the start index and keeps
            the end index
        :return: (global_elem_col, tuple_pair_representation) where the second
            item is the list of tuples for this line, the last one being
            (polarity, polarity)
        """
        elem_representation = split_string(sequence_label[1:-1], ";")

        tuple_pair_representation, result_elem = [], []
        last_elem_index = len(elem_representation) - 1

        for elem_index, each_elem in enumerate(elem_representation):
            inner_text = each_elem[1:-1]
            if self.intermittent:
                seg_elem_col = split_string(inner_text, " , ")
            else:
                seg_elem_col = [inner_text] if inner_text != "" else []

            elem_tuple = ()

            # not polarity
            if elem_index != last_elem_index:
                for each_seg_elem in seg_elem_col:
                    number_char_col = split_string(each_seg_elem, " ")
                    first_number = int(split_string(number_char_col[0], split_symbol)[0])
                    last_number = int(split_string(number_char_col[-1], split_symbol)[0])

                    if file_type == "cn":
                        s_index, e_index = first_number, last_number + 1
                    else:
                        s_index, e_index = first_number - 1, last_number

                    elem_tuple += (s_index, e_index)

                    if self.elem_col[elem_index] == "result":
                        result_elem += [s_index, e_index]

            else:
                polarity = int(seg_elem_col[0])

                elem_tuple += (polarity, polarity)

                # The result span may be empty (seen in the English data);
                # use a (-1, -1) placeholder before appending the polarity.
                if len(result_elem) == 0:
                    result_elem = [-1, -1]

                result_elem.append(polarity)

            elem_tuple = (-1, -1) if len(elem_tuple) == 0 else elem_tuple
            tuple_pair_representation.append(elem_tuple)

            # Only the first three elements are collected here; the result
            # (with its polarity) is added once below.
            if elem_index < 3 and elem_tuple != (-1, -1):
                global_elem_col[self.elem_col[elem_index]].add(elem_tuple)

        global_elem_col["result"].add(tuple(result_elem))

        return global_elem_col, tuple_pair_representation

    @staticmethod
    def get_sub_elem(number_char_col, split_symbol):
        """
        Concatenate the token parts of a list of "number<sep>token" items.

        :param number_char_col: list of "number<sep>token" strings
        :param split_symbol: separator between number and token
        :return: the tokens joined without any separator
        """
        elem_str = ""
        for num_char in number_char_col:
            elem_str += split_string(num_char, split_symbol)[1]

        return elem_str

    def init_label_representation(self):
        """Return a fresh dict mapping every name in ``elem_col`` to an empty set."""
        return {key: set() for key in self.elem_col}
    
# Re-parse the example data, then turn the raw annotation lines into index tuples.
sent_col, sent_label_col, label_col = read_standard_file(data)
LP = LabelParser(
    label_col=label_col,
    elem_col=["entity_1", "entity_2", "aspect", "result"],
)
# NOTE: label_col is rebound here to the per-sentence element representation.
label_col, tuple_pair_col = LP.parse_sequence_label(split_symbol="&", sent_col=sent_col)
print(tuple_pair_col)

[[[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(6, 12), (6, 12), (1, 3), (4, 5), (0, 0)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(1, 3), (4, 5), (-1, -1), (6, 9), (0, 0)]], [[(2, 3), (7, 11), (-1, -1), (5, 6), (0, 0)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(2, 4), (5, 8), (-1, -1), (4, 5), (1, 1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]], [[(4, 6), (11, 15), (-1, -1), (9, 10), (1, 1)]], [[(10, 15), (23, 27), (-1, -1), (20, 22), (-1, -1)], [(10, 15), (23, 27), (-1, -1), (17, 19), (-1, -1)]], [[(-1, -1), (4, 5), (-1, -1), (2, 3), (-1, -1)]], [[(-1, -1), (-1, -1), (-1, -1), (-1, -1), (-1, -1)]]]


In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the "vinai/phobert-base-v2" checkpoint and its matching tokenizer.
# NOTE(review): the recorded warning says the pooler weights are newly
# initialized, so pooler_output comes from untrained weights.
phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
# (syllables of one word are joined with "_", as in the example below)
sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

# Encode the sentence to token ids and wrap them in a list to form a batch of one.
input_ids = torch.tensor([tokenizer.encode(sentence)])

# Forward pass only; gradient tracking is disabled.
with torch.no_grad():
    features = phobert(input_ids)  # recorded output is a BaseModelOutputWithPoolingAndCrossAttentions (last_hidden_state, pooler_output)

print(features)

## With TensorFlow 2.0+:
## (note: this snippet names "vinai/phobert-base", not the "-v2" checkpoint loaded above)
# from transformers import TFAutoModel
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 3.8363e-02,  7.0703e-01, -1.3202e-01,  ..., -9.7445e-02,
           2.5193e-01,  3.4828e-01],
         [ 2.1041e-01,  2.3984e-01,  9.1063e-03,  ..., -3.2403e-04,
          -1.7492e-01,  4.0128e-02],
         [ 2.3745e-01,  9.8406e-03, -1.6509e-01,  ..., -4.3379e-02,
          -7.5782e-02,  4.6839e-02],
         ...,
         [ 2.3041e-01,  3.7583e-01,  1.7601e-02,  ...,  7.8471e-02,
           1.8661e-01,  5.2052e-02],
         [-2.3021e-01,  5.0276e-01,  1.0913e-01,  ..., -7.3261e-02,
           1.4339e-01,  1.8320e-01],
         [ 1.8813e-01,  6.2870e-01, -2.4809e-01,  ..., -4.8114e-02,
           1.6404e-01,  4.7204e-01]]]), pooler_output=tensor([[ 1.7816e-01, -6.8690e-02, -6.7991e-02,  3.0687e-02,  1.2191e-01,
          2.0043e-02,  2.0911e-01,  9.5309e-02, -1.6093e-01,  1.4772e-01,
          1.2147e-02, -5.2535e-02, -8.0609e-02, -6.4411e-02,  1.3046e-01,
         -2.6795e-02, -8.2068e-02, -9.5695e-02,  1.393