In [2]:
import os
from typing import List, Dict
import numpy as np

In [3]:
def create_bies_format(line):
    '''
    Convert a space-segmented sentence into its BIES tag string.

    Every non-space character receives exactly one tag:
    's' for a single-character word, 'b' for the first character of a
    longer word, 'i' for its inner characters and 'e' for its last one.
    Spaces (single or repeated, leading or trailing) only separate words
    and never produce a tag; newline characters are removed first.

    @param line = the input string, with words separated by ' '
    @return the BIES format of the input line ('' for an empty or blank line)
    '''
    tags = []
    # Split on the plain space only (not on every whitespace character), so
    # the number of tags equals the number of characters left after removing
    # just ' ' from the sentence.
    for word in line.replace('\n', "").split(' '):
        if not word:
            # Empty pieces come from consecutive, leading or trailing spaces.
            continue
        if len(word) == 1:
            tags.append('s')
        else:
            tags.append('b' + 'i' * (len(word) - 2) + 'e')
    return "".join(tags)


In [4]:
def create_bies_lines(path):
    '''
    Read a segmented text file and build the BIES tag string of every line.
    Each line is stripped of surrounding whitespace before being converted.
    @param path = the path of the UTF-8 file to convert
    @return a list with one BIES string per line of the file
    '''
    with open(path,'r',encoding='utf8') as source:
        # The context manager closes the file when the block is left.
        return [create_bies_format(raw.strip()) for raw in source.readlines()]

In [5]:
def delete_spaces_file(path):
    '''
    Load a text file and return its lines with every ' ' character removed.
    Surrounding whitespace (line break included) is stripped from each line first.
    @param path = the path of the UTF-8 input file
    @return a list with one space-free sentence per line of the file
    '''
    with open(path,'r',encoding='utf8') as source:
        stripped_lines = [raw.strip() for raw in source.readlines()]
    return [text.replace(" ","") for text in stripped_lines]

In [6]:
def create_file(path,bies_lines):
    '''
    Write a list of strings to a text file, one string per line.
    Used both for the label (BIES) file and for the tensor-input file.

    The file is written as UTF-8, the same encoding used to read the input
    files, instead of the platform default encoding (which may be unable to
    encode the Chinese text).

    @param path = the path where to save the file (opened in 'w' mode, so an
                  existing file is overwritten)
    @param bies_lines = the array containing the sentences to write
    '''
    with open(path,'w',encoding='utf8') as f:
        # The context manager closes the file; no explicit close() is needed.
        f.writelines(line + '\n' for line in bies_lines)

In [7]:
# Define the paths of the files:
# the segmented training corpus (input), and the two files generated from it
# (BIES labels and sentences without spaces).
path_training_input = "../resources/icwb2-data/training/msr_training.utf8"
path_training_label = "../resources/icwb2-data/training/bies_msr_training.utf8"
path_training_tensor = "../resources/icwb2-data/training/tensor_msr_training.utf8"


# Build the arrays to be written into the files to create:
# one BIES string and one space-free sentence per line of the input file.
bies_result = create_bies_lines(path_training_input)
tensor_result = delete_spaces_file(path_training_input)

# Create the respective files
create_file(path_training_label,bies_result)
create_file(path_training_tensor,tensor_result)

In [8]:
def split_into_ngrams(sentence: str, n : int):
    """
    Produce every contiguous n-gram of a sentence, in order of appearance
    :param sentence Sentence as str
    :param n Length of each n-gram
    :return a list of the overlapping n-grams (empty when the sentence is
            shorter than n)
    """
    window_count = len(sentence)-(n-1)
    return [sentence[start:start+n] for start in range(window_count)]
# Sanity check: show the 1-grams (single tags) of the first BIES label line.
print(split_into_ngrams(bies_result[0],1))

['s', 'b', 'e', 's', 's', 'b', 'e', 's', 's', 's', 'b', 'i', 'e', 's', 's', 's', 's', 's', 's', 'b', 'e', 's', 's', 'b', 'i', 'i', 'e', 's', 'b', 'i', 'e', 's', 's', 'b', 'e', 's', 'b', 'i', 'i', 'e', 's', 's', 's', 's', 'b', 'e', 's', 's']


In [9]:
def make_vocab(sentences,n):
    '''
    Build an n-gram -> index vocabulary from a list of sentences.
    Index 0 (stored under the integer key 0) and index 1 (key "UNK") are
    reserved; the n-grams are numbered from 2 in order of first appearance.
    :param sentences List of sentences used to build the vocab
    :param n Size of the n-grams to index
    :return vocab Dictionary that has as key the ngram and as a value the index
    '''
    vocab = {0:0,"UNK": 1}
    for sentence in sentences:
        for ngram in split_into_ngrams(sentence,n):
            # setdefault inserts only unseen n-grams; len(vocab) is evaluated
            # before the insertion, so it is the next free index.
            vocab.setdefault(ngram, len(vocab))
    return vocab

In [10]:

# Create the unigram vocab for the tensor (space-free) sentences
tensor_unigrams = make_vocab(tensor_result,1)
# Create the bigram vocabs for both the tensor sentences and the BIES labels,
# plus the unigram vocab of the BIES labels
tensor_bigrams = make_vocab(tensor_result,2)
bies_bigrams = make_vocab(bies_result,2)
bies_unigrams = make_vocab(bies_result,1)

In [11]:

# Preview the vocabularies: first ten entries and total size of each.
print("Unigrammi: ", list(tensor_unigrams.items())[:10])
print("lunghezza unigrammi: ", len(tensor_unigrams))
print("Bigrammi: ", list(tensor_bigrams.items())[:10])
print("lunghezza bigrammi: ", len(tensor_bigrams))
# Index assigned to one specific bigram (raises KeyError if it is not in the vocab).
print(tensor_bigrams['人们'])
# Full content of the BIES tag vocabulary.
print("bies unigrammi: ",bies_unigrams.items())

Unigrammi:  [(0, 0), ('UNK', 1), ('“', 2), ('人', 3), ('们', 4), ('常', 5), ('说', 6), ('生', 7), ('活', 8), ('是', 9)]
lunghezza unigrammi:  5169
Bigrammi:  [(0, 0), ('UNK', 1), ('“人', 2), ('人们', 3), ('们常', 4), ('常说', 5), ('说生', 6), ('生活', 7), ('活是', 8), ('是一', 9)]
lunghezza bigrammi:  426612
3
bies unigrammi:  dict_items([(0, 0), ('UNK', 1), ('s', 2), ('b', 3), ('e', 4), ('i', 5)])


In [12]:
def create_features_vector(sentences,unigram_vocab,bigram_vocab):
    '''Method that creates the feature vectors to pass as the LSTM input.

        Every character of a sentence becomes a pair
        [unigram index, bigram index], where the bigram is the character
        together with the one that follows it. The last character has no
        following character, so its bigram index is 0.

        An n-gram missing from a vocab is mapped to that vocab's "UNK" index
        when the vocab has one; otherwise a KeyError is raised.

        @param sentences the array of chinese phrases
        @param unigram_vocab the vocab used for the chinese unigrams
        @param bigram_vocab the vocab used for the chinese bigrams
        @return a list with, for each sentence, the list of its
                [unigram index, bigram index] pairs
        @raise KeyError if an n-gram is unknown and the vocab has no "UNK" entry
    '''
    def _index_of(vocab, ngram):
        # Fall back to the reserved "UNK" entry for out-of-vocabulary n-grams.
        if ngram in vocab:
            return vocab[ngram]
        if "UNK" in vocab:
            return vocab["UNK"]
        raise KeyError(ngram)

    features_vector = []
    for sentence in sentences:
        last_position = len(sentence) - 1
        vec_sentence = []
        for i, char in enumerate(sentence):
            unigram_index = _index_of(unigram_vocab, char)
            if i == last_position:
                # insert 0 as bigram of last character position
                bigram_index = 0
            else:
                bigram_index = _index_of(bigram_vocab, sentence[i:i+2])
            vec_sentence.append([unigram_index, bigram_index])
        features_vector.append(vec_sentence)
    return features_vector
            

In [13]:
# Show the unigrams and bigrams of the first space-free sentence with their counts.
uni = split_into_ngrams(tensor_result[0],1)
bi = split_into_ngrams(tensor_result[0],2)
print("unigrammi: ",uni,len(uni))
print("bigrammi: ",bi,len(bi))

unigrammi:  ['“', '人', '们', '常', '说', '生', '活', '是', '一', '部', '教', '科', '书', '，', '而', '血', '与', '火', '的', '战', '争', '更', '是', '不', '可', '多', '得', '的', '教', '科', '书', '，', '她', '确', '实', '是', '名', '副', '其', '实', '的', '‘', '我', '的', '大', '学', '’', '。'] 48
bigrammi:  ['“人', '人们', '们常', '常说', '说生', '生活', '活是', '是一', '一部', '部教', '教科', '科书', '书，', '，而', '而血', '血与', '与火', '火的', '的战', '战争', '争更', '更是', '是不', '不可', '可多', '多得', '得的', '的教', '教科', '科书', '书，', '，她', '她确', '确实', '实是', '是名', '名副', '副其', '其实', '实的', '的‘', '‘我', '我的', '的大', '大学', '学’', '’。'] 47


In [14]:
def create_label_vector(labels,bies_vocab):
    '''Method that creates the label vectors used as the LSTM output.
        Every tag of a label string is replaced by a one-element list
        holding its index in bies_vocab.
        @param labels the array of bies label
        @param bies_vocab vocab with unigram label
        @return the vector result: one list per label string, one [index] per tag
    '''
    return [
        [[bies_vocab[tag]] for tag in split_into_ngrams(lab,1)]
        for lab in labels
    ]
        

In [15]:
# Encode every space-free sentence as a list of [unigram index, bigram index] pairs.
features_vector = create_features_vector(tensor_result,tensor_unigrams,tensor_bigrams)

In [16]:
# Encode every BIES label string as a list of [tag index] entries.
label_vector = create_label_vector(bies_result,bies_unigrams)

In [17]:
# Inspect the sentence at index 1023: its features (X) and labels (Y), each with its length.
# NOTE(review): the printed text says "Prima frase" (first sentence) although index 1023 is shown.
print("Prima frase X-->",features_vector[1023],len(features_vector[1023]))
print("Prima frase Y-->",label_vector[1023],len(label_vector[1023]))


Prima frase X--> [[327, 15546], [410, 15547], [905, 15548], [1938, 15549], [313, 15550], [1080, 10262], [1605, 10263], [113, 1562], [77, 15551], [701, 3531], [573, 15552], [701, 1879], [227, 15553], [310, 1707], [652, 0]] 15
Prima frase Y--> [[3], [5], [4], [3], [4], [3], [4], [3], [4], [3], [5], [5], [4], [3], [4]] 15
[]
66
66
sbiebebebiiiesbebebiiiiiiebesbiiiiebiiiebebesbebessbiesbebebebebes
由意大利无偿援助三百万美元、国家投资六百多万元人民币修建的西藏急救中心七月十八日开始运行，中心设立的『１２０』急救专线电话同时开通。


In [18]:
def find_max_length(features_vector):
    '''
    Return the length of the longest element of features_vector
    @param features_vector the vector created for the LSTM input
    @return the max length of an element of the features_vector
            (0 when the vector is empty)
    '''
    return max((len(sentence) for sentence in features_vector), default=0)

In [19]:
# Find the length of the longest sentence in the features_vector
max_length = find_max_length(features_vector)
print(max_length)
# Same for the labels
max_label = find_max_length(label_vector)
print(max_label)

581
581


In [20]:
# From here on, NumPy is used to hold the data.
# The sentences have different lengths, so the nested lists are ragged:
# dtype=object has to be requested explicitly (NumPy >= 1.24 raises a
# ValueError otherwise) and yields a 1-D array with one Python list per sentence.
X = np.array(features_vector, dtype=object)
Y = np.array(label_vector, dtype=object)

In [23]:
def add_pads_array(vec,max_length):
    '''
    Pad every sentence of vec with zero values up to max_length.
    Only the first axis (the positions of the sentence) is padded, at the
    end; any further axis (e.g. the values stored for each position) keeps
    its size.
    @param vec the vector that contains the values (one array-like per sentence)
    @param max_length the max length of an element of the vector
    @return a list with, for each sentence, a one-element list holding the
            padded numpy array
    @raise ValueError (from np.pad) if a sentence is longer than max_length
    '''
    sentences = []
    for x in vec:
        arr = np.asarray(x)
        # A bare (0, k) pad width is applied by np.pad to every axis, which
        # would also widen the per-position axis; pad the first axis only.
        pad_width = [(0, max_length - len(x))] + [(0, 0)] * (arr.ndim - 1)
        sentences.append([np.pad(arr, pad_width, mode='constant')])
    return sentences