In [37]:
import tensorflow as tf
import numpy as np

In [355]:
import json
import string
from string import punctuation

In [36]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.test.utils import datapath

model2 = gensim.models.KeyedVectors.load_word2vec_format('D:/Word embedding/GoogleNews-vectors-negative300.bin',binary=True)

In [180]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

import evaluate
import sys
import os

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    l = []
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        l.append(subtree.leaves())
    
    return l

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    #word = stemmer.stem(word)
    #word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted


def get_terms(tree):
    kp = []
    for leaf in leaves(tree):
        term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
        if term:
            kp.append(term)
    
    return kp


In [497]:
def get_kp(text):
    
    # Used when tokenizing words
    sentence_re = r'''(?x)      # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        '''

    #lemmatizer = nltk.WordNetLemmatizer()
    #stemmer = nltk.stem.porter.PorterStemmer()

    #Taken from Su Nam Kim Paper
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """

    #toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(text)
    chunker = nltk.RegexpParser(grammar)
    tree = chunker.parse(postoks)
    terms = get_terms(tree)
    pos,pos_set = find_positions(text,terms)
    return terms,pos,pos_set

In [182]:
data = 'Inspec/docsutf8/'

In [604]:
keys = 'Inspec/keys/'

In [5]:
files = os.listdir(data)
key_files = os.listdir(keys)

In [1399]:
def find_positions(document, kps):
    ''' 
    merge the same kps & keep present kps in document
    Inputs:
        document : a word list : ['sun', 'sunshine', ...] || lower cased
        kps : can have more than one kp : [['sun'], ['key','phrase'], ['sunshine']] || not duplicate
    Outputs:
        all_present_kps : present keyphrases
        positions_for_all : start_end_posisiton for prensent keyphrases
        a present kp postions list : every present's positions in documents, 
        each kp can be presented in several postions .
        [[[0,0],[20,21]], [[1,1]]]
    '''
    tot_doc_char = ' '.join(document)
    
    positions_for_all = []
    position_start,position_end =[],[]
    all_present_kps = []
    for kp in kps:
        ans_string = ' '.join(kp)
        
        if ans_string not in tot_doc_char:
            continue
        else: 
            positions_for_each = []
            # find all positions for each kp
            for i in range(0, len(document) - len(kp) + 1):
                
                Flag = False
                if kp == document[i:i+len(kp)]:
                    Flag = True
                if Flag:
                    assert len(kp) >= 1
                    positions_for_each.append((i+1, i+len(kp)))
                    position_start.append(i+1)
                    position_end.append(i+len(kp))
        if len(positions_for_each) > 0 :
            positions_for_all.extend(positions_for_each)
            all_present_kps.append(kp)
           
    assert len(positions_for_all) >= len(all_present_kps)
    
    if len(all_present_kps) == 0:
        return None
    return [position_start,position_end],set(positions_for_all)

In [384]:
def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text

In [None]:
def calculate_f1(y_labels,y_preds,depth,levels):
    precision = []
    recall = []
    f1 = []
    
    for idx,y_label in enumerate(y_labels):
        tp = 0
        p = []
        r = []
        y_label = set(np.where(y_label==1)[0])
        #print(y_preds[idx])
        preds = np.where(y_preds[idx]>0.5)[0]
        for i in range(depth):
            if len(preds)>i:
                if preds[i] in y_label:
                    tp+=1
            p.append(tp/(min(i,len(preds))+1))
            r.append(tp/max(len(y_label),1))
    
    
        level_index = []
        for idx,level in enumerate(levels):
            
            if p[level-1]+r[level-1]>0:
                level_index.append(2/((1/p[level-1])+(2/r[level-1])))
            else:
                level_index.append(0)
        #print('k',level_index)
        
        f1.append(level_index)
    f1 = np.array(f1)
    
    print('F1',np.mean(f1,axis=0))
    
                
            

In [461]:
sentence_re = r'''(?x)      # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        '''
text_toc = nltk.regexp_tokenize(text, sentence_re)
candidates = get_kp(text)
positions = find_answer(text_toc,references[-1]['KeyPhrases'])
positions

Quadratic programming algorithms largescale model predictive control Quadratic programming QP methods important element application


In [325]:
candidates = []
references = []

for file in files[:10]:
    with open(data+file, 'r') as in_file:

        
        text = in_file.read()
        candidates.append({'url':file,
                            'KeyPhrases':get_kp(text)})
    
    name = file.split('.')[0]
    with open(keys+name+'.key', 'r') as in_file:

        can = in_file.readlines()
        can = [line.rstrip('\n').split() for line in can]
        references.append({'url':file,
                            'KeyPhrases':can})
    



with open('result.json', 'w') as out_file:
    for candidate in candidates:
        json.dump(candidate, out_file)
        out_file.write('\n')
with open('keys.json', 'w') as out_file:
    for ref in references:
        json.dump(ref, out_file)
        out_file.write('\n')

In [1526]:
max_len = 0
max_kp = 0
min_len =1e100
all_reps = []
key_positions = []
ref_positions = []
for file in files[:150]:
    with open(data+file, 'r') as in_file:
        text = in_file.read()
        txt = remove_punctuation(text)
        text_toc = nltk.regexp_tokenize(txt, sentence_re)
        kps,pos,pos_set = get_kp(text_toc)
        #print(pos)
        key_positions.append(pos_set)
        idx = [tok in model2.wv.vocab for tok in text_toc]
        rep = np.zeros((len(text_toc),300),dtype=float)
        rep[idx] = model2[np.array(text_toc)[idx]]  
        all_reps.append(rep)
        max_len = max(max_len,len(text_toc))
        min_len = min(min_len,len(text_toc))
        max_kp = max(max_kp,len(pos_set))
    
    name = file.split('.')[0]
    with open(keys+name+'.key', 'r') as in_file:

        can = in_file.readlines()
        can = [line.rstrip('\n').split() for line in can]
        ref_pos,ref_set = find_positions(text_toc,can)
        ref_positions.append(ref_set)
print(max_len,min_len)

  from ipykernel import kernelapp as app


260 27


In [1402]:
find_positions(text_toc,can)

In [1527]:
new_rep = []
for rep in all_reps:
    
    new_rep.append(tf.pad(rep,[[0,max_len-rep.shape[0]],[0,0]]))  
    

In [1415]:

masking_layer = tf.keras.layers.Masking()
unmasked_embedding = tf.cast(new_rep, tf.float32)

masked_embedding = masking_layer(unmasked_embedding)

In [1528]:
y_label = []
start_pos = []
end_pos = []
final_positions = []
pos_mask = []
final_kp_list = []
for idx,kp in enumerate(key_positions):
    
    kp = list(kp)
    y_val = [1 if key in ref_positions[idx] else 0 for key in kp]
    y_val.extend([0]*(max_kp-len(kp)))
    start =[key[0]-1 for key in kp]
    end = [key[1]-1 for key in kp]
    final_positions.append(tf.pad([start,end],[[0,0],[0,max_kp-len(start)]]))
    pos_mask.append([True]*len(start)+[False]*(max_kp-len(start)))
    y_label.append(y_val)
    final_kp_list.append(kp)

In [1412]:
len(key_positions)

195

In [None]:
class KPE(object):
    
    def __init__(self,max_sentence_len,max_keyphrases):
        
        self.max_sentence_len = max_sentence_len
        self.max_keyphrases = max_keyphrases
        
        
        self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
        
        
    
        

In [1218]:
class RNNextractor(tf.keras.layers.Layer):
    
    def __init__(self):
        super(RNNextractor, self).__init__()
        self.bilstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,
                                                                         return_sequences=True),
                                                     merge_mode=None,
                                                    input_shape=(300,204,))
        self.bilstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(5,
                                                                         return_sequences=True),
                                                     merge_mode='ave',
                                                     input_shape=(48,100*4))
        self.dense = tf.keras.layers.Dense(1,activation='sigmoid')
        
    def call(self,inputs):
                                                    
        x = self.bilstm1(inputs[0])
        print(x[0].shape)
        #print(inputs[1])
        mask_start = inputs[1][0]
        #mask_start[mask_start>0] = mask_start[mask_start>0]-1
        mask_end = inputs[1][1]
        #mask_end[mask_end>0] = mask_end[mask_end>0]-1
        print(mask_end.shape)
        start_rep_fr = tf.gather(x[0],mask_start)
        start_rep_bk = tf.gather(x[1],mask_start)
        end_rep_fr = tf.gather(x[0],mask_end)
        end_rep_bk = tf.gather(x[0],mask_end)
        
        print(start_rep_bk.shape)
        span_fe_diff_fr = start_rep_fr-end_rep_fr
        span_fe_prod_fr = tf.math.multiply(start_rep_fr,end_rep_fr)
        span_fe_diff_bk = start_rep_bk-end_rep_bk
        span_fe_prod_bk = tf.math.multiply(start_rep_bk,end_rep_bk)
        print(span_fe_diff_bk.shape)
        span_fe = tf.concat([start_rep_fr,
                             end_rep_fr,
                             start_rep_bk,
                             end_rep_bk,
                             span_fe_diff_fr,
                             span_fe_diff_bk,
                             span_fe_prod_fr,
                             span_fe_prod_bk
                            ],2)
        
        print(span_fe.shape)
        x = self.bilstm2(span_fe)
        x =  self.dense(x)
        return x
        
        
        

In [1202]:
extractor = RNNextractor()


In [1203]:
f = tf.random.normal([10,5])
pos = tf.constant([[1,5,8],[2,6,9]])


In [1529]:
x_train = tf.transpose(new_rep,perm=[0,1,2])
x_pos = tf.stack(final_positions)
#x_pos = tf.tile(tf.expand_dims(x_pos,-1),[1,1,1,100])
y_train = tf.stack(y_label)
y_train = tf.cast(y_train,dtype=float)

In [1259]:
tf.stack(final_positions).shape

TensorShape([20, 2, 48])

In [1205]:
res = extractor([x_train[:,0:1,:],x_pos[:,:,0,0]])
res.shape



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

(204, 1, 100)
(48,)
(48, 1, 100)
(48, 1, 100)
(48, 1, 800)


TensorShape([48, 1, 1])

In [1208]:
tf.gather(res[:,0,0]-y_train[0],np.where(pos_mask[0])[0])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.        ,  0.        , -1.        ,  0.00121441], dtype=float32)>

In [1531]:
y_train.shape

TensorShape([150, 77])

In [1560]:
ip1 = tf.keras.layers.Input(shape=(260,300))
mask = tf.keras.layers.Masking(mask_value=0.0)(ip1)
#custom_mask = tf.keras.layers.Input(shape=(100,))
bilstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,
                                                      return_sequences=True),
                                                     merge_mode=None)(mask)
ip2 = tf.keras.layers.Input(shape=(2,77),dtype='int32')
mask_start = ip2[0][0]
mask_end = ip2[0][1]

start_rep_fr = tf.gather(bilstm1[0],mask_start,axis=1)
start_rep_bk = tf.gather(bilstm1[1],mask_start,axis=1)
end_rep_fr = tf.gather(bilstm1[0],mask_end,axis=1)
end_rep_bk = tf.gather(bilstm1[0],mask_end,axis=1)


span_fe_diff_fr = start_rep_fr-end_rep_fr
span_fe_prod_fr = tf.math.multiply(start_rep_fr,end_rep_fr)
span_fe_diff_bk = start_rep_bk-end_rep_bk
span_fe_prod_bk = tf.math.multiply(start_rep_bk,end_rep_bk)


span_fe = tf.keras.layers.concatenate([start_rep_fr,
                     end_rep_fr,
                     start_rep_bk,
                     end_rep_bk,
                     span_fe_diff_fr,
                     span_fe_diff_bk,
                     span_fe_prod_fr,
                     span_fe_prod_bk
                    ],2)
bilstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(5,return_sequences=True),
                                         merge_mode='ave',
                                         input_shape=(48,100*4))(span_fe)
output = tf.keras.layers.Dense(1,activation='sigmoid',input_shape=(5,))(bilstm2)
 
#output = RNNextractor()([ip1,ip2])

In [1561]:
kpe_model = tf.keras.models.Model(inputs=[ip1,ip2], outputs=output)

# model = tf.keras.Sequential(
#     [ 
#         RNNextractor()
#     ]
# )

# # Compile the model
# model.compile(
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     optimizer=tf.keras.optimizers.Adam(),
#     metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
# )

# # Train the model
# model.fit([x_train[:,0:1,:],x_pos[:,:,0,0]], y_train[0], batch_size=1, epochs=1)
# print(model.summary())


# # Test the model
# #model.evaluate(x_test, y_test)

In [1562]:
print(kpe_model.summary())

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_58 (InputLayer)           [(None, 260, 300)]   0                                            
__________________________________________________________________________________________________
input_59 (InputLayer)           [(None, 2, 77)]      0                                            
__________________________________________________________________________________________________
masking_14 (Masking)            (None, 260, 300)     0           input_58[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_strided_slice_62 (T [(2, 77)]            0           input_59[0][0]                   
___________________________________________________________________________________________

In [1585]:
opt = tf.keras.optimizers.Adamax(learning_rate=0.1)
kpe_model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [1362]:
pos_weight = tf.cast(pos_mask,dtype='int32')

In [None]:
kpe_model.fit([x_train[:90],x_pos[:90]], y_train[:90], 
              batch_size=24,epochs=10,
              use_multiprocessing=True,validation_split=0.2)


Train on 72 samples, validate on 18 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#results = kpe_model.evaluate([x_train[90:],x_pos[90:]], y_train[90:], batch_size=8)
#print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = kpe_model.predict([x_train[90:],x_pos[90:]])
print("predictions shape:", predictions.shape)


In [None]:
calculate_f1(y_train[90:],predictions,30,[5,10])