In [86]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from itertools import product
from gensim.models import KeyedVectors

filename = './GoogleNews-vectors-negative300.bin'

remove_list = ['do','Do','Does','does','did','Did',
               'is','Is','was','Was','am','Am','Are','are','were','Were',
               'What','what','How','how','When','when','Where','where','Who','who']

stop_words = list(set(stopwords.words('english')))
stop_words.extend(remove_list)

nlp = spacy.load("en_coref_lg")
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [128]:
def following_up_question_chk(qn,qp):
    #qn - question now
    #qp - question past
    verb_detection = False
    coref_no_detection = True
    text_connection = qp + ' ' + qn
    doc = nlp(text_connection)
    qn_tokens = word_tokenize(qn)
    qn_tokens = [w for w in qn_tokens if not w in stop_words] 
    qp_tokens = word_tokenize(qp)
    qp_tokens = [w for w in qp_tokens if not w in stop_words]
    
    qn_verb_detect_str = ' '.join(qn_tokens)
    qp_verb_detect_str = ' '.join(qp_tokens)
    
    #print(qn_verb_detect_str)
    #print(qp_verb_detect_str)
    
    qn_noun_list = []
    qp_noun_list = []
    
    
    qn_modified = qn
    #check qn-question now has pronoun and possessive adjective references to qp
    try:
        if len(doc._.coref_clusters) >0:
            coref_no_detection = False
            print('Co-references detected')
            for items in doc._.coref_clusters:
                print(items.mentions)
                print('Original question: ' + qn)
                print('Are you asking: ' + qn.replace(str(items.mentions[1]),str(items.mentions[0])) )
                qn_modified = qn_modified.replace(str(items.mentions[1]),str(items.mentions[0]))
                qn_modified_tokens = qn_modified.split()
                qn_modified_tokens = [word.lower() for word in qn_modified_tokens if word not in stop_words]
            print('-'*30)
    except:
        print('Co-references are not detected')
        print('-'*30)
        qn_modified_tokens = qn_modified.split()
        qn_modified_tokens = [word.lower() for word in qn_modified_tokens if word not in stop_words]
        coref_no_detection = True
        
    #check any verb in qn-question now 
    for token in nlp(qn_verb_detect_str):
        if token.pos_=='VERB':
            verb_detection = True
            print('A verb ' + str(token) + ' was detected')
            
    if verb_detection==False:
        print('No verb was detected')
    
                                              
    for token in nlp(qp):
        if (token.pos_=='NOUN' or token.pos_=='PROPN'):
            qp_noun_list.append(str(token))
            
    qp_noun_list = [x for x in qp_noun_list if x not in remove_list]
    
    for token in nlp(qn_modified):
        if (token.pos_=='NOUN' or token.pos_=='PROPN'):
            qn_noun_list.append(str(token))
    qn_noun_list = [x for x in qn_noun_list if x not in remove_list]
    
    print('-'*30)        
    print('Previous question noun is: ' + str(qp_noun_list))
    print('Current question noun is: ' + str(qn_noun_list))
    print('-'*30)
    
    qn_noun_str = ' '.join(qn_noun_list)
    qp_noun_str = ' '.join(qp_noun_list)
    
    noun_group_similarity = nlp(qn_noun_str).similarity(nlp(qp_noun_str))
    
    '''
    Google pretrained word2vec to get wmdDistance feature
    '''
    wmd = model.wmdistance(qp_tokens, qn_modified_tokens)
    '''
    Google pretrained word2vec most similar words
    '''
    words_pair_list = []
    w2v_word_similarity_list = []
    for word_1 , word_2 in product(qp_noun_list, qn_noun_list):
        similarity = model.similarity(word_1,word_2)
        w2v_word_similarity_list.append(similarity)
        words_pair_list.append((word_1,word_2,similarity))
    
    try:
        max_word_similarity = max(w2v_word_similarity_list)
        ind = int(w2v_word_similarity_list.index(max_word_similarity))
        word_pairs = words_pair_list[ind] 
        word_p = word_pairs[0]
        word_n = word_pairs[1]
        word_sim = word_pairs[2]

        '''
        Below is for synsets check, it is not necessary to enable
        '''
        #qn_noun_synsets = set(ss for word in qn_noun_list for ss in wordnet.synsets(word))
        #qp_noun_synsets = set(ss for word in qp_noun_str for ss in wordnet.synsets(word))
        #try:
        #    best_synsets = max((wordnet.wup_similarity(s1, s2) or 0, s1, s2) for s1, s2 in product(qn_noun_synsets, qp_noun_synsets))
        #except:
        #    best_synsets = 0
        #print(verb_detection)
        #print(coref_no_detection)
        #print(verb_detection and coref_no_detection)
        if verb_detection and coref_no_detection:
            print('It is not a following up question')
            print('Two questions wmDistance is: ' + str(wmd))
            print('NN similarity is: ' + str(noun_group_similarity))
            print(word_p + ' and ' + word_n +' are the most relevent words "("Based on Google w2v similarity which is: ' + str(word_sim) +" )")
            #print('Bset NN Synset is: ' + str(best_synsets))
            return False
        else:
            print('It is a following up question')
            print('Two questions wmDistance is: ' + str(wmd))
            print('NN similarity is: ' + str(noun_group_similarity))
            print(word_p + ' and ' + word_n +' are the most relevent words "("Based on Google w2v similarity which is: ' + str(word_sim) +" )")
            #print('Bset NN Synset is: ' + str(best_synsets))
            return True
    except:
        print('#'*30)
        print('Empty noun list was detected')
        print('#'*30)
        if verb_detection and coref_no_detection:
            print('It is not a following up question')
            print('Two questions wmDistance is: ' + str(wmd))
            print('NN similarity is: ' + str(noun_group_similarity))
            #print('Bset NN Synset is: ' + str(best_synsets))
            return False
        else:
            print('It is a following up question')
            print('Two questions wmDistance is: ' + str(wmd))
            print('NN similarity is: ' + str(noun_group_similarity))
            #print('Bset NN Synset is: ' + str(best_synsets))
            return True

# WMD_Distance Walkaround <=Google 300 is normalised

In [14]:
##Load Model
#from gensim.models import KeyedVectors
#filename = './GoogleNews-vectors-negative300.bin'
#model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [125]:
#sent_1 = 'Did you get all the information you wanted using the system?'
#sent_2 = 'How easy was the system to obtain'

In [126]:
#sent_1_token = sent_1.split()
#sent_1_token = [w.lower() for w in sent_1_token if w not in stop_words]
#sent_2_token = sent_2.split()
#sent_2_token = [w.lower() for w in sent_2_token if w not in stop_words]

In [127]:
#distance_1_2 = model.wmdistance(sent_1_token, sent_2_token)
#print(distance_1_2)
#distance_2_1 = model.wmdistance(sent_2_token, sent_1_token)
#print(distance_2_1)

2.963588026107289
2.963588026107289


In [64]:
#for s1,s2 in product(['Information','System'],['System']):
#    print(model.similarity(s1,s2))

0.16371556
0.99999994


In [21]:
#check_1 = ['information', 'system']
#check_2 = ['system']
#distance = model.wmdistance(check_1, check_2)
#print(distance)

1.7184451818466187


# Evaluation

In [129]:
'''
Captured correctly
'''
qp = 'Did you get all the information you wanted using the system?'
qn = 'How easy was it to obtain'
following_up_question_chk(qn,qp)

Co-references detected
[the system, it]
Original question: How easy was it to obtain
Are you asking: How easy was the system to obtain
------------------------------
A verb obtain was detected
------------------------------
Previous question noun is: ['information', 'system']
Current question noun is: ['system']
------------------------------
It is a following up question
Two questions wmDistance is: 2.3765532010057937
NN similarity is: 0.8528664493365101
system and system are the most relevent words "("Based on Google w2v similarity which is: 1.0 )


True

In [131]:
'''
Captured wrongly
'''
qp = 'what is the biggest city in Finland?'
qn = 'how many people live there??'
following_up_question_chk(qn,qp)

Co-references are not detected
------------------------------
A verb live was detected
------------------------------
Previous question noun is: ['city', 'Finland']
Current question noun is: ['people']
------------------------------
It is not a following up question
Two questions wmDistance is: 3.2308872767218
NN similarity is: 0.35229659112058176
city and people are the most relevent words "("Based on Google w2v similarity which is: 0.23614499 )


False

In [132]:
'''
Captured correctly
'''
qp = 'Did you get all the information you wanted using the system?'
qn = 'How many people live there'
following_up_question_chk(qn,qp)

Co-references are not detected
------------------------------
A verb live was detected
------------------------------
Previous question noun is: ['information', 'system']
Current question noun is: ['people']
------------------------------
It is not a following up question
Two questions wmDistance is: 2.9295614928729514
NN similarity is: 0.42065372089634406
information and people are the most relevent words "("Based on Google w2v similarity which is: 0.12545776 )


False

In [133]:
'''
Captured correctly
'''
qp = 'when did Glove Cleveland die?'
qn = 'how old was he?'
following_up_question_chk(qn,qp)

Co-references detected
[Glove Cleveland, he]
Original question: how old was he?
Are you asking: how old was Glove Cleveland?
------------------------------
No verb was detected
------------------------------
Previous question noun is: ['Glove', 'Cleveland']
Current question noun is: ['Glove', 'Cleveland']
------------------------------
It is a following up question
Two questions wmDistance is: 3.6464367404870766
NN similarity is: 1.0
Glove and Glove are the most relevent words "("Based on Google w2v similarity which is: 1.0 )


True

In [134]:
qp = 'where was james born?'
qn = 'who did he marry?'
following_up_question_chk(qn,qp)

Co-references detected
[james, he]
Original question: who did he marry?
Are you asking: who did james marry?
------------------------------
A verb marry was detected
------------------------------
Previous question noun is: ['james']
Current question noun is: ['james']
------------------------------
It is a following up question
Two questions wmDistance is: 2.34818434715271
NN similarity is: 1.0
james and james are the most relevent words "("Based on Google w2v similarity which is: 1.0 )


True

In [135]:
'''
Captured correctly
'''
qp = 'Did you get all the information you wanted using the system?'
qn = 'who did he marry?'
following_up_question_chk(qn,qp)

Co-references are not detected
------------------------------
A verb marry was detected
------------------------------
Previous question noun is: ['information', 'system']
Current question noun is: []
------------------------------
##############################
Empty noun list was detected
##############################
It is not a following up question
Two questions wmDistance is: inf
NN similarity is: 0.0


False

In [136]:
'''
Captured correctly
'''
qp = 'what is P/E ratio of Apple?'
qn = 'how about Microsoft?'
following_up_question_chk(qn,qp)

Co-references are not detected
------------------------------
No verb was detected
------------------------------
Previous question noun is: ['P', 'E', 'ratio', 'Apple']
Current question noun is: ['Microsoft']
------------------------------
It is a following up question
Two questions wmDistance is: inf
NN similarity is: 0.3513736087337575
Apple and Microsoft are the most relevent words "("Based on Google w2v similarity which is: 0.52860063 )


True

In [137]:
'''
Captured correctly
'''
qp = 'what is P/E ratio of Apple?'
qn = 'how about the stock price?'
following_up_question_chk(qn,qp)

Co-references are not detected
------------------------------
No verb was detected
------------------------------
Previous question noun is: ['P', 'E', 'ratio', 'Apple']
Current question noun is: ['stock', 'price']
------------------------------
It is a following up question
Two questions wmDistance is: 4.015905484418869
NN similarity is: 0.3615854569650708
ratio and stock are the most relevent words "("Based on Google w2v similarity which is: 0.25689003 )


True

In [91]:
#doc2 = nlp('My sister has a dog. She loves him')

In [97]:
#doc2._.coref_clusters

In [5]:
#doc = nlp('Did you get all the information you wanted using the system? How easy was it to obtain the information you want')

#doc._.coref_clusters[0].mentions

[the system, it]

In [8]:
#doc1 = nlp("Did you get all the information you wanted using system?")
#doc2 = nlp("How easy to obtain information you want")
#doc1.similarity(doc2)

0.9180178381713632

In [9]:
#doc1[0].pos_

'VERB'

In [6]:
#doc._.coref_clusters[0]

the system: [the system, it]

In [11]:
#for token in doc2:
#    print(token.pos_)

ADV
ADJ
PART
VERB
NOUN
PRON
VERB
