In [1]:
from stanfordcorenlp import StanfordCoreNLP
import json
import re
import numpy as np

In [2]:
nlp = StanfordCoreNLP(r'stanford-corenlp-full-2018-02-27', lang='en')

In [3]:
article= """
Barack Obama was born in Honolulu, Hawaii, making him the first president not born in the contiguous United States. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the U.S. Senate. Obama received national attention in 2004 with his March Senate primary win, his well-received July Democratic National Convention keynote address, and his landslide November election to the Senate. In 2008, he was nominated for president a year after his presidential campaign began, and after close primary campaigns against Hillary Clinton, Obama was elected over Republican John McCain and was inaugurated alongside Joe Biden on January 20, 2009. Nine months later, he was named the 2009 Nobel Peace Prize laureate.
""".strip()

In [4]:
prep_replace_dict = [' he ',' He ',' him ']

In [5]:
for prep in prep_replace_dict:
    article = article.replace(prep,' Barack Obama ')


article = re.sub(" Barack Obama | Barack Obama|Barack Obama ",' Obama ',article)
article = re.sub("Obama",'Barack Obama',article)


article = article.replace(' his '," Barack Obama's ").strip()

In [6]:
nlp_annotate = nlp.annotate(article, properties={
    'annotators': 'tokenize,depparse,ner,openie,',
    'outputFormat': 'json'
    })

In [7]:
nlp_annotate = json.loads(nlp_annotate)

In [8]:
graph_dict = {}

In [9]:
articel_knowledge_list =[]

In [10]:
for sentence_annotate in nlp_annotate['sentences']:
    
    ### sentence by sentence
    
    tokens_result = sentence_annotate['tokens']
    
    dependencies_result  = sentence_annotate['enhancedPlusPlusDependencies']
    
    entity_result  = sentence_annotate['entitymentions']

    openie_result = sentence_annotate['openie']
    
    
    sentence_knowledge_list = []
    for knoledge_triple in openie_result:
        ### head entity token infomation
        
        head_entity_info_dict ={}
        head_entity = knoledge_triple['subject']
        head_entity_tokenBegin = knoledge_triple['subjectSpan'][0]
        head_entity_tokenEnd = knoledge_triple['subjectSpan'][1]
        head_entity_token_info = tokens_result[head_entity_tokenBegin:head_entity_tokenEnd]
        head_entity_lemma_name  = [token['lemma'] for token in head_entity_token_info]
        head_entity_lemma_name = ' '.join(head_entity_lemma_name)
        head_entity_ner = [token_info['ner'] for token_info in head_entity_token_info if token_info['ner']!='O']
        
        head_entity_info_dict['name'] = head_entity
        head_entity_info_dict['lemma_name']=head_entity_lemma_name
        head_entity_info_dict['ner']=head_entity_ner
        
        
        tail_entity_info_dict ={}
        tail_entity = knoledge_triple['object']
        ###tail entity token infomation
        tail_entity_tokenBegin = knoledge_triple['objectSpan'][0]
        tail_entity_tokenEnd = knoledge_triple['objectSpan'][1]
        tail_entity_token_info = tokens_result[tail_entity_tokenBegin:tail_entity_tokenEnd]
        tail_entity_lemma_name  = [token['lemma'] for token in tail_entity_token_info]
        tail_entity_lemma_name = ' '.join(tail_entity_lemma_name)
        tail_entity_ner = [token_info['ner'] for token_info in tail_entity_token_info if token_info['ner']!='O']
        if len(tail_entity_ner)==0:
            continue
        
        tail_entity_info_dict['name'] = tail_entity
        tail_entity_info_dict['lemma_name'] = tail_entity_lemma_name
        tail_entity_info_dict['ner']=tail_entity_ner
        
        
        relation_dict = {}
        relation = knoledge_triple['relation']
        relation_tokenBegin = knoledge_triple['relationSpan'][0]
        relation_tokenEnd = knoledge_triple['relationSpan'][1]
        relation_token_info = tokens_result[relation_tokenBegin:relation_tokenEnd]
        relation_lemma_name  = ' '.join([token['lemma'] for token in relation_token_info])
        
        relation_dict['relation']=relation
        relation_dict['lemma_relation'] = relation_lemma_name
    
        sentence_knowledge_list.append((head_entity_info_dict,tail_entity_info_dict,relation_dict))
        
    articel_knowledge_list.append(sentence_knowledge_list)
        
    

In [11]:
for sentence_knoledge_list in articel_knowledge_list:
    
    for (head,tail,relation) in sentence_knoledge_list:

        head_entity_name = head['name']
        try:
            head_entity_info_dict = graph_dict[head_entity_name]
            previous_relation_dict =  head_entity_info_dict['relations']
            add_relation_name = relation['relation']

            try:
                ###  relation already exists
                relation_list = previous_relation_dict[add_relation_name]

                tail_info_dict = {}
                tail_info_dict['name'] = tail['name']
                tail_info_dict['lemma_name'] = tail['lemma_name']
                tail_info_dict['ner'] = tail['ner']
                relation_list.append(tail_info_dict)
                previous_relation_dict[add_relation_name] = relation_list
                head_entity_info_dict['relations'] = previous_relation_dict

            except KeyError:###  relation doesn't exists
                tail_info_dict = {}
                tail_info_dict['name'] = tail['name']
                tail_info_dict['lemma_name'] = tail['lemma_name']
                tail_info_dict['ner'] = tail['ner']

                previous_relation_dict[add_relation_name] = [tail_info_dict]
                head_entity_info_dict['relations'] = previous_relation_dict

        except KeyError:

            knowledge_dict = {}
            knowledge_dict['lemma_name'] = head['lemma_name']
            knowledge_dict['ner'] = head['ner']

            tail_info_dict = {}
            tail_info_dict['name'] = tail['name']
            tail_info_dict['lemma_name'] = tail['lemma_name']
            tail_info_dict['ner'] = tail['ner']

            relation_dict = {}
            relation_dict[relation['relation']] = [tail_info_dict]

            knowledge_dict['relations'] = relation_dict
            graph_dict[head_entity_name] = knowledge_dict
    
    

In [12]:
graph_dict.keys()

dict_keys(['Barack Obama', 'organizer', 'University of Chicago Law School', 'Senate', "Barack Obama 's presidential campaign", "Barack Obama 's campaign", 'John McCain'])

In [13]:
def save_json(obj,filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)

In [14]:
save_json(graph_dict,'./save/graph_dict.json')

In [15]:
def open_json(filename):
    with open(filename,mode='r',encoding='utf-8') as f:
        return json.load(f)

In [16]:
graph_dict = open_json('./save/graph_dict.json')

In [17]:
question = 'When was Barack Obama named the Nobel Peace Prize laureate ?'

In [22]:
graph_dict['Barack Obama']['relations'].keys()

dict_keys(['was born in', 'making', 'worked as', 'worked', 'graduating in', 'graduating from', 'was', 'was person', 'be', 'be president of', 'was first black person', 'was first person', 'was black person', 'enrolled In', 'became', 'represented', 'received attention in', 'received attention with', 'has', 'was nominated', 'later was named', 'was named'])

In [18]:
# bert-serving-start -model_dir 'E:\MyFiles\WorkSpace\BertModels\uncased_L-12_H-768_A-12' -num_worker=1 -max_seq_len=25
# bert-serving-start -model_dir 'E:\MyFiles\WorkSpace\BertModels\uncased_L-2_H-128_A-2' -num_worker=1 -max_seq_len=25

In [19]:
from bert_serving.client import BertClient

In [21]:
bc = BertClient(port=6666,port_out=6667)
# bc = BertClient()

In [31]:
def answer_retrive(question,graph_dict):
    
    question_annotate = nlp.annotate(question, properties={
        'annotators': 'tokenize,depparse,ner,openie,',
        'outputFormat': 'json'
        })
    
    
    ### 1) question part of speech analysis
    question_annotate = json.loads(question_annotate)
    
    print(question_annotate['sentences'][0]['openie'])
    
    knowledge_triple = question_annotate['sentences'][0]['openie'][0]
    
    head_entity = knowledge_triple['subject']
#     print(head_entity)

    relation= knowledge_triple['relation']
    
    
    
    def retrive_sim(name,graph_names_list):
        names_list = [name] + graph_names_list

        names_encode = bc.encode(names_list)

        bert_encode = names_encode[0]

        names_sim_dict = {}
        for (encode,graph_name) in zip(names_encode[1:],graph_names_list):
            names_sim_dict[graph_name] = np.dot(bert_encode,encode)

        names_sim_dict = {k: v for k, v in sorted(names_sim_dict.items(), key=lambda item: item[1],reverse=True)}

        return list(names_sim_dict.keys())[0]
    
    
    
    tail_entity_is_none=False
    try:
        tail_entity = knowledge_triple['object']
    except:
        tail_entity_is_none=True

    #### search head entity
    try:
        head_entity_info = graph_dict[head_entity]
    except KeyError:
        print("I'm unable to answer your question !")

    graph_realtion_list = list(head_entity_info['relations'].keys())

    relation_retrived = retrive_sim(relation,graph_realtion_list)
    
    tail_entity_candidate_list = head_entity_info['relations'][relation_retrived]
    
    if tail_entity_is_none:
        ### raddom return ---> need to improve the logic
        tail_entity_info = random.sample(tail_entity_candidate_list,1)[0]
        tail_entity_name = tail_entity_info['name']
    else:
        tail_entity_candidate_name_list = [entity['name'] for entity in tail_entity_candidate_list]
        tail_entity_retrived = retrive_sim(tail_entity,tail_entity_candidate_name_list)


    #### return result
    return ' '.join([head_entity,relation_retrived,tail_entity_retrived])
    

In [32]:
question = 'When was Barack Obama named the Nobel Peace Prize laureate ?'

In [33]:
answer_retrive(question,graph_dict)

[{'subject': 'Barack Obama', 'subjectSpan': [2, 4], 'relation': 'was named', 'relationSpan': [1, 2], 'object': 'Nobel Peace Prize laureate', 'objectSpan': [6, 10]}]


'Barack Obama was named 2009 Nobel Peace Prize laureate'

In [42]:
graph_dict

{'Barack Obama': {'lemma_name': 'Barack Obama',
  'ner': ['PERSON', 'PERSON'],
  'relations': {'was born in': [{'name': 'Honolulu',
     'lemma_name': 'Honolulu',
     'ner': ['CITY']},
    {'name': 'Hawaii', 'lemma_name': 'Hawaii', 'ner': ['STATE_OR_PROVINCE']}],
   'making': [{'name': 'president',
     'lemma_name': 'president',
     'ner': ['TITLE']},
    {'name': 'Barack Obama',
     'lemma_name': 'Barack Obama',
     'ner': ['PERSON', 'PERSON']},
    {'name': 'first president',
     'lemma_name': 'first president',
     'ner': ['ORDINAL', 'TITLE']}],
   'worked as': [{'name': 'community organizer',
     'lemma_name': 'community organizer',
     'ner': ['TITLE']},
    {'name': 'community organizer in Chicago',
     'lemma_name': 'community organizer in Chicago',
     'ner': ['TITLE', 'CITY']}],
   'worked': [{'name': 'graduating from Columbia University in 1983',
     'lemma_name': 'graduate from Columbia University in 1983',
     'ner': ['ORGANIZATION', 'ORGANIZATION', 'DATE']},
 

In [45]:
question = 'Which year did Barack Obama graduate ?'

In [46]:
answer_retrive(question,graph_dict)

[]


IndexError: list index out of range