# Import Libraries

In [484]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os, sys
from sklearn.metrics.pairwise import cosine_similarity
from bert_serving.client import BertClient
import spacy
import textacy
from spacy import displacy
import en_core_web_sm
import math
# from itertools import cycle

# A = [1,2,3,4,5,6,7,8,9]
# B = ["A","B","C"]
# zip_list = list(zip(A, cycle(B)) if len(A) > len(B) else zip(cycle(A), B))
# list(zip_list)

# Load Language Models

spaCy provides downloadable pretrained language models for important natural language tasks such as tagging, parsing and entity recognition.

A comprehensive list of available models can be seen at https://spacy.io/models/en.

For this task, we use **en_core_web_sm**. The following code expects this model to be installed already. If it is not, it can be downloaded by running:
```
python -m spacy download en_core_web_sm
```

Models can be loaded using spaCy's built-in loader, or as a normal Python module.

In [485]:
# Load the small English pipeline through its installed Python package.
# Alternative (disabled): load it by name with spaCy's loader.
# nlp = spacy.load('en_core_web_sm')
nlp = en_core_web_sm.load()

In [522]:
# Sentence pair to compare; each string is run through the `nlp` pipeline.
sentence_1 = nlp("Feelings about current business conditions improved substantially from the first quarter, jumping from 40 to 55.")
sentence_2 = nlp("Assessment of current business conditions improved substantially, the Conference Board said, jumping to 55 from 40 in the first quarter.")

# Alternative sentence pair (disabled):
# sentence_1 = nlp("Charlie Chan is off the case for the Fox Movie Channel.")
# sentence_2 = nlp("The Fox Movie Channel has banned Charlie Chan.")



## Method 1
Chunking

## Method 2
Parsing

## Method 3


In [537]:
def _subtree_phrase(subject, verbose=True):
    """Join the non-punctuation tokens of ``subject``'s subtree into one phrase.

    Returns ``None`` when the subtree contains no non-punctuation token.
    When ``verbose`` is true, one debug line per subtree token is printed,
    followed by a ``--`` separator.
    """
    words = []
    for descendant in subject.subtree:
        if descendant.dep_ != "punct":
            words.append(descendant.text)
        if verbose:
            print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, [ancestor.text for ancestor in descendant.ancestors])
    if verbose:
        print("--")
    return " ".join(words) if words else None


def extract_phrases(doc, verbose=True):
    """Split a parsed sentence into phrases around its dependency root.

    The root is the first token that is its own head; if ``doc`` contains
    several such tokens, only the first one is used. Every direct child of
    the root contributes one phrase made of the non-punctuation tokens in
    that child's subtree. The result lists the phrases of the left children,
    then the root's own text, then the phrases of the right children.

    Args:
        doc: An iterable of parsed tokens (e.g. a spaCy ``Doc``).
        verbose: When true (the default), print per-token debug information
            while walking each subtree.

    Returns:
        A list of phrase strings, or an empty list when ``doc`` has no root
        (for example when it is empty).
    """
    root = next((token for token in doc if token.head == token), None)
    if root is None:
        # Nothing to anchor the phrases on; avoid an IndexError on empty input.
        return []

    phrases_extracted = []

    for subject in root.lefts:
        phrase = _subtree_phrase(subject, verbose)
        if phrase is not None:
            phrases_extracted.append(phrase)

    phrases_extracted.append(root.text)

    for subject in root.rights:
        phrase = _subtree_phrase(subject, verbose)
        if phrase is not None:
            phrases_extracted.append(phrase)

    return phrases_extracted

def extract_noun_verb_phrases(doc):
    """Return the text of every noun chunk in ``doc``, in iteration order.

    For each chunk a debug line is printed with the chunk text, the text and
    dependency label of the chunk's root token, and the text of that root's
    head token.

    Note: despite the function name, only noun chunks are collected.
    TODO: verb-phrase extraction (e.g. a POS pattern such as
    ``<VERB>?<ADV>*<VERB>+`` matched with textacy) is not implemented.
    Background on dependency parsing with spaCy:
    https://realpython.com/natural-language-processing-spacy-python/#dependency-parsing-using-spacy
    """
    def _report_and_get_text(noun_chunk):
        # Print the debug line first, then hand back the chunk text.
        head_token = noun_chunk.root
        print(noun_chunk.text, head_token.text, head_token.dep_, head_token.head.text)
        return noun_chunk.text

    return [_report_and_get_text(noun_chunk) for noun_chunk in doc.noun_chunks]

In [538]:
# Collect the noun chunks of the first sentence; the bare name on the last
# line makes the notebook display the resulting list.
sentence_1_phrases = extract_noun_verb_phrases(sentence_1)
sentence_1_phrases

Feelings Feelings nsubj improved
current business conditions conditions pobj about
the first quarter quarter pobj from


['Feelings', 'current business conditions', 'the first quarter']

In [539]:
# Collect the noun chunks of the second sentence; the bare name on the last
# line makes the notebook display the resulting list.
sentence_2_phrases = extract_noun_verb_phrases(sentence_2)
sentence_2_phrases

Assessment Assessment nsubj improved
current business conditions conditions pobj of
the Conference Board Board nsubj said
the first quarter quarter pobj in


['Assessment',
 'current business conditions',
 'the Conference Board',
 'the first quarter']

In [540]:
def calculate_similarity(pair, client=None, midpoint=0.95, steepness=100):
    """Score how similar two phrases are, as a value in (0, 1).

    Both phrases are encoded in a single ``encode`` call, the cosine
    similarity of the two resulting vectors is computed, and that cosine is
    passed through a logistic curve centred on ``midpoint``. With the
    defaults, a cosine of 0.95 maps to 0.5 and the score rises or falls
    sharply on either side of it.

    Args:
        pair: A two-element list of phrases to compare.
        client: Object whose ``encode(pair)`` returns one vector per phrase.
            When omitted, the module-level ``bert_client`` is used, so that
            name must be bound (e.g. by an enclosing ``with BertClient(...)
            as bert_client`` block) before calling.
        midpoint: Cosine value that maps to a score of 0.5.
        steepness: Slope factor of the logistic curve.

    Returns:
        The logistic-squashed cosine similarity as a float.
    """
    encoder = bert_client if client is None else client
    query_vec_1, query_vec_2 = encoder.encode(pair)
    norm_product = np.linalg.norm(query_vec_1) * np.linalg.norm(query_vec_2)
    if norm_product == 0:
        # Cosine is undefined for a zero vector; treat it as "no similarity"
        # instead of propagating NaN through the score.
        cosine = 0.0
    else:
        cosine = np.dot(query_vec_1, query_vec_2) / norm_product
    return 1 / (1 + math.exp(-steepness * (cosine - midpoint)))

In [541]:
# For every phrase of the sentence with fewer phrases, find the most similar
# phrase of the other sentence and record the pair in `foo`.
foo = []
with BertClient(port=5555, port_out=5556, check_version=False) as bert_client:
    # Despite their names, these two variables hold the phrase LISTS.
    # Assign both in one branch so that equal-length inputs still compare
    # sentence 1 against sentence 2 (never a sentence against itself).
    if len(sentence_1_phrases) <= len(sentence_2_phrases):
        shorter_phrase_length, longer_phrase_length = sentence_1_phrases, sentence_2_phrases
    else:
        shorter_phrase_length, longer_phrase_length = sentence_2_phrases, sentence_1_phrases

    for i in shorter_phrase_length:
        most_similar_j_index, most_similar_j = 0, 0
        for index_j, j in enumerate(longer_phrase_length):
            i_j_similarity = calculate_similarity([i, j])
            # `>=` means that on a tie the later candidate wins.
            if i_j_similarity >= most_similar_j:
                most_similar_j_index = index_j
                most_similar_j = i_j_similarity
        # Show the best score found for this phrase.
        print(most_similar_j)
        foo.append([i, longer_phrase_length[most_similar_j_index]])

8.828242445860665e-07
0.9933072283262605
0.9933071490757153


In [542]:
# Display the matched phrase pairs collected in `foo`.
foo

[['Feelings', 'Assessment'],
 ['current business conditions', 'current business conditions'],
 ['the first quarter', 'the first quarter']]

In [508]:
# sent=sent1_docx.sents
# sent.n_lefts
# for sentence in sent1_docx.sents:
#     rr = sentence.root
# rr.right_edge
# [t.text for t in rr.rights]
# List the noun chunks of each parsed sentence, separated by a divider line.
# Uses the `sentence_1` / `sentence_2` docs defined earlier in the notebook so
# the cell also runs on a fresh kernel.
for chunk in sentence_1.noun_chunks:
    print(chunk)
print("---")
for chunk in sentence_2.noun_chunks:
    print(chunk)

Feelings
current business conditions
the first quarter
---
Assessment
current business conditions
the Conference Board
the first quarter


In [171]:
# Ask spaCy for its description of the label "S" (no output was recorded for this cell).
spacy.explain("S")

In [224]:
# Draw the dependency tree of the first sentence inline, using the
# `sentence_1` doc defined earlier in the notebook.
displacy.render(sentence_1, style="dep")

In [223]:
# Render both dependency trees and save the returned markup as SVG files,
# which the markdown below embeds. Uses the `sentence_1` / `sentence_2` docs
# defined earlier in the notebook so the cell also runs on a fresh kernel.
tree_1 = displacy.render(sentence_1, style="dep", jupyter=False)
tree_2 = displacy.render(sentence_2, style="dep", jupyter=False)
with open('sent1_tree.svg', 'w', encoding='utf-8') as f:
    f.write(tree_1)
with open('sent2_tree.svg', 'w', encoding='utf-8') as f:
    f.write(tree_2)

<img src="sent1_tree.svg" alt="Drawing" style="width: 100%"/>
<img src="sent2_tree.svg" alt="Drawing" style="width: 100%"/>