In [1]:
import nltk
import re
import pprint
import en_coref_md
from nltk import Tree
from ipynb.fs.full.stanford_open_ie_python_wrapper import stanford_ie
import os

In [2]:
# Chunk grammar handed to nltk.RegexpParser. All four rules produce "NP" chunks
# and are written over part-of-speech tags:
#   1. <DT><WP>, optional <VBP>/<RB>, then <VBN><IN><NN>
#   2. noun, optional <IN>, one or more nouns
#   3. optional adjectives, noun, optional <CC>, one or more nouns
#   4. optional adjectives followed by one or more nouns
patterns = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
           {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS>+}
           
    """

# Shared noun-phrase chunker and coreference model used by the functions below.
NPChunker = nltk.RegexpParser(patterns)
coref_parser = en_coref_md.load()

# Location of the local Stanford CoreNLP distribution; the same path is
# exported through the CORENLP_HOME environment variable.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
stanford_corenlp_path = "/Users/krishna.aruru/stanfordnlp_resources/stanford-corenlp-full-2018-10-05"
os.environ["CORENLP_HOME"] = stanford_corenlp_path

In [3]:
def convert_to_tree(text):
    """Turn raw text into one chunked parse tree per sentence.

    The text is split with ``nltk.sent_tokenize``; every sentence is then
    word-tokenized, POS-tagged and parsed with the module-level ``NPChunker``.

    Returns:
        A list holding the ``NPChunker.parse`` result for each sentence, in
        sentence order.
    """
    trees = []
    for raw_sentence in nltk.sent_tokenize(text):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(raw_sentence))
        trees.append(NPChunker.parse(tagged_tokens))
    return trees

def get_noun_phrases(text):
    """Collect the text of every "NP" chunk found in ``text``.

    ``convert_to_tree`` already returns trees produced by ``NPChunker.parse``,
    so the trees are walked directly instead of being chunk-parsed a second
    time.

    Args:
        text: Raw text; it is sentence-split and chunked by ``convert_to_tree``.

    Returns:
        A list of strings, one per subtree labelled "NP", each made of the
        subtree's leaf words joined by single spaces. Duplicates are kept.
    """
    nps = []
    for tree in convert_to_tree(text):
        for subtree in tree.subtrees():
            if subtree.label() == "NP":
                # Leaves are (word, tag) pairs; only the word is kept.
                nps.append(" ".join(word for word, _ in subtree.leaves()))
    return nps

In [4]:
def get_corefs(paragraph):
    """Build a lookup of coreference mentions for ``paragraph``.

    The paragraph is run through the module-level ``coref_parser``. For every
    mention of every cluster in ``doc._.coref_clusters`` an entry is recorded:

        mention.start_char -> (mention.end_char, cluster.main.text)

    If two mentions share a ``start_char``, the one visited last wins.

    Returns:
        The mapping described above, or an empty dict when
        ``doc._.has_coref`` is falsy.
    """
    doc = coref_parser(paragraph)
    if not doc._.has_coref:
        return {}
    return {
        mention.start_char: (mention.end_char, cluster.main.text)
        for cluster in doc._.coref_clusters
        for mention in cluster.mentions
    }

def deref_text(sentence, coref_mapping):
    """Rewrite ``sentence`` by substituting the spans listed in ``coref_mapping``.

    Args:
        sentence: The text to rewrite.
        coref_mapping: Maps a start index to an ``(end_index, replacement)``
            pair, in the form returned by ``get_corefs``.

    Returns:
        A new string. Walking left to right, whenever the current index is a
        key of ``coref_mapping`` the replacement is emitted and the walk
        resumes at ``end_index``; otherwise the current character is copied.
        If ``end_index`` equals the start index, the replacement is emitted
        and the character at that index is skipped so the walk always advances.
    """
    fragments = []
    cursor = 0
    length = len(sentence)
    while cursor < length:
        if cursor not in coref_mapping:
            # Ordinary character: copy it through unchanged.
            fragments.append(sentence[cursor])
            cursor += 1
            continue
        span_end, replacement = coref_mapping[cursor]
        fragments.append(replacement)
        # A zero-length span still has to move the cursor forward.
        cursor = cursor + 1 if span_end == cursor else span_end
    return "".join(fragments)


In [17]:
def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is tokenized with ``nltk.tokenize.word_tokenize``; tokens whose
    lower-cased form is in NLTK's English stopword list are discarded and the
    remaining tokens are joined with single spaces.

    Returns:
        The filtered string, or the original ``text`` unchanged when filtering
        would leave nothing.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = [
        token
        for token in nltk.tokenize.word_tokenize(text)
        if token.lower() not in english_stopwords
    ]
    filtered = " ".join(kept_tokens)
    # Never hand back an empty string: fall back to the input.
    return filtered if filtered else text

def get_ngrams(sentence):
    """Expand ``sentence`` into all of its word n-grams of length 4 or more.

    A progress line is printed for every call. The sentence is tokenized with
    ``nltk.word_tokenize``.

    Returns:
        ``[sentence]`` when there are four tokens or fewer; otherwise a list
        of space-joined n-grams for every size from 4 up to the token count,
        ordered by size and then by position.
    """
    print("Getting n grams for: {}".format(sentence))
    words = nltk.word_tokenize(sentence)
    word_count = len(words)
    if word_count <= 4:
        return [sentence]
    return [
        " ".join(gram)
        for size in range(4, word_count + 1)
        for gram in nltk.ngrams(words, size)
    ]

def preprocess(paragraph, generate_ngrams=False):
    """Resolve coreferences in ``paragraph`` and split it into usable sentences.

    Steps:
        1. Replace coreference mentions using ``get_corefs`` / ``deref_text``.
        2. Strip every ";" from the text.
        3. Split into sentences and keep only those with more than 3 tokens.
        4. Optionally expand each sentence into its n-grams via ``get_ngrams``.

    Args:
        paragraph: Raw input text.
        generate_ngrams: When true, return the flattened n-grams of the kept
            sentences instead of the sentences themselves.

    Returns:
        A list of strings (sentences, or n-grams when ``generate_ngrams``).
    """
    ref_mapping = get_corefs(paragraph)
    paragraph = deref_text(paragraph, ref_mapping)
    # ";" is used to separate the subject, relation and object by both Stanford and OpenIE.
    # Better to remove it from our text so that we don't get confused in the output.
    # str.replace returns a new string, so the result has to be assigned back.
    paragraph = paragraph.replace(";", "")
    sentences = [line for line in nltk.sent_tokenize(paragraph)
                 if len(nltk.word_tokenize(line)) > 3]
    if generate_ngrams:
        sentences = flatten([get_ngrams(line) for line in sentences])
    return sentences

def flatten(list_2d):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in list_2d:
        flat.extend(sublist)
    return flat

def process_batch(batch, generate_ngrams=False):
    """Preprocess a batch of texts and gather their noun phrases.

    Args:
        batch: Iterable of raw texts; each one is passed to ``preprocess``.
        generate_ngrams: Forwarded unchanged to ``preprocess``.

    Returns:
        A ``(sentences, noun_phrases)`` pair: the flat list of everything
        ``preprocess`` returned across the batch, and the set of noun phrases
        that ``get_noun_phrases`` found in those strings.
    """
    sentences = []
    for raw_text in batch:
        sentences.extend(preprocess(raw_text, generate_ngrams))

    noun_phrases = set()
    for sentence in sentences:
        noun_phrases.update(get_noun_phrases(sentence))

    return sentences, noun_phrases

def filter_relations(relations, nounphrases):
    """Keep relations anchored on a known noun phrase and strip their stopwords.

    Args:
        relations: Iterable of ``(subject, relation, object)`` triples.
        nounphrases: Container of noun-phrase strings used for membership tests.

    Returns:
        A list of triples. A triple is kept only when its subject or its
        object is in ``nounphrases``; each kept component is then passed
        through ``remove_stopwords``.
    """
    kept = []
    for subj, relation, obj in relations:
        if subj not in nounphrases and obj not in nounphrases:
            # Neither end of the relation is a recognised noun phrase.
            continue
        kept.append((
            remove_stopwords(subj),
            remove_stopwords(relation),
            remove_stopwords(obj),
        ))
    return kept

def get_relations(paragraph):
    """Extract noun-phrase-anchored relations from ``paragraph``.

    The paragraph is preprocessed into sentences, the noun phrases of those
    sentences are collected, the sentences are sent to ``stanford_ie`` and
    its output is narrowed with ``filter_relations``.

    Returns:
        Whatever ``filter_relations`` returns for the extracted relations.
        # presumably stanford_ie yields (subject, relation, object) triples,
        # since filter_relations unpacks three items — verify against the wrapper.
    """
    sentences = preprocess(paragraph)

    noun_phrases = set()
    for sentence in sentences:
        noun_phrases.update(get_noun_phrases(sentence))

    extracted = stanford_ie(sentences)
    return filter_relations(extracted, noun_phrases)

In [None]:
# sentences = [line.strip() for line in open("vogue_non_empty_descriptions.txt").readlines()]
# sentences, nps = process_batch(sentences)

In [None]:
# len(sentences)

In [None]:
# results = []
# for i in range(0, len(sentences), 100):
#     print("Processing batch: {}".format(i/100))
#     rels = stanford_ie(sentences[i: i+100])
#     results.extend(filter_relations(rels, nps))

In [None]:
# with open("stanfordoie_outputs_without_ngrams.txt", "w") as fw:
#     for relation in results:
#         fw.write("|".join([x for x in relation]))
#         fw.write("\n")

In [19]:
# Smoke test for preprocess(): the pronoun "They" in the second sentence
# should come back replaced by its antecedent "Bananas".
sentence = "Bananas are excellent sources of potassium. They are also very tasty"
print(preprocess(sentence))

{0: (7, 'Bananas'), 44: (48, 'Bananas')}
['Bananas are excellent sources of potassium.', 'Bananas are also very tasty']


In [16]:
# NOTE(review): `sen` is not assigned in any cell visible here, so this cell
# appears to rely on leftover kernel state — confirm where it is defined or drop the cell.
sen

['Bananas are excellent sources of potassium.', 'Bananas are also very tasty']