In [1]:
import spacy

In [10]:
nlp = spacy.load('en')
doc = nlp(u'I like green eggs and ham.')
for np in doc.noun_chunks:
    print(np.text, np.root.text, np.root.dep_, np.root.head.text)

(u'I', u'I', u'nsubj', u'like')
(u'green eggs', u'eggs', u'dobj', u'like')
(u'ham', u'ham', u'conj', u'eggs')


In [11]:
for np in doc.noun_chunks:
    print(np.root.text, np.root.dep_, np.root.head.text)

(u'I', u'nsubj', u'like')
(u'eggs', u'dobj', u'like')
(u'ham', u'conj', u'eggs')


In [12]:
from spacy.symbols import nsubj, VERB
# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj or possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)


In [13]:
verbs

{like}

In [14]:
print doc.noun_chunks

<generator object at 0x10a482848>


In [15]:
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.')
root = [w for w in doc if w.head is w][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    print descendant
    print subject.is_ancestor_of(descendant)

Credit
True
and
True
mortgage
True
account
True
holders
False


In [16]:
print doc

Credit and mortgage account holders must submit their requests within 30 days.


In [17]:
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(''.join(w.text_with_ws for w in word.subtree))


to show you how computers understand language
how computers understand language


In [18]:
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print word


show
understand


In [19]:
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    print word, word.dep_

displaCy nsubj
uses ROOT
CSS dobj
and cc
JavaScript conj
to aux
show xcomp
you dobj
how advmod
computers nsubj
understand ccomp
language dobj


In [20]:
sent = 'displaCy uses CSS and JavaScript to show you how computers understand language'
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))

(u'to show you how computers understand language', '|', u'show')
0.925660118577
0.612422881378
(u'how computers understand language', '|', u'understand')
0.877747644146
0.836034566354


In [21]:
def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        print token.dep
        token = token.head
        
    return dep_labels

In [None]:
import sys
def showTree(sent):
    def __showTree(token):
        sys.stdout.write("{")
        [__showTree(t) for t in token.lefts]
        sys.stdout.write("%s->%s(%s)" % (token,token.dep_,token.tag_))
        [__showTree(t) for t in token.rights]
        sys.stdout.write("}")
    return __showTree(sent.root)

In [None]:
sent = u'displaCy uses CSS and JavaScript to show you how computers understand language'
# showTree(nlp(sent))

In [22]:
from spacy.en import English
parser = English()


In [23]:
import pandas as pd
data_df = pd.read_excel("Training Data/output_30nouns.xlsx")

In [24]:
for token in parser(u"Rodriguex is a great salesman."):
    print token.dep_

nsubj
ROOT
det
amod
attr
punct


In [25]:
# example = u"We traveled over 30 minutes by bus to check out this coffee shop and we were not disappointed. Worth the time. The coffee was so good. We ordered a coffee with almond, coconut oil flavor forgot the name. Wow! So delicious!!The store itself is small with a couple of outdoor tables. I think one of the best part of our experience is talking to the owner and him describing their coffee making process. Definitely going back for more. "
import pdb
from collections import defaultdict
keyword_dict = defaultdict(list)
pdb.set_trace()
for i,row in data_df.iterrows():
    parsedEx = parser(row['review'])
    for token in parsedEx:
        print token, token.dep_
        if token.dep_=="amod":
            print "here"
            keyword_dict[token.head.orth_].append(token.orth_)
    

--Return--
> <ipython-input-25-27bb62709ef1>(5)<module>()->None
-> pdb.set_trace()


KeyboardInterrupt: 

In [None]:
from collections import Counter,OrderedDict
keywords_count = OrderedDict()
keys_to_delete = []
for key in keyword_dict:
    if len(key)<=1 or len(key)>=15:
        keys_to_delete.append(key)
    else:
        keyword_dict[key] = Counter(keyword_dict[key])
        for key_key in keyword_dict[key]:
            if key in keywords_count:
                keywords_count[key] += keyword_dict[key][key_key]
            else:
                keywords_count[key] = keyword_dict[key][key_key]

In [None]:
for kd in keys_to_delete:
    if kd in keywords_count:
        del keywords_count[kd]

In [None]:
keyword_dict["deal"]

In [None]:
keywords_count = OrderedDict(sorted(keywords_count.iteritems(), key=lambda x: x[1], reverse=True))
keywords_count

In [None]:
from nltk.corpus import sentiwordnet as swn, wordnet as wn

In [None]:
def get_avg_pos_sentiment(keyword):
    sw_s = [sw_.pos_score() for sw_ in list(swn.senti_synsets(keyword,'a'))]
    if len(sw_s)>0:
        return sum(sw_s)/len(sw_s)
    else:
        return 0

In [None]:
for keyword in keywords_count:
    for key in keyword_dict[keyword]:
        print(keyword, key, get_avg_pos_sentiment(key))

In [None]:
get_avg_neg_sentiment('soggy')

In [None]:
def get_avg_neg_sentiment(keyword):
    sw_s = [sw_.neg_score() for sw_ in list(swn.senti_synsets(keyword,'a'))]
    if len(sw_s)>0:
        return sum(sw_s)/len(sw_s)
    else:
        return 0

In [None]:
[swk.neg_score() for swk in list(swn.senti_synsets('raw','a'))]

In [None]:
list(wn.synsets("best",'a'))[2].definition()

In [None]:
[k.pos_score() for k in swn.senti_synsets('raw','a')]

In [None]:
list(wn.synsets("burnt",'a'))[1].definition()

In [None]:
def get_max_avg_sentiment(keyword):
    sw_s_neg = [sw_.neg_score() for sw_ in list(swn.senti_synsets(keyword,'a'))]
    sw_s_pos = [sw_.pos_score() for sw_ in list(swn.senti_synsets(keyword,'a'))]
    if len(sw_s_pos)>0:
        avg_max = max(sw_s_pos)-max(sw_s_neg)
        return avg_max
    else:
        return 0
#     if len(sw_s)>0:
#         return sum(sw_s)/len(sw_s)
#     else:
#         return 0

In [None]:
count = 0
data_dict = list()
for keyword in keywords_count:
    count += 1
    for key in keyword_dict[keyword]:
        od = OrderedDict()
        od["noun_count"] = keywords_count[keyword] 
        od["noun_adj_count"] = keyword_dict[keyword][key]
        od["noun"] = keyword
        od["adj"] = key 
        od["max_diff_sentiment"]=get_max_avg_sentiment(key)
        data_dict.append(od)
    if count >=30:
        break

In [None]:
import pandas as pd
pd.DataFrame(data_dict).to_excel("/Users/rahulreddy/Downloads/keyword counts and sentiment.xlsx", index=False)