In [1]:
!pip install --upgrade pip
!pip install spacy --quiet
!python -m spacy download en_core_web_sm --quiet

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy
from spacy.lang.en.examples import sentences
import pandas as pd

In [3]:
# Load the `en_core_web_sm` English pipeline once; the functions defined in
# later cells read this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

'Apple is looking at buying U.K. startup for $1 billion'

In [5]:
from pprint import pprint as pp

def split_tokens(text):
    """Run ``text`` through the loaded spaCy pipeline and tabulate its tokens.

    Args:
        text: The string to pass to the module-level ``nlp`` pipeline.

    Returns:
        A ``pandas.DataFrame`` with one row per token, in document order, and
        the columns ``text``, ``lemma_``, ``pos_``, ``tag_``, ``dep_``,
        ``is_alpha`` and ``is_stop`` (each read from the token attribute of
        the same name). Input that produces no tokens yields an empty frame
        that still carries these columns.
    """
    cols = ['text', 'lemma_', 'pos_', 'tag_', 'dep_', 'is_alpha', 'is_stop']
    doc = nlp(text)

    # Collect every row first and build the frame in one go.
    # ``DataFrame.append`` was removed in pandas 2.0, and appending one token
    # at a time copied the whole frame on every iteration.
    rows = [[getattr(token, col) for col in cols] for token in doc]
    return pd.DataFrame(rows, columns=cols)

# Show the token table for the first of spaCy's bundled English example sentences.
split_tokens(sentences[0])

Unnamed: 0,text,lemma_,pos_,tag_,dep_,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,True,False
1,is,be,AUX,VBZ,aux,True,True
2,looking,look,VERB,VBG,ROOT,True,False
3,at,at,ADP,IN,prep,True,True
4,buying,buy,VERB,VBG,pcomp,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,False,False
6,startup,startup,VERB,VBD,dep,True,False
7,for,for,ADP,IN,prep,True,True
8,$,$,SYM,$,quantmod,False,False
9,1,1,NUM,CD,compound,False,False


In [6]:
# A longer, multi-sentence review comment; reused by a later cell that calls
# get_suggestions(comment).
comment="Mustafa, The datasets are interesting, but I am not sure how you will connect it to the road conditions. You also shared a very specific case which again I am not sure how can be obtained from the data. What you can do instead is to look how traffic accidents and violations relates. Do violations require a police presence? Then you can estimate when police was present and it was not. and that can be a reasoning to connect to accidents. Let me know what you think. Dmitry"

split_tokens(comment)

Unnamed: 0,text,lemma_,pos_,tag_,dep_,is_alpha,is_stop
0,Mustafa,Mustafa,PROPN,NNP,nsubj,True,False
1,",",",",PUNCT,",",punct,False,False
2,The,the,DET,DT,det,True,True
3,datasets,dataset,NOUN,NNS,nsubj,True,False
4,are,be,AUX,VBP,ROOT,True,True
...,...,...,...,...,...,...,...
92,what,what,PRON,WP,dobj,True,True
93,you,you,PRON,PRP,nsubj,True,True
94,think,think,VERB,VBP,ccomp,True,False
95,.,.,PUNCT,.,punct,False,False


In [7]:
# A short sentence that opens with a modal ("Could") immediately followed by
# a base-form verb ("include"), as the recorded token table shows (MD, then VB).
text = "Could include more examples in your lab report."
split_tokens(text)

Unnamed: 0,text,lemma_,pos_,tag_,dep_,is_alpha,is_stop
0,Could,could,AUX,MD,aux,True,True
1,include,include,VERB,VB,ROOT,True,False
2,more,more,ADJ,JJR,amod,True,True
3,examples,example,NOUN,NNS,dobj,True,False
4,in,in,ADP,IN,prep,True,True
5,your,your,PRON,PRP$,poss,True,True
6,lab,lab,NOUN,NN,compound,True,False
7,report,report,NOUN,NN,pobj,True,False
8,.,.,PUNCT,.,punct,False,False


## Detecting Suggestions
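If a word is tagged as a modal (MD) and the word directly following it is tagged as one of the verb types VB, VBZ, or VBP, the review is classified as a suggestion. The implementation below approximates this rule with coarse-grained POS tags (one or more AUX tokens followed by a VERB) and additionally matches the action verbs "try", "think", and "get" on their own.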

In [8]:
from spacy.matcher import Matcher

In [9]:
# Coarse-grained part-of-speech from the Universal POS tag set.(https://universaldependencies.org/u/pos/)
# The tag VERB covers PTB tags VB, VBP, VBZ, VBD, VBG, VBN, except for auxiliary verb uses of be, have, do, and get.
# (Auxiliary verbs and modals are AUX and the infinitive to is PART.)
def get_suggestions(text):
    """Print the suggestion-like spans that a spaCy ``Matcher`` finds in ``text``.

    Two token patterns are registered under the single label ``"AUXILLARY"``:

    * one or more ``AUX`` tokens followed by a ``VERB`` token;
    * a lone ``VERB`` token whose lemma is ``try``, ``think`` or ``get``.

    The patterns are added with ``greedy="LONGEST"``.

    Args:
        text: The string to analyse with the module-level ``nlp`` pipeline.

    Returns:
        None. The number of matches is printed first, followed by one line per
        match showing the ``(match_id, start, end)`` tuple and the matched span.
    """
    action_verbs = ['try', 'think', 'get']
    aux_then_verb = [{"POS": "AUX", "OP": "+"}, {"POS": "VERB"}]
    action_verb_alone = [{"POS": "VERB", "LEMMA": {"IN": action_verbs}}]

    suggestion_matcher = Matcher(nlp.vocab)
    suggestion_matcher.add(
        "AUXILLARY",
        [aux_then_verb, action_verb_alone],
        greedy="LONGEST",
    )

    parsed = nlp(text)
    found = suggestion_matcher(parsed)
    print(len(found))

    # Each match is a (match_id, start, end) triple; start/end are token
    # offsets into the parsed document.
    for match_id, start, end in found:
        print((match_id, start, end), parsed[start:end])

In [60]:
# Print the pattern matches found in the longer review comment defined in an earlier cell.
# NOTE(review): this cell's execution count (60) is out of sequence with its
# neighbours (9 and 10); restart the kernel and run all cells before sharing.
get_suggestions(comment)

4
(14195895055726482716, 36, 39) can be obtained
(14195895055726482716, 14, 16) will connect
(14195895055726482716, 67, 69) can estimate
(14195895055726482716, 94, 95) think


In [10]:
# Imperative-style feedback; the recorded matches are the action verbs try / Think / get.
# NOTE(review): "formualte" looks like a typo for "formulate"; left unchanged
# because editing the input could alter the tagging and the recorded output.
t = "you have to try multiplying the numbers. Think of edge cases and formualte an equation to get the answer."
get_suggestions(t)

3
(14195895055726482716, 3, 4) try
(14195895055726482716, 8, 9) Think
(14195895055726482716, 17, 18) get
