In [None]:
import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m pip install textacy

In [None]:
import csv
import spacy
import textacy
# Load the English pipeline once at module level; isQuestion() calls this
# shared `nlp` object to parse every sentence it is given.
nlp = spacy.load('en_core_web_sm')

def isNegative(token,sentence):
    """
    token,tokens.doc -> bool
    Reports whether `token` is directly negated inside `sentence`.

    A word counts as negated when some word of the sentence carries the
    'neg' dependency label and has `token` as its head.
    """
    return any(
        candidate.dep_ == 'neg' and candidate.head == token
        for candidate in sentence
    )

def inClause(token,sentence):
    """
    token,tokens.doc -> bool
    Decides whether `token` sits in a subordinate clause of `sentence`
    (True) or belongs to the main clause (False).

    The token's chain of heads is followed upwards until the ROOT:
      * any dependency label listed in `subordinate_labels` means True;
      * a 'ccomp' or 'conj' link is decided by the lemma of its head:
        a verb of knowing gives True unless that verb is negated,
        a verb of asking gives False, anything else gives True;
      * if the ROOT is reached, the result is True unless the ROOT is
        tagged VERB or AUX.
    """
    subordinate_labels = ('advcl','relcl','csubj','csubjpass','pcomp','xcomp','acl','aux')
    knowing_verbs = ('know','understand','see','get')
    asking_verbs = ('wonder','ask','inquire','demand')

    current = token
    while current.dep_ != 'ROOT':
        label = current.dep_
        if label in subordinate_labels:
            return True
        if label in ('ccomp', 'conj'):
            governor = current.head
            if governor.lemma_ in knowing_verbs:
                # "know who ..." stays embedded; "don't know who ..." is treated as main clause
                return not isNegative(governor, sentence)
            if governor.lemma_ in asking_verbs:
                return False
            return True
        current = current.head
    # reached the ROOT: the phrase head is expected to be a verb
    return current.pos_ not in ('VERB', 'AUX')

def isSubject(token):
    """
    token -> bool
    Reports whether `token` lies inside the subject noun phrase.

    Heads are followed upwards from `token`; meeting an 'nsubj' link before
    the ROOT means the starting word belonged to the subject phrase.
    """
    node = token
    while True:
        relation = node.dep_
        if relation == 'ROOT': # reached the main verb without passing a subject head
            return False
        if relation == 'nsubj':
            return True
        node = node.head

def inPhrase(token1, token2):
    """
    token, token -> bool
    Reports whether `token2` is `token1` itself or one of its ancestors,
    i.e. whether `token1` lies in the phrase headed by `token2`.

    The chain of heads is followed from `token1` up to and including the ROOT.
    """
    node = token1
    while True:
        if node == token2:
            return True
        if node.dep_ == 'ROOT': # top of the tree reached without a match
            return False
        node = node.head
    
def hasAux(token, sentence):
    """
    token, tokens.doc -> bool
    Reports whether `token` is an auxiliary or is accompanied by one.

    Returns True immediately when `token` itself is tagged AUX. Otherwise,
    for every word of `sentence` that lies in the phrase headed by `token`,
    the chain of heads from that word up to (but not including) the ROOT is
    scanned for a word tagged AUX.
    """
    if token.pos_ == 'AUX':
        return True
    for word in sentence:
        if not inPhrase(word, token):
            continue
        ancestor = word
        while ancestor.dep_ != 'ROOT':
            if ancestor.pos_ == 'AUX':
                return True
            ancestor = ancestor.head
    return False

def isYNQuestion(sentence):
    """
    tokens.doc -> bool
    Takes a parsed sentence (a sequence of tokens, not a raw string)
    Returns whether the sentence is a yes or no question

    A yes/no question is recognised by an auxiliary in first position that is
    immediately followed by a word belonging to the subject. Inputs with fewer
    than two tokens cannot match this pattern and yield False instead of
    raising IndexError.
    """
    if len(sentence) < 2: #need both an auxiliary and a following word to inspect
        return False
    first, second = sentence[0], sentence[1]
    if first.pos_ != 'AUX': #yn questions start with an auxiliary verb
        return False
    if not isSubject(second): #it is followed by the subject
        return False
    if first.dep_ == 'aux':
        return first.head.dep_ == 'ROOT' #eliminates some rare adverbial clauses
    return True

def isWHQuestion(sentence):
    """
    tokens.doc -> bool
    Takes a parsed sentence (a sequence of tokens)
    Returns whether the sentence is a wh-question, i.e. whether at least one
    wh-word (who, what, where, ...) occurs outside a subordinate clause
    """
    wh_lemmas = ('who', 'what', 'where', 'when', 'why', 'how', 'which', 'whose', 'whence', 'whither', 'whom')
    # a wh-word only signals a question when it belongs to the main clause
    return any(
        word.lemma_ in wh_lemmas and not inClause(word, sentence)
        for word in sentence
    )

def isQuestion(sentence):
    """
    str -> bool
    Takes a string representing a sentence
    Returns whether the sentence is a question

    The string is parsed with the module-level `nlp` pipeline; anything shorter
    than three tokens is rejected outright, otherwise the sentence qualifies if
    it is either a yes/no question or a wh-question.
    """
    parsed = nlp(sentence) #tokenize and parse the sentence
    if len(parsed) >= 3:
        return isYNQuestion(parsed) or isWHQuestion(parsed)
    return False

def extractQuestions(text):
    """
    .csv -> list
    Takes an iterable of rows (e.g. a csv reader); only its first row is used
    and is expected to hold the sentences
    Returns the sentences of that row that are questions, in order, as a list
    """
    first_row = list(text)[0] #materialize the rows and keep the first one
    return [sentence for sentence in first_row if isQuestion(sentence)]

In [None]:
import pandas as pd
import re
from cleantext import clean

def extractQuestionsStr(sentences):
    """
    list -> list
    Takes an iterable of strings, each representing one sentence
    Returns, in the original order, the sentences that isQuestion accepts
    """
    return [candidate for candidate in sentences if isQuestion(candidate)]

# Load the review export and keep only the rows that actually have review text.
raw_data = (
    pd.read_csv("GOOGLE_REVIEWS_Walmart_Guelph_1_2019-09-30.csv")
    .dropna(subset=['Review Text'])
    .reset_index(drop=True)
)
reviews = raw_data['Review Text']
# newline='' is what the csv module expects for files it writes; without it
# every row is followed by a blank line on platforms that translate '\n'.
with open('output.csv',mode='w',newline='') as out_file:
    writer = csv.writer(out_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for review in reviews:
        raw_sentences = re.split('[ ]*[.?!;\n]+[ \n]*',review) #split text up into sentences, remove whitespace
        cleaned_sentences = map(lambda s: clean(s.encode('ascii',errors='ignore').decode()), raw_sentences) #remove non-ascii characters
        questions = extractQuestionsStr(cleaned_sentences)
        print(review)
        print(questions)
        if questions: #only reviews containing at least one question produce a row
            writer.writerow(questions)

In [21]:
def test(token):
    print(token.text)
    temp = token
    temp = temp.head
    print(token.text)

# Debugging aid: dump the dependency parse of one tricky review sentence, then
# show how isQuestion classifies it (the bare last expression is the cell output).
sample_sentence = "My wife was picking up some water softener salt and asked for assistance loading it into the car and was told by the cashier that she didn't know who could help"
text = nlp(sample_sentence)
for token in text:
    print(token.text, token.tag_, token.pos_, token.head.text, token.dep_)
isQuestion(sample_sentence)

My PRP$ PRON wife poss
wife NN NOUN picking nsubj
was VBD AUX picking aux
picking VBG VERB picking ROOT
up RP ADP picking prt
some DT DET salt det
water NN NOUN softener compound
softener NN NOUN salt compound
salt NN NOUN picking dobj
and CC CCONJ picking cc
asked VBD VERB picking conj
for IN ADP asked prep
assistance NN NOUN for pobj
loading VBG VERB assistance acl
it PRP PRON loading dobj
into IN ADP loading prep
the DT DET car det
car NN NOUN into pobj
and CC CCONJ picking cc
was VBD AUX told auxpass
told VBN VERB picking conj
by IN ADP told agent
the DT DET cashier det
cashier NN NOUN by pobj
that IN SCONJ know mark
she PRP PRON know nsubj
did VBD AUX know aux
n't RB PART know neg
know VB VERB told ccomp
who WP PRON help nsubj
could MD AUX help aux
help VB VERB help ROOT


False

In [None]:
# Run the question extractor over a plain-text file instead of the review CSV.
with open("input.txt") as f_obj:
    raw_text = f_obj.read()
# same sentence-splitting pattern as used for the reviews
raw_sentences = re.split('[ ]*[.?!;\n]+[ \n]*',raw_text)
# drop any non-ascii characters before parsing
cleaned_sentences = (s.encode('ascii',errors='ignore').decode() for s in raw_sentences)
questions = extractQuestionsStr(cleaned_sentences)
print(questions)