-
Notifications
You must be signed in to change notification settings - Fork 0
/
semanticQG.py
201 lines (159 loc) · 9.76 KB
/
semanticQG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import os
import re

import contractions
import spacy
from allennlp.predictors.predictor import Predictor
# Reads in a transcript, segments it into sentences, and matches each sentence against the pre-defined
# rules to generate questions, which are then written to an output file:
def generateQuestions(predictor, nlp, transcriptFileName):
sentences = [] # Stores all the sentences in the transcript
transcript = open(transcriptFileName + ".txt") # Reads in the transcript
output = open(transcriptFileName + 'SemanticQuestions.txt','w') # File to which output questions (and the sentences used) are written
text = transcript.read() # Read all the text into one string
text = contractions.fix(text) # Remove contractions (these can mess up the SRL)
doc = nlp(text) # Used to split the text into sentences
# Intilialise some variables for coverage statistics
totalSentences = 0
numSentences = 0
numQuestions = 0
# Loop through the sentences filtering out blank lines and start/end labels
for sent in doc.sents:
totalSentences += 1
sent = str(sent).replace("\n"," ")
sent = str(sent).replace("[MUSIC]","")
sent = str(sent).replace("[SOUND]","")
sentences.append(sent)
# Loop through the sentences and run the SRL Model on them
for sent in sentences:
questions = [] # Stores the generated questions for this sentence
result = predictor.predict(sent) # Run the SRL Model
tagDict = {} # Will store the SRL tags and phrases
# Loop through the output and reformat it to better meet the programs needs
# Format of TagDict = {verb1: [{SRL Tag: phrase}, {SRL Tag: phrase},...], verb2: [{SRL Tag: phrase}, {SRL Tag: phrase},...],...}
for verbs in result['verbs']:
verb = verbs['verb']
description = verbs['description']
tagDict[verb] = []
for i in description:
if i == '[':
tag = description[description.find('[')+1:description.find(':')]
word = description[description.find(':')+1:description.find(']')]
description = description[description.find(']')+1::]
arr = tagDict[verb]
arr.append({tag:word})
tagDict[verb] = arr
# Loop through each list of the tagDict (each list corresponds to a different verb)
# For each list extract the keys (SRL Tag) and values (Phrases) into parallel lists
for list in tagDict.values():
keys = []
values = []
# Append each of these phrases and keys into a parallel list
for dict in list:
for key in dict.keys():
keys.append(key)
values.append(dict[key])
# Check if the keys in the sentences match any of the QG rules
#################### WHY RULES #########################
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-PRP' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argMOD = values[keys.index('ARGM-MOD')]
argPRP = values[keys.index('ARGM-PRP')]
question = "Why" + argMOD + arg0 + verb + arg1 + '?' # PRP (Purpose) is the answer
questions.append("Question: " + question + '\n')
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-CAU' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argMOD = values[keys.index('ARGM-MOD')]
argCAU = values[keys.index('ARGM-CAU')]
question = "Why" + argMOD + arg0 + verb + arg1 + '?' # CAU (CAU) is the answer
questions.append("Question: " + question + '\n')
#################### WHAT RULES #########################
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-MOD' in keys) and ('ARGM-PRP' in keys) and ('ARG1' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argPRP = values[keys.index('ARGM-PRP')]
argMOD = values[keys.index('ARGM-MOD')]
question = "What" + argMOD + arg0 + verb + argPRP + '?' # arg1 is the answer
questions.append("Question: " + question + '\n')
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-MOD' in keys) and ('ARGM-CAU' in keys) and ('ARG1' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argCAU = values[keys.index('ARGM-CAU')]
argMOD = values[keys.index('ARGM-MOD')]
question = "What" + argMOD + arg0 + verb + argCAU + '?' # arg1 is the answer
questions.append("Question: " + question + '\n')
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-EXT' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argEXT = values[keys.index('ARGM-EXT')]
argMOD = values[keys.index('ARGM-MOD')]
question = "To what extent" + argMOD + arg0 + verb + arg1 + '?' # argEXT (Extent) is the answer
questions.append("Question: " + question + '\n')
#################### HOW RULES #########################
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-MNR' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argMNR = values[keys.index('ARGM-MNR')]
argMOD = values[keys.index('ARGM-MOD')]
question = "How" + argMOD + arg0 + verb + arg1 +'?' # MNR (Manner) is the answer
questions.append("Question: " + question + '\n')
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-ADV' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argADV = values[keys.index('ARGM-ADV')]
argMOD = values[keys.index('ARGM-MOD')]
question = "How" + argMOD + arg0 + verb + arg1 +'?' # ADV (Adverb) is the answer
questions.append("Question: " + question + '\n')
#################### WHEN RULES #########################
if (('V' in keys) and ('ARG0' in keys) and ('ARGM-TMP' in keys) and ('ARG1' in keys) and ('ARGM-MOD' in keys)):
argTMP = values[keys.index('ARGM-TMP')].strip()
# Extra logic to prevent the generation of "when" questions if the argTMP phrase is a certain value
if argTMP.lower() != "then" and argTMP.lower() != "now" and argTMP.lower() != "before" and argTMP.lower() != "after" and argTMP.lower() != "again" and argTMP.lower() != "eventually":
verb = values[keys.index('V')]
arg0 = values[keys.index('ARG0')]
arg1 = values[keys.index('ARG1')]
argMOD = values[keys.index('ARGM-MOD')]
question = "When" + argMOD + arg0 + verb + arg1 +'?' # TMP (Temporal) is the answer
questions.append("Question: " + question + '\n')
# If there are questions, write them to the output file as well as the sentence from which they were derived
if len(questions) != 0:
numSentences += 1
output.write("--------------------------" + '\n')
sent = sent.strip()
output.write("Sentence: "+ sent + '\n')
for question in questions:
numQuestions +=1
output.write(question)
# Display stats about the generated questions
print("Total Sentences:", totalSentences)
print("Generated" , numQuestions, "questions from", numSentences, "of the sentences.")
# Close the input and output file
transcript.close()
output.close()
if __name__ == "__main__":
    # Load both models once; the same instances are passed to every generateQuestions call below.
    print("Loading SRL and Spacy Models...")
    # The SRL archive is loaded from a local file. Remote alternative for Predictor.from_path:
    # "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
    predictor = Predictor.from_path("structured-prediction-srl-bert.2020.12.15.tar.gz")
    nlp = spacy.load("en_core_web_sm")

    # Input loop: keep asking for a transcript name (given without ".txt") until the
    # user types "stop" in any letter case.
    while True:
        fileStem = input('Enter Content File Name (type "stop" to close the program): ')
        if fileStem.upper() == "STOP":
            break
        # Basic existence check before doing any work; re-prompt when it fails.
        if not os.path.isfile(fileStem + ".txt"):
            print("File Not Found.\nPlease enter the name of a valid .txt file (excluding the .txt extension).")
            continue
        print("Generating Questions...")
        generateQuestions(predictor, nlp, fileStem)
        print("Questions Saved to: " + fileStem + "SemanticQuestions.txt")