# Libraries

In [1]:
%%capture
! yes y | pip uninstall torchtext
! yes y | pip uninstall torchaudio
! yes y | pip uninstall flask
! yes y | pip uninstall en-core-web-sm
! pip install pattern
! pip install -U pip setuptools wheel
! pip install -U spacy==3.1.0
! python -m spacy download en_core_web_sm
! pip install allennlp allennlp-models

In [2]:
%%capture
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from copy import deepcopy
from nltk.stem.wordnet import WordNetLemmatizer
import matplotlib.pyplot as plt
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.corpus import conll2000, wordnet
import re
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG, PAST, PL
import random
import spacy
from spacy import displacy
import inflect
# Rebind `inflect` from the module to an engine instance; later cells call
# methods such as `inflect.singular_noun` on this object.
inflect = inflect.engine()

# Load the language model
nlp = spacy.load("en_core_web_sm")

wl = WordNetLemmatizer()

# NLTK resources requested by this notebook; each one is fetched once.
for nltk_resource in (
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt',
    'maxent_ne_chunker',
    'words',
    'conll2000',
):
  nltk.download(nltk_resource)

# Pretrained AllenNLP predictors, loaded from their public archives:
# semantic role labelling, coreference resolution and named-entity tagging.
SRL_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")
CR_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
NER_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz")

# Helper Functions

In [None]:
def import_error():
  """Run the `pattern.en` import inside a function scope.

  The imported names are local to this call and are discarded on return.
  """
  # NOTE(review): presumably exists to trigger (and get past) an error raised
  # by the first import of `pattern` -- confirm the intent.
  from pattern.en import conjugate, lemma, lexeme, PRESENT, SG, PAST, PL

In [4]:
def find_cluster(cluster, words):
  """Print the text of every mention in one coreference cluster.

  Args:
    cluster: iterable of mentions; each mention is indexable, with the first
      token index at position 0 and the last (inclusive) at position 1.
    words: token list that the indices refer to.

  Returns:
    None. Each mention is printed on its own line, tokens joined by spaces.
  """
  for mention in cluster:
    print(' '.join(words[mention[0]: mention[1] + 1]))

In [5]:
def find_arg(words, tags, tag_id):
  """Collect the words whose tag contains `tag_id`.

  Tags containing 'R-ARG' are skipped even when they contain `tag_id`.

  Args:
    words: tokens of one sentence.
    tags: one tag per token, aligned with `words`.
    tag_id: substring to look for in each tag (e.g. 'ARG0').

  Returns:
    A tuple `(text, exists, indexes)`:
      text: the matching words joined by single spaces ('' when none match).
      exists: True when `text` is non-empty.
      indexes: `[first_index, last_index]` of the matching words, or `[]`
        when nothing matched.
  """
  matches = [
      (word, index)
      for index, (word, tag) in enumerate(zip(words, tags))
      if tag_id in tag and 'R-ARG' not in tag
  ]
  real_founds = ' '.join(word for word, _ in matches)
  indexes = [matches[0][1], matches[-1][1]] if matches else []
  return real_founds, real_founds != '', indexes

In [6]:
def find_sents_bounds(string):
  """Return the character span of every sentence of `string`.

  Sentences come from `sent_tokenize`. Each one is searched for starting at
  the end of the previous sentence, so a sentence whose text also occurs
  earlier in the passage still gets its own, later, span.

  Args:
    string: the passage to split.

  Returns:
    A list with one `[start, end]` pair per sentence, where `end` is
    `start + len(sentence)`.

  Raises:
    AssertionError: if a tokenized sentence cannot be located in `string`.
  """
  token_bounds = []
  search_from = 0
  for sent in sent_tokenize(string):
    # Searching from the previous sentence's end keeps spans in order.
    c_token = string.find(sent, search_from)
    if c_token == -1:
      raise AssertionError('sentence not found in source text: %r' % sent)
    token_bounds.append([c_token, c_token + len(sent)])
    search_from = c_token + len(sent)
  return token_bounds

In [7]:
def find_in_sent(doc, span_bound, string, mode='start'):
  """Locate token `doc[span_bound]` inside `string` and return a char offset.

  A token that does not occur exactly once is disambiguated by appending the
  following tokens, tried both glued on and separated by one space, until
  the grown phrase matches exactly once or the tokens run out.

  Args:
    doc: list of token strings.
    span_bound: index into `doc` of the token to locate.
    string: text to search in.
    mode: 'start' returns the offset of the match; 'end' returns that offset
      plus the length of the original token.

  Returns:
    The character offset as an int, or None for any other `mode`.

  Raises:
    ValueError: if no candidate phrase matches `string` at all.
  """
  index = span_bound
  to_find = doc[index]
  to_find_len = len(to_find)
  finds = list(re.finditer(re.escape(to_find), string))
  finds_1 = finds
  finds_2 = finds
  if len(finds) != 1:
    # Stop when the tokens run out instead of spinning forever.
    while index + 1 < len(doc):
      index += 1
      glued = to_find + doc[index]
      spaced = to_find + ' ' + doc[index]

      finds_1 = list(re.finditer(re.escape(glued), string))
      finds_2 = list(re.finditer(re.escape(spaced), string))

      if len(finds_1) == 1:
        break
      elif len(finds_1) > 1:
        to_find = glued
      elif len(finds_2) == 1:
        break
      elif len(finds_2) > 1:
        to_find = spaced

  if len(finds_1) == 1:
    match = finds_1[0]
  elif len(finds_2) == 1:
    match = finds_2[0]
  # Still ambiguous after the last token: take the last occurrence, the same
  # fallback that `find` uses.
  elif len(finds_1) > 1:
    match = finds_1[-1]
  elif len(finds_2) > 1:
    match = finds_2[-1]
  else:
    raise ValueError('token %r (index %d) not found in text' % (doc[span_bound], span_bound))

  if mode == 'start':
    return match.span()[0]
  elif mode == 'end':
    return match.span()[0] + to_find_len

In [8]:
def find(doc, span_bound, mode='start', text=None):
  """Locate token `doc[span_bound]` in the passage and return a char offset.

  A token that does not occur exactly once is disambiguated by appending the
  following tokens, tried both glued on and separated by one space, until
  the grown phrase matches exactly once or the tokens run out. If it is
  still ambiguous, the last occurrence is used.

  Args:
    doc: list of token strings.
    span_bound: index into `doc` of the token to locate.
    mode: 'start' returns the offset of the match; 'end' returns that offset
      plus the length of the original token.
    text: text to search in. When omitted, the module-level `string` is used.

  Returns:
    The character offset as an int, or None for any other `mode`.

  Raises:
    ValueError: if no candidate phrase matches the text at all.
  """
  if text is None:
    # Fall back to the notebook-level passage.
    text = string

  index = span_bound
  to_find = doc[index]
  to_find_len = len(to_find)
  finds = list(re.finditer(re.escape(to_find), text))
  finds_1 = finds
  finds_2 = finds
  if len(finds) != 1:
    while index + 1 < len(doc):
      index += 1
      glued = to_find + doc[index]
      spaced = to_find + ' ' + doc[index]
      finds_1 = list(re.finditer(re.escape(glued), text))
      finds_2 = list(re.finditer(re.escape(spaced), text))

      if len(finds_1) == 1:
        break
      elif len(finds_1) > 1:
        to_find = glued
      elif len(finds_2) == 1:
        break
      elif len(finds_2) > 1:
        to_find = spaced

  if len(finds_1) == 1:
    match = finds_1[0]
  elif len(finds_2) == 1:
    match = finds_2[0]
  elif len(finds_1) > 1:
    match = finds_1[-1]
  elif len(finds_2) > 1:
    match = finds_2[-1]
  else:
    raise ValueError('token %r (index %d) not found in text' % (doc[span_bound], span_bound))

  if mode == 'start':
    return match.span()[0]
  elif mode == 'end':
    return match.span()[0] + to_find_len

In [9]:
def find_sent(bound):
  """Return the index of the sentence whose span contains `bound`.

  Reads the module-level `sents_bounds` list of `[start, end]` spans.

  Args:
    bound: `[start, end]` character offsets of the span to place.

  Returns:
    Index of the first sentence span that encloses `bound`, or None when no
    span does.
  """
  enclosing = (
      position
      for position, limits in enumerate(sents_bounds)
      if limits[0] <= bound[0] and limits[1] >= bound[1]
  )
  return next(enclosing, None)

In [10]:
def not_possessive(coref):
  """Return True when `coref` contains no possessive marker.

  A possessive marker is the clitic "'s" anywhere in the text, or one of the
  words mine / yours / his / her / ours / theirs appearing as a whole word.
  Matching whole words keeps texts such as "other", "hours" or "yourself"
  from being mistaken for possessives.

  Args:
    coref: mention text.

  Returns:
    False if a possessive marker is present, True otherwise.
  """
  lowered = coref.lower()
  if "'s" in lowered:
    return False
  return re.search(r'\b(?:mine|yours|his|her|ours|theirs)\b', lowered) is None

In [11]:
def find_pronoun(coref):
  """Return `coref` lower-cased if it is a subject pronoun, else None.

  The recognised pronouns are i, you, he, she, we and they.
  """
  lowered = coref.lower()
  subject_pronouns = ('i', 'you', 'he', 'she', 'we', 'they')
  return lowered if lowered in subject_pronouns else None

In [12]:
def find_corresponding_word(char_bound, sent_mapping, mode='start'):
  """Find the word whose character span shares an edge with `char_bound`.

  Args:
    char_bound: `[start, end]` character offsets.
    sent_mapping: dict mapping a word key to its `[start, end]` offsets.
    mode: 'start' compares the start offsets, 'end' compares the end offsets.

  Returns:
    The first key of `sent_mapping` whose span matches on the chosen edge;
    None when nothing matches or `mode` is neither 'start' nor 'end'.
  """
  edge = {'start': 0, 'end': 1}.get(mode)
  for word_key, word_span in sent_mapping.items():
    if edge is not None and word_span[edge] == char_bound[edge]:
      return word_key
  return None

In [13]:
def print_ner(sent):
  """Print every token of `sent` next to the tag `NER_predictor` gives it."""
  prediction = NER_predictor.predict(sent)
  for token_and_tag in zip(prediction['words'], prediction['tags']):
    print(*token_and_tag)

In [14]:
def what_is_coref(all_indexes, coref_bounds):
  """Print which semantic role has exactly the bounds of a coreference.

  Args:
    all_indexes: eight role bounds, in the order extent, time, manner, agent,
      patient, state, cause, verb. Each is `[start, end]` or `[]` when the
      role is absent.
    coref_bounds: `[start, end]` of the coreference mention.

  Prints 'coref is <role>' for the first role whose bounds equal
  `coref_bounds`; prints nothing when no role matches.
  """
  role_labels = ('extent', 'time', 'manner', 'agent', 'patient', 'state', 'cause', 'verb')
  for slot, label in enumerate(role_labels):
    role_bounds = all_indexes[slot]
    if role_bounds == []:
      continue
    if coref_bounds[0] == role_bounds[0] and coref_bounds[1] == role_bounds[1]:
      print('coref is ' + label)
      return

In [15]:
def get_sent(*args):
  """Join text fragments into one sentence.

  Args:
    *args: string fragments, in order.

  Returns:
    The fragments joined by spaces, with every run of two or more whitespace
    characters collapsed to a single space and the ends stripped.
  """
  sent = ' '.join(args)
  # Raw string: '\s' in a plain literal is an invalid escape sequence.
  sent = re.sub(r'\s\s+', ' ', sent)
  return sent.strip()

In [16]:
def extract_entities(sent):
  """Extract named-entity spans from the tags `NER_predictor` gives `sent`.

  Tags are read by their first letter: 'U' is a one-token entity, 'B' opens a
  multi-token entity and 'L' closes it. The entity type is the part of the
  tag after the last '-'.

  Args:
    sent: sentence text.

  Returns:
    A list of `[entity_type, start_token_idx, end_token_idx]` (end
    inclusive), in the order the entities are completed.
  """
  tags = NER_predictor.predict(sent)['tags']
  named_entities = []
  start = 0
  for tag_id, tag in enumerate(tags):
    if tag.startswith('U'):
      start = tag_id
      named_entities.append([tag.split('-')[-1], tag_id, tag_id])
    elif tag.startswith('B'):
      start = tag_id
    elif tag.startswith('L'):
      # Closes the span opened by the most recent 'B' (or 'U') tag.
      named_entities.append([tag.split('-')[-1], start, tag_id])

  return named_entities

In [17]:
def is_person(entities):
  """Return True when 'PER' is contained in `entities`."""
  return 'PER' in entities

In [18]:
def find_semantic_ent_type(entities, semantic_roles_indexes):
  """Group entity types by the semantic role whose span contains them.

  Args:
    entities: list of `[entity_type, start, end]` token spans.
    semantic_roles_indexes: list of role spans, each `[start, end]` or `[]`
      when the role is absent.

  Returns:
    One list of entity types per role, in role order. The result always has
    at least eight slots, and more when more roles are passed.
  """
  # Keep the eight-slot minimum but do not overflow on longer role lists.
  slot_count = max(8, len(semantic_roles_indexes))
  semantic_roles_ent_types = [[] for _ in range(slot_count)]
  for entity in entities:
    for idx, role_bounds in enumerate(semantic_roles_indexes):
      if role_bounds != [] and role_bounds[0] <= entity[1] and role_bounds[1] >= entity[2]:
        semantic_roles_ent_types[idx].append(entity[0])
  return semantic_roles_ent_types

In [19]:
def find_num_ent(entities, semantic_roles_index):
  """Count the entities lying entirely inside one semantic-role span.

  Args:
    entities: list of `[entity_type, start, end]` token spans.
    semantic_roles_index: `[start, end]` span of the role.

  Returns:
    Number of entities with `start >= span start` and `end <= span end`.
  """
  return sum(
      1
      for ent in entities
      if semantic_roles_index[0] <= ent[1] and semantic_roles_index[1] >= ent[2]
  )

In [20]:
def find_corresponding_coref(indexes, sent_idx):
  """Find the coreference clusters with a mention at exactly `indexes`.

  Reads the module-level `chars_spans` and `cluster_names`.

  Args:
    indexes: `[start_word_idx, end_word_idx]` within the sentence, or `[]`.
    sent_idx: index of the sentence the span belongs to.

  Returns:
    `(names, cluster_ids)`: for every matching cluster, its entry from
    `cluster_names` and its index. When nothing matches, `names` is `[]` and
    `cluster_ids` is `[None]`.
  """
  names, cluster_ids = [], []
  for cluster_idx, cluster in enumerate(chars_spans):
    if indexes == []:
      continue
    hit = any(
        mention[1] == sent_idx and mention[2] == indexes[0] and mention[3] == indexes[1]
        for mention in cluster
    )
    if hit:
      names.append(cluster_names[cluster_idx])
      cluster_ids.append(cluster_idx)
  # Callers receive a one-element [None] sentinel when no cluster matched.
  return names, (cluster_ids if cluster_ids else [None])

In [21]:
def simple_verb(verb):
  """Return True when `verb` is one of am / is / are / was / were."""
  return verb in ('am', 'is', 'are', 'was', 'were')

In [22]:
def non_trivial_corefs(corefs):
  """Return the first mention that is not a basic pronoun.

  Args:
    corefs: iterable of mention strings.

  Returns:
    The first string whose lower-cased form is not one of i, you, he, she,
    it, we, they, its; '' when every mention is such a pronoun.
  """
  basic_pronouns = ('i', 'you', 'he', 'she', 'it', 'we', 'they', 'its')
  return next(
      (mention for mention in corefs if mention.lower() not in basic_pronouns),
      '',
  )

In [23]:
class Node:
  """One sentence in the coreference graph.

  Attributes are plain data: the sentence index, the coreference clusters
  that have a mention in the sentence, the outgoing edges and an index of
  those edges by cluster.
  """

  def __init__(self, sent_num, coref_clusters):
    """Store the sentence index and the ids of its coreference clusters."""
    self.sent_num = sent_num
    self.coref_clusters = coref_clusters
    # Each edge is [adjacent_sentence, cluster_id].
    self.edges = list()
    # Filled by make_cluster_2_sent(); empty until then.
    self.cluster_2_edge = dict()

  def set_edge(self, adj_sent, edge_coref_type):
    """Record an edge to `adj_sent` created by cluster `edge_coref_type`."""
    self.edges.append([adj_sent, edge_coref_type])

  def get_edges(self):
    """Return the list of `[adjacent_sentence, cluster_id]` edges."""
    return self.edges

  def get_clusters(self):
    """Return the cluster ids attached to this sentence."""
    return self.coref_clusters

  def make_cluster_2_sent(self):
    """Rebuild `cluster_2_edge`: cluster id -> list of adjacent sentences."""
    self.cluster_2_edge = dict()
    for edge in self.edges:
      self.cluster_2_edge.setdefault(edge[1], []).append(edge[0])

  def __str__(self):
    """Return '<sent_num> <coref_clusters>' without printing anything."""
    return '{} {}'.format(self.sent_num, self.coref_clusters)

  def __repr__(self):
    """Return an unambiguous description of the node."""
    return 'Node(sent_num={!r}, coref_clusters={!r})'.format(
        self.sent_num, self.coref_clusters)

In [24]:
def is_simple_verb(verb):
  """Return True when `verb` is a form of "to be": am, is, are, was, were."""
  be_forms = ('am', 'is', 'are', 'was', 'were')
  return verb in be_forms

In [25]:
def find_in_sent(doc, span_bound, string, mode='start'):
  """Locate token `doc[span_bound]` inside `string` and return a char offset.

  A token that does not occur exactly once is disambiguated by appending the
  following tokens, tried both glued on and separated by one space, until
  the grown phrase matches exactly once or the tokens run out.

  Args:
    doc: list of token strings.
    span_bound: index into `doc` of the token to locate.
    string: text to search in.
    mode: 'start' returns the offset of the match; 'end' returns that offset
      plus the length of the original token.

  Returns:
    The character offset as an int, or None for any other `mode`.

  Raises:
    ValueError: if no candidate phrase matches `string` at all.
  """
  index = span_bound
  to_find = doc[index]
  to_find_len = len(to_find)
  finds = list(re.finditer(re.escape(to_find), string))
  finds_1 = finds
  finds_2 = finds
  if len(finds) != 1:
    # Stop when the tokens run out instead of spinning forever.
    while index + 1 < len(doc):
      index += 1
      glued = to_find + doc[index]
      spaced = to_find + ' ' + doc[index]

      finds_1 = list(re.finditer(re.escape(glued), string))
      finds_2 = list(re.finditer(re.escape(spaced), string))

      if len(finds_1) == 1:
        break
      elif len(finds_1) > 1:
        to_find = glued
      elif len(finds_2) == 1:
        break
      elif len(finds_2) > 1:
        to_find = spaced

  if len(finds_1) == 1:
    match = finds_1[0]
  elif len(finds_2) == 1:
    match = finds_2[0]
  # Still ambiguous after the last token: take the last occurrence, the same
  # fallback that `find` uses.
  elif len(finds_1) > 1:
    match = finds_1[-1]
  elif len(finds_2) > 1:
    match = finds_2[-1]
  else:
    raise ValueError('token %r (index %d) not found in text' % (doc[span_bound], span_bound))

  if mode == 'start':
    return match.span()[0]
  elif mode == 'end':
    return match.span()[0] + to_find_len

In [26]:
def find_sents_bounds(string):
  """Return the character span of every sentence of `string`.

  Sentences come from `sent_tokenize`. Each one is searched for starting at
  the end of the previous sentence, so a sentence whose text also occurs
  earlier in the passage still gets its own, later, span.

  Args:
    string: the passage to split.

  Returns:
    A list with one `[start, end]` pair per sentence, where `end` is
    `start + len(sentence)`.

  Raises:
    AssertionError: if a tokenized sentence cannot be located in `string`.
  """
  token_bounds = []
  search_from = 0
  for sent in sent_tokenize(string):
    # str.find with a start offset avoids copying the tail of the passage.
    c_token = string.find(sent, search_from)
    if c_token == -1:
      # Raised explicitly so the check survives `python -O`.
      raise AssertionError('sentence not found in source text: %r' % sent)
    token_bounds.append([c_token, c_token + len(sent)])
    search_from = c_token + len(sent)
  return token_bounds

# Example

In [27]:
string = '''
Sheldon Lee Cooper,[4][5] Ph.D., Sc.D.,[6] is a fictional character in the CBS television series The Big Bang Theory and its spinoff series Young Sheldon, portrayed by actors Jim Parsons and Iain Armitage respectively (with Parsons as the latter series' narrator).[7] For his portrayal, Parsons won four Primetime Emmy Awards, a Golden Globe Award, a TCA Award, and two Critics' Choice Television Awards. The character's childhood is the focus of Young Sheldon, in which he grows up in East Texas with his family Missy Cooper, George Cooper Sr., George Cooper Jr., Mary Cooper and his MeeMaw as a child prodigy.

The adult Sheldon is a senior theoretical physicist at the California Institute of Technology (Caltech), and for the first ten seasons of The Big Bang Theory shares an apartment with his colleague and best friend, Leonard Hofstadter (Johnny Galecki); they are also friends and coworkers with Howard Wolowitz (Simon Helberg) and Rajesh Koothrappali (Kunal Nayyar). In season 10, Sheldon moves across the hall with his girlfriend Amy Farrah Fowler (Mayim Bialik), in the former apartment of Leonard's wife Penny (Kaley Cuoco).[8]

He has a genius-level IQ of 187. In The Big Bang Theory, it is said that his and Leonard's IQs add up to 360, meaning Leonard has an IQ of 173. In Young Sheldon Season 7 Episode 7, when Sheldon was studying at home and was commanded to answer the phone, he became annoyed and stated that he is treated like a receptionist at home, despite having an IQ of 187, directly confirming the number. However, he displays a fundamental lack of social skills, a tenuous understanding of humor (always ending with "bazinga"), and difficulty recognizing irony and sarcasm in other people, although he himself often employs them. He exhibits highly idiosyncratic behavior and a general lack of humility, empathy, and toleration. These characteristics provide the majority of the humor involving him, which are credited with making him the show's breakout character.[9][10][11][12] Some viewers have asserted that Sheldon's personality is consistent with autism spectrum disorder (or what used to be classified as Asperger's Syndrome).[11][13] Co-creator Bill Prady has stated that Sheldon's character was neither conceived nor developed with regard to Asperger's,[13] although Parsons has said that in his opinion, Sheldon "couldn't display more facets" of Asperger's syndrome.[14]

The character of Sheldon Cooper was inspired by a computer programmer personally known to series co-creator Bill Prady.[15] He and his friend Leonard Hofstadter are named in honor of actor/producer Sheldon Leonard[16] and Nobel Prize Laureate Leon Cooper.[17] Chuck Lorre originally intended Johnny Galecki to play the role, but Galecki thought he would be "better suited" for the character of Leonard.[18] Lorre said that when Jim Parsons auditioned for the role, he was "so startlingly good" that he was asked to reaudition "to make sure he hadn't gotten lucky".[19]

Sheldon and his fraternal twin sister, Missy, were born on February 26, 1980, at Lawrence Memorial Hospital in Galveston, Texas,[20] and raised in Medford, a fictional small town in East Texas that is a three-hour drive from Dallas, along with their older brother, George Jr., by their mother, Mary Cooper, an overtly devout Baptist, and their father, George Cooper Sr., a football coach.[21] His first word was 'hypotenuse'; he said this at four months old.[22] Sheldon once got his father fired when he told Mr. Hinckley, a store owner, that George was stealing from the cash register.[23] In Young Sheldon, this is retconned: his father is a football coach who was fired from his coaching position in Galveston because he disclosed that other coaches were illegally recruiting players to their school, forcing the family to return to Medford.[24] He does drink, mostly beer, and is a loving father who is trying to understand his intellectually gifted son. The only member of his family to have actively encouraged his work in science was his maternal grandfather, whom he cherished and affectionately called "Pop-Pop", and who died when Sheldon was five years old. Pop-Pop's loss is what caused Sheldon to not like Christmas very much when his Christmas wish to bring Pop-Pop back did not come true. Sheldon's closest relative is his maternal grandmother whom he affectionately calls "Meemaw", and who in turn calls him "Moon Pie".[25] His aunt was also said to have encouraged his work in science by giving him medical equipment, "in case his work in physics failed, he'd have a 'trade' to fall back on". In Young Sheldon, it is shown that his childhood friend Tam was the one who introduced him to non-scientific interests such as comic books and Dungeons & Dragons.
'''


# Remove bracketed markers such as "[7]", "[a]" and "[note 3]" from the
# passage. Raw strings keep the regex escapes from being read as (invalid)
# string escapes.
string = re.sub(r'\[\d*\]', '', string)
string = re.sub(r'\[\w*\]', '', string)
string = re.sub(r'\[note \d+\]', '', string)

# Sentence split and coreference prediction both run on the cleaned passage.
sents = sent_tokenize(string)
C = CR_predictor.predict(string)
clusters = C['clusters']
doc = C['document']

# Show every cluster, one mention per line, with a separator after each.
for cluster in clusters:
  find_cluster(cluster, doc)
  print('--------------------------------')

# Character span of each sentence inside the passage.
sents_bounds = find_sents_bounds(string)

# One SRL prediction per sentence.
sents_semantic_roles = []
for sent in sents:
  semantic_roles = SRL_predictor.predict(sent)
  sents_semantic_roles.append(semantic_roles)


# For every sentence: SRL word index -> [start, end] character offsets within
# that sentence. The token '.' is left out of the mapping.
sent_mappings = list()
for sent_idx, sent in enumerate(sents):
  sent_mapping = dict()
  for word_idx, word in enumerate(sents_semantic_roles[sent_idx]['words']):
    if word != '.':
      word_start = find_in_sent(sents_semantic_roles[sent_idx]['words'], word_idx, sent, 'start')
      sent_mapping[word_idx] = [word_start, word_start + len(word)]
  sent_mappings.append(sent_mapping)

# chars_spans[cluster] is a list of
# [mention_text, sent_idx, start_word_idx_in_sent, end_word_idx_in_sent].
chars_spans = []
pronoun = None

for spans in clusters:
  chars_span = list()
  for span in spans:
    start_char = find(doc, span[0], mode='start')
    end_char = find(doc, span[1], mode='end')
    sent_idx = find_sent([start_char, end_char])
    coref = string[start_char: end_char]
    # Keeps the first subject pronoun met across all clusters.
    pronoun = find_pronoun(coref) if pronoun is None else pronoun
    start_char_in_sent = start_char - sents_bounds[sent_idx][0]
    end_char_in_sent = end_char - sents_bounds[sent_idx][0]

    start_word_idx_in_sent = find_corresponding_word([start_char_in_sent, end_char_in_sent], sent_mappings[sent_idx], 'start')
    end_word_idx_in_sent = find_corresponding_word([start_char_in_sent, end_char_in_sent], sent_mappings[sent_idx], 'end')

    # Possessive mentions are dropped from the cluster.
    if not_possessive(coref):
      chars_span.append([coref, sent_idx, start_word_idx_in_sent, end_word_idx_in_sent])
  chars_spans.append(chars_span)

# Mention texts of every cluster.
cluster_names = []
for cluster in chars_spans:
  cluster_names.append([coref[0] for coref in cluster])

# cluster index -> list of the sentences that contain one of its mentions.
cluster_2_sent = dict()
for cluster_idx, cluster in enumerate(chars_spans):
  cluster_2_sent[cluster_idx] = set()
  for coref in cluster:
    cluster_2_sent[cluster_idx].add(coref[1])
  cluster_2_sent[cluster_idx] = list(cluster_2_sent[cluster_idx])

# constructing graph
# One node per sentence, carrying the clusters mentioned in that sentence.
nodes = list()
for sent_idx in range(len(sents)):
  clusters_2_put = list()
  for cluster_idx, cluster in enumerate(chars_spans):

    if sent_idx in cluster_2_sent[cluster_idx]:
      clusters_2_put.append(cluster_idx)
  nodes.append(Node(sent_idx, clusters_2_put))

# Two sentences get one edge per cluster they share; afterwards each node
# indexes its edges by cluster.
for node_1_idx, node_1 in enumerate(nodes):
  for node_2_idx, node_2 in enumerate(nodes):
    if node_1_idx != node_2_idx:
      intersects = list(set(node_1.coref_clusters) & set(node_2.coref_clusters))
      if intersects != []:
        for intersect in intersects:
          node_1.set_edge(node_2_idx, intersect)
  node_1.make_cluster_2_sent()

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length


The Big Bang Theory
its
The Big Bang Theory
The Big Bang Theory
the show 's
--------------------------------
Jim Parsons
Parsons
his
Parsons
Parsons
his
Jim Parsons
he
he
he
--------------------------------
its spinoff series Young Sheldon
the latter series '
Young Sheldon
Young Sheldon
--------------------------------
Sheldon Lee Cooper , Ph.D. , Sc . D. ,
The character 's
he
his
his
The adult Sheldon
his
Sheldon
his
He
his
Sheldon
he
he
he
he himself
himself
He
him
him
Sheldon 's
Sheldon 's
Sheldon 's character
Sheldon
The character of Sheldon Cooper
He
his
the role
the role
Sheldon
his
His
he
Sheldon
his
Young Sheldon
his
his intellectually gifted son
his
his
his
he
Sheldon
Sheldon
his
Sheldon 's
his
he
him
His
his
him
his
he
Young Sheldon
his
him
--------------------------------
his colleague and best friend , Leonard Hofstadter ( Johnny Galecki )
Leonard 's
Leonard 's
Leonard
--------------------------------
irony and sarcasm
them
--------------------------------
highly idiosyncra

In [None]:
# Undirected sentence graph: vertices are sentence indices, and two sentences
# are linked when a node lists the other in its edges.
G = nx.Graph()
G.add_nodes_from(range(len(nodes)))

for node_idx, node in enumerate(nodes):
  for edge in node.edges:
    G.add_edge(node_idx, edge[0])

# Spring layout, drawn with the sentence index as each vertex label.
pos = nx.spring_layout(G)
nx.draw(G, pos=pos, with_labels=True)

# More Functions

In [None]:
def find_verbs(pos_tags):
  """Return the `(word, tag)` pairs whose tag starts with 'V'.

  Args:
    pos_tags: iterable of indexable pairs with the POS tag at position 1.

  Returns:
    A list of the matching pairs, in their original order.
  """
  return [tagged for tagged in pos_tags if tagged[1].startswith('V')]

In [None]:
def del_relatives(sent):
  """Cut `sent` just before its first wh-word.

  The sentence is split on whitespace and tagged with `nltk.pos_tag`; the
  first word tagged WRB, WDT or WP and everything after it are dropped.

  Returns:
    The kept words joined by single spaces, or `sent` unchanged when no such
    tag is found.
  """
  words = sent.split()
  wh_tags = ('WRB', 'WDT', 'WP')
  for position, tagged in enumerate(nltk.pos_tag(words)):
    if tagged[1] in wh_tags:
      return ' '.join(words[:position])
  return sent

In [None]:
def clean_question(sent):
  """Remove parenthesised single words such as "( Johnny )" from `sent`.

  Only parentheses holding at most one word (optionally padded with
  whitespace) are removed; the text around them is left as is.

  Args:
    sent: question text.

  Returns:
    The text with every such parenthesis deleted.
  """
  return re.sub(r'\(\s*\w*\s*\)', '', sent)

In [None]:
def filter_answer(answer):
  """Drop a leading preposition from `answer`.

  The answer is split on whitespace and tagged with `nltk.pos_tag`; when the
  first word is tagged 'IN' it is removed.

  Args:
    answer: answer text.

  Returns:
    The remaining words joined by single spaces when a leading 'IN' word was
    removed; otherwise `answer` unchanged. `answer` is also returned
    unchanged when it is not a string, has no words, or tagging fails.
  """
  try:
    tokens = answer.split()
  except AttributeError:
    # Not a string: nothing to filter.
    return answer

  if not tokens:
    return answer

  try:
    pos_tags = nltk.pos_tag(tokens)
  except Exception:
    # Best effort: a tagger failure leaves the answer untouched.
    return answer

  if pos_tags[0][1] == 'IN':
    return ' '.join(tokens[1:])
  return answer

In [None]:
def clean_text(text):
  """Trim whitespace and edge punctuation from `text`.

  After an initial strip, every leading ',', ';' or ':' is removed, then
  every trailing one, and the result is stripped again. Punctuation
  separated from the edge by whitespace is not removed.
  """
  edge_marks = ',;:'
  trimmed = text.strip()
  trimmed = trimmed.lstrip(edge_marks)
  trimmed = trimmed.rstrip(edge_marks)
  return trimmed.strip()

In [None]:
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG, PAST

In [None]:
def extra_check_person(x, q_wh):
  """Return 'who' when `x` is a personal pronoun, otherwise `q_wh`.

  `x` is lower-cased and stripped before being compared with i, she, he,
  you, we, they, me, him, her and us.
  """
  personal_pronouns = ('i', 'she', 'he', 'you', 'we', 'they', 'me', 'him', 'her', 'us')
  return 'who' if x.lower().strip() in personal_pronouns else q_wh

In [None]:
def convert_to_object(pronoun):
  """Map a subject pronoun to its object form.

  The lookup is case-insensitive (i, you, he, she, we, it, they). Any other
  input is returned unchanged, with its original casing.
  """
  object_forms = dict(
      i='me',
      you='you',
      he='him',
      she='her',
      we='us',
      it='it',
      they='them',
  )
  return object_forms.get(pronoun.lower(), pronoun)

In [None]:
def find_main_verb_pos(verb_idx):
  """Look up POS information for the syntactic head of a verb.

  Reads the module-level `doc` and `pos_tags`.
  NOTE(review): `doc[verb_idx]` must expose `.text` and `.head`, and
  `pos_tags` must hold indexable (word, tag) pairs -- confirm what the
  caller binds to these globals.

  Args:
    verb_idx: index of the verb in `doc`.

  Returns:
    `pos_tags[verb_idx]` when the token's text equals its head's text.
    Otherwise the tag (position 1) of the first `pos_tags` entry whose word
    equals the head's text, or None when there is none.
    NOTE(review): the first branch returns a whole entry and the second only
    a tag -- verify which one callers expect.
  """
  head_text = doc[verb_idx].head.text
  if doc[verb_idx].text == head_text:
    return pos_tags[verb_idx]
  return next(
      (tagged[1] for tagged in pos_tags if head_text == tagged[0]),
      None,
  )

In [None]:
def has_aux(sent, verb_idx):
  """Return the token index of an auxiliary attached to a verb, if any.

  The sentence is parsed with `nlp`. The search order is:
    1. a child of the verb whose dependency label contains 'aux';
    2. an 'aux' child of one of the verb's 'advcl' children;
    3. when the verb's head has the dependency label 'advcl', an 'aux' child
       of that head.

  Args:
    sent: sentence text.
    verb_idx: index of the verb token in the parsed sentence.

  Returns:
    The auxiliary's token index, or None when none is found.
  """
  doc_ = nlp(sent)

  for child in doc_[verb_idx].children:
    if 'aux' in child.dep_:
      return child.i
    elif 'advcl' in child.dep_:
      for child_ in child.children:
        if 'aux' in child_.dep_:
          return child_.i

  head = doc_[verb_idx].head
  if 'advcl' == head.dep_:
    for child in head.children:
      if 'aux' in child.dep_:
        return child.i

  return None

In [None]:
def get_main_verb_idx(verb_idx, sent):
  """Return the index of the word that carries the tense for a verb.

  Resolution order:
    1. whatever `has_aux` reports for the verb, when it reports anything;
    2. `verb_idx` itself when the verb's text equals its head's text;
    3. the index of the first entry of the module-level `pos_tags` whose
       word equals the head's text and whose spaCy token at the same
       position is a VERB or AUX;
    4. `verb_idx` as the fallback.

  NOTE(review): step 3 reads the global `pos_tags` and assumes it is aligned
  token-for-token with `nlp(sent)` -- confirm against the caller.

  Args:
    verb_idx: index of the verb token in the parsed sentence.
    sent: sentence text.

  Returns:
    The resolved index (see above).
  """

  def find_in_doc(doc, word_text):
    for word_idx, word in enumerate(doc):
      if word_text == word[0] and doc_[word_idx].pos_ in ('VERB', 'AUX'):
        return word_idx
    return None

  doc_ = nlp(sent)
  aux_idx = has_aux(sent, verb_idx)
  # Compare with None: index 0 is a valid position and must not be dropped.
  if aux_idx is not None:
    return aux_idx

  if doc_[verb_idx].text == doc_[verb_idx].head.text:
    return verb_idx

  head_idx = find_in_doc(pos_tags, doc_[verb_idx].head.text)
  return verb_idx if head_idx is None else head_idx

In [None]:
def get_non_trivial_coref_(cluster_idx):
  """Pick a random non-pronoun mention from one coreference cluster.

  Reads the module-level `chars_spans`; the mention text is the first item
  of each entry. Every non-pronoun mention is printed as it is collected.

  Returns:
    One of the non-pronoun mentions chosen by `sample`, or '' when the
    cluster has none.
  """
  main_pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'its', 'itself', 'himself', 'herself', 'myself', 'yourself', 'themselves']
  mention_texts = [entry[0] for entry in chars_spans[cluster_idx]]
  kept = []
  for text in mention_texts:
    if text.lower() in main_pronouns:
      continue
    print(text)
    kept.append(text)
  return sample(kept) if len(kept) > 0 else ''

In [None]:
def get_non_trivial_coref(corefs):
  """Pick a random mention that is not a pronoun.

  Args:
    corefs: iterable of mention strings.

  Returns:
    One of the non-pronoun mentions chosen by `sample`, or '' when every
    mention is a pronoun (or `corefs` is empty).
  """
  main_pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'its', 'itself', 'himself', 'herself', 'myself', 'yourself', 'themselves']
  kept = [text for text in corefs if text.lower() not in main_pronouns]
  if not kept:
    return ''
  return sample(kept)


def get_trivial_coref(corefs):
  """Return the first mention that is a basic pronoun.

  The pronouns are i, you, he, she, it, we, they and its, compared
  case-insensitively; the mention is returned with its original casing.
  Returns '' when no mention qualifies.
  """
  main_pronouns = ('i', 'you', 'he', 'she', 'it', 'we', 'they', 'its')
  return next((text for text in corefs if text.lower() in main_pronouns), '')

In [None]:
def simple_pronoun(ent):
  """Return True when `ent`, lower-cased and stripped, is a listed pronoun.

  The list is i, you, it, he, she, we, they, me, him, her, our, them.
  """
  known_pronouns = ('i', 'you', 'it', 'he', 'she', 'we', 'they', 'me', 'him', 'her', 'our', 'them')
  return ent.lower().strip() in known_pronouns

In [None]:
def make_agent_in_the_right_form(agent):
  """Turn an agent phrase into a form usable as a subject.

  The phrase is stripped; a leading 'by' or 'with' (lower case, followed by
  a space or the end of the phrase) is removed; and an object pronoun
  (him, her, them, me, us -- any casing) is replaced by its subject form.

  Args:
    agent: agent text, e.g. 'by him'.

  Returns:
    The subject pronoun when the remaining text is an object pronoun,
    otherwise the remaining text itself.
  """
  transforms = {
      'him': 'he',
      'her': 'she',
      'them': 'they',
      'me': 'i',
      'us': 'we'
  }
  agent = agent.strip()
  # str.split(' ') always yields at least one item, so index 0 is safe.
  agent_tokens = agent.split(' ')
  if agent_tokens[0] in ['by', 'with']:
    agent = ' '.join(agent_tokens[1:])

  return transforms.get(agent.lower(), agent)

In [None]:
def filter_aps(word):
  """Return True when `word` (any casing) is a banned word.

  The banned words are when, where, what, who, from and to.
  """
  banned_words = ('when', 'where', 'what', 'who', 'from', 'to')
  return word.lower() in banned_words

In [None]:
def has_aux(sent, verb_idx):
  """Return the token index of the auxiliary that goes with a verb, if any.

  The sentence is parsed with `nlp`. The search order is:
    1. a child of the verb whose dependency label contains 'aux';
    2. an 'aux' child of one of the verb's 'advcl' children;
    3. when the verb itself is an 'advcl', an 'aux' child of its head;
    4. the verb's head, when its part of speech is AUX;
    5. the index reported by `find_remained(sent)`.

  Args:
    sent: sentence text.
    verb_idx: index of the verb token in the parsed sentence.

  Returns:
    The auxiliary's token index, or None when none is found.
  """
  doc_ = nlp(sent)
  verb = doc_[verb_idx]
  for child in verb.children:
    if 'aux' in child.dep_:
      return child.i
    elif 'advcl' in child.dep_:
      for child_ in child.children:
        if 'aux' in child_.dep_:
          return child_.i
  head = verb.head
  if 'advcl' == verb.dep_:
    for child in head.children:
      if 'aux' in child.dep_:
        return child.i

  if head.pos_ == 'AUX':
    # Every path of this function returns an index; the token text is what
    # `get_aux` returns.
    return head.i

  last_try = find_remained(sent)
  if last_try is not None:
    return last_try[0]
  return None

In [None]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import conll2000

# Ensure necessary NLTK data files are downloaded
# Every resource named here is also requested by the setup cell at the top of
# the notebook, so this cell repeats those requests.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('conll2000')

In [None]:
def get_aux(sent, verb_idx):
  """Return the text of the auxiliary that goes with a verb, if any.

  The sentence is parsed with `nlp`. The search order is:
    1. a child of the verb whose dependency label contains 'aux';
    2. an 'aux' child of one of the verb's 'advcl' children;
    3. when the verb itself is an 'advcl', an 'aux' child of its head;
    4. the verb's head, when its part of speech is AUX;
    5. the text reported by `find_remained(sent)`.

  Args:
    sent: sentence text.
    verb_idx: index of the verb token in the parsed sentence.

  Returns:
    The auxiliary's text, or None when none is found.
  """
  doc_ = nlp(sent)
  verb = doc_[verb_idx]
  for child in verb.children:
    if 'aux' in child.dep_:
      return child.text
    elif 'advcl' in child.dep_:
      for child_ in child.children:
        if 'aux' in child_.dep_:
          return child_.text

  head = verb.head
  if 'advcl' == verb.dep_:
    for child in head.children:
      if 'aux' in child.dep_:
        return child.text

  if head.pos_ == 'AUX':
    return head.text

  last_try = find_remained(sent)
  if last_try is not None:
    return last_try[1]
  return None

In [48]:
def find_remained(sent):
  """Find an auxiliary attached to a past participle anywhere in `sent`.

  The sentence is parsed with `nlp`. For the first token that is a VERB
  tagged VBN and has a child whose dependency label contains 'aux' (any
  casing), that child is reported.

  Args:
    sent: sentence text.

  Returns:
    `(index, text)` of the auxiliary, or None when there is none.
  """
  for token in nlp(sent):
    if token.pos_ == 'VERB' and token.tag_ == 'VBN':
      for child in token.children:
        if 'AUX' in child.dep_.upper():
          return child.i, child.text
  return None

In [49]:
def print_pos(text):
  """Print the NLTK part-of-speech tags of `text` as a list of pairs."""
  tokens = word_tokenize(text)
  tagged = nltk.pos_tag(tokens)
  print(tagged)

In [50]:
def find_IN(text):
  """Return the first word of `text` that NLTK tags 'IN', or None."""
  tagged = nltk.pos_tag(word_tokenize(text))
  return next((pair[0] for pair in tagged if pair[1] == 'IN'), None)

In [51]:
def print_srl(text, idx):
  """Print the 'description' of the `idx`-th verb frame SRL finds in `text`."""
  verb_frames = SRL_predictor.predict(text)['verbs']
  print(verb_frames[idx]['description'])

In [52]:
# Module-level scratch variable, set to None here. Several functions in this
# notebook use a local variable with the same name.
A = None

In [53]:
def get_non_trivial_coref(corefs):
  """Return the first mention that is not a pronoun.

  Args:
    corefs: iterable of mentions. Each mention is either the mention text
      itself (a string) or an entry whose first item is the text, such as
      the `[text, sent_idx, start, end]` entries of `chars_spans`.

  Returns:
    The text of the first non-pronoun mention, or '' when there is none.
  """
  main_pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'its', 'itself', 'himself', 'herself', 'myself', 'yourself', 'themselves']
  for coref in corefs:
    # Indexing a plain string would yield only its first character.
    text = coref if isinstance(coref, str) else coref[0]
    if text.lower() not in main_pronouns:
      return text
  return ''

In [54]:
# Bare expression: shows the current value of `A` as this cell's output.
A

In [55]:
def filter_specific(srs):
  """Return False when any item of `srs` is who, self, which or that.

  The comparison is exact (case-sensitive). An empty `srs` gives True.
  """
  blocked = ('who', 'self', 'which', 'that')
  return not any(sr in blocked for sr in srs)

In [56]:
def past_sr(sent):
  """Return True when NLTK tags at least one word of `sent` as 'VBD'."""
  tagged = nltk.pos_tag(word_tokenize(sent))
  return any(pair[1] == 'VBD' for pair in tagged)

In [57]:
def find_extra_verb(sent, verb_idx):
  """Return the governing verb of a clausal-complement verb, if any.

  The sentence is parsed with `nlp`. The head's text is returned when all of
  these hold for the token at `verb_idx`:
    * its dependency label is 'xcomp' or 'ccomp';
    * its head's part of speech is VERB;
    * it is tagged VBG, or one of its children has 'aux' in its dependency
      label (any casing).

  Returns:
    The head's text, or '' when the conditions are not met.
  """
  parsed = nlp(sent)
  verb = parsed[verb_idx]
  has_aux_child = any('aux' in child.dep_.lower() for child in verb.children)

  is_complement = verb.dep_ in ('xcomp', 'ccomp')
  if is_complement and verb.head.pos_ == 'VERB' and (verb.tag_ == 'VBG' or has_aux_child):
    return verb.head.text
  return ''

In [58]:
def has_past(sent):
  """Check the NLTK tags of `sent` for a 'VBD' or a non-"be" 'VBZ' word.

  Returns:
    True when some word is tagged 'VBD', or is tagged 'VBZ' and is not one of
    'is', 'are', 'am'; False otherwise.
  """
  present_be = ['is', 'are', 'am']
  for tagged in nltk.pos_tag(word_tokenize(sent)):
    word, tag = tagged[0], tagged[1]
    if tag == 'VBD':
      return True
    if tag == 'VBZ' and word not in present_be:
      return True
  return False

In [59]:
# determines the singularity or plurality of a noun
def is_single(noun):
  """Determine whether a noun phrase is singular.

  A phrase containing the word 'and' is treated as plural. The word must
  stand alone, so names like "Sandy" or "husband" are not affected.
  Otherwise the phrase is singular when `inflect.singular_noun` returns
  False for it.

  Args:
    noun: noun or noun phrase.

  Returns:
    True for singular, False for plural.
  """
  if re.search(r'\band\b', noun):
    return False

  return inflect.singular_noun(noun) is False

In [60]:
def det_wh_based_on_ent(ent):
  """Choose a question word from an entity tag.

  Returns 'who' when the tag contains 'PER', 'where' when it contains 'LOC'
  (and not 'PER'), and 'what' for everything else.
  """
  if 'PER' in ent:
    return 'who'
  if 'LOC' in ent:
    return 'where'
  return 'what'

In [61]:
def has_noun_ancestor(aswer_ents, doc, idx):
  """Pick a question word from the highest noun above a token.

  Starting at `doc[idx]`, the chain of heads is followed up to the token
  whose dependency label is 'ROOT'. The index of the last token on that path
  (root included) whose tag contains 'NN' is remembered; when there is none,
  `idx` is used.

  Args:
    aswer_ents: entity tags, indexed by token position.
    doc: parsed answer whose tokens expose `dep_`, `tag_`, `i` and `head`.
    idx: index of the starting token.

  Returns:
    `det_wh_based_on_ent` applied to the entity tag at the remembered index.
  """
  token = doc[idx]
  noun_idx = idx
  while token.dep_ != 'ROOT':
    if 'NN' in token.tag_:
      noun_idx = token.i
    token = token.head
  # The root itself takes precedence when it is a noun.
  if 'NN' in token.tag_:
    noun_idx = token.i
  return det_wh_based_on_ent(aswer_ents[noun_idx])

In [62]:
def sample(x):
  """Return one element of `x`, drawn with `random.sample`."""
  drawn = random.sample(x, k=1)
  return drawn[0]

In [63]:
def find_wh_question(ent_types, srl_type, aswer_ents, answer):
  """Choose the question word (who / where / what) for an answer phrase.

  The answer is parsed with `nlp` and its tokens are paired with
  `aswer_ents`.
    1. If a ROOT token's entity tag contains PER, LOC or ORG, the result is
       'who', 'where' or 'what' respectively.
    2. Otherwise, for the first token whose tag contains LOC, PER or ORG,
       the result comes from `has_noun_ancestor`.
    3. Otherwise the result is 'what'.

  Args:
    ent_types: not used by this function.
    srl_type: not used by this function.
    aswer_ents: entity tags for the tokens of `answer`.
    answer: answer text.

  Returns:
    The chosen question word.
  """
  parsed = nlp(answer)
  root_choices = (('PER', 'who'), ('LOC', 'where'), ('ORG', 'what'))

  for ent, word in zip(aswer_ents, parsed):
    if word.dep_ != 'ROOT':
      continue
    for marker, wh_word in root_choices:
      if marker in ent:
        return wh_word

  for word_idx, pair in enumerate(zip(aswer_ents, parsed)):
    ent = pair[0]
    if any(marker in ent for marker in ('LOC', 'PER', 'ORG')):
      return has_noun_ancestor(aswer_ents, parsed, word_idx)

  return 'what'

In [64]:
# For every cluster of `chars_spans`:
#   clusters_person[i]       -- True when the cluster contains a personal
#                               pronoun (he, she, her, him, i, my, his, hers)
#   cluster_trivial_coref[i] -- the subject pronoun that stands for the
#                               cluster ('' when it has no listed pronoun)
clusters_person = []
cluster_trivial_coref = []

for cluster_idx, cluster in enumerate(chars_spans):
  # Named `cluster_is_person` so the function `is_person` is not overwritten.
  cluster_is_person = False
  trivial_coref = ''
  for coref in cluster:
    mention = coref[0].lower()
    if mention in ['he', 'she', 'her', 'him', 'i', 'my', 'his', 'hers']:
      cluster_is_person = True

    # Checked for every mention so the we / it / they cases can be reached.
    if mention in ['he', 'him', 'his']:
      trivial_coref = 'he'
    if mention in ['she', 'her', 'hers']:
      trivial_coref = 'she'
    if mention in ['we', 'us', 'our', 'ours']:
      trivial_coref = 'we'
    if mention in ['it', 'its']:
      trivial_coref = 'it'
    if mention in ['they', 'them', 'their', 'theirs']:
      trivial_coref = 'they'

  clusters_person.append(cluster_is_person)
  cluster_trivial_coref.append(trivial_coref)

In [65]:
def give_new_ents(arg, arg_indexes, arg_cluster_idx, ents):
  """Return the entity tags covering an argument, with person corefs marked.

  Works on a deep copy of `ents`, so the caller's list is never modified.
  When `arg_cluster_idx` is not None and that cluster is flagged in the
  module-level `clusters_person`, every mention of the cluster (taken from
  the module-level `chars_spans`) that lies in the current sentence
  (module-level `sent_idx`) is re-tagged as a person: 'U-PER' for a
  one-token mention, otherwise 'B-PER', 'I-PER' ..., 'L-PER'.

  Args:
    arg: text of the argument.
    arg_indexes: [start, end] inclusive token indexes of the argument.
    arg_cluster_idx: index into `chars_spans` / `clusters_person`, or None.
    ents: per-token entity tags of the sentence.

  Returns:
    The tags for tokens start..end. If that slice holds a single tag while
    `arg` has several space-separated words, the tag is repeated once per
    word.
  """
  all_ents = deepcopy(ents)
  if arg_cluster_idx is not None:
    for coref in chars_spans[arg_cluster_idx]:
      # coref is indexed as (text, sentence index, first token, last token).
      if coref[1] != sent_idx or not clusters_person[arg_cluster_idx]:
        continue
      first, last = coref[2], coref[3]
      if first == last:
        all_ents[first] = 'U-PER'
      else:
        all_ents[first] = 'B-PER'
        # Tag every token of the span, not just the first one.
        for jdx in range(first + 1, last):
          all_ents[jdx] = 'I-PER'
        all_ents[last] = 'L-PER'

  arg_ents = all_ents[arg_indexes[0]: arg_indexes[1] + 1]
  word_count = len(arg.split(' '))
  if len(arg_ents) == 1 and word_count > 1:
    return arg_ents * word_count
  return arg_ents

In [66]:
def get_corefs(arg, arg_indexes, coref_indexes, cluster_idx, mode):
  """Splice a replacement mention into the text of an argument.

  `arg` is tokenised with `word_tokenize`; the mention boundaries in
  `coref_indexes` are shifted by `arg_indexes[0]` so that they are relative
  to the argument. The replacement text comes from `get_non_trivial_coref`
  (mode 'non_trivial') or `get_trivial_coref` (mode 'trivial'), each fed
  with the surface texts of cluster `cluster_idx` in `chars_spans`.

  Returns the tokens up to and including the relative start, the replacement
  tokens, and the tokens from the relative end onwards, joined by spaces.
  Any other `mode` leaves the replacement undefined (UnboundLocalError).

  NOTE(review): both boundary tokens of the original mention are retained
  around the replacement - confirm this is intended and not an off-by-one.
  """
  tokens = word_tokenize(arg)

  # Mention boundaries expressed relative to the start of the argument.
  offset = arg_indexes[0]
  rel_start, rel_end = coref_indexes[0] - offset, coref_indexes[1] - offset

  if mode == 'non_trivial':
    new_coref = get_non_trivial_coref([mention[0] for mention in chars_spans[cluster_idx]])
  elif mode == 'trivial':
    new_coref = get_trivial_coref([mention[0] for mention in chars_spans[cluster_idx]])

  replacement = word_tokenize(new_coref)
  return ' '.join(tokens[:rel_start + 1] + replacement + tokens[rel_end:])

In [67]:
def not_suitable(text):
  """Return True when `text`, ignoring case, is one of a fixed set of bare
  function / question words ('to', 'when', 'where', ...), else False."""
  unsuitable_words = ('to', 'when', 'where', 'who', 'the', 'what', 'from', 'in', 'at')
  return text.lower() in unsuitable_words

In [68]:
# Chunk grammar (passed to nltk.RegexpParser) with a single chunk type, VP:
# an optional modal (MD), one verb tag, any run of RB / RP / DT / IN / PRP /
# JJ tags, then any number of further verb tags.
grammar = r"""
    VP: {<MD>?<VB.*><RB|RP|DT|IN|PRP|JJ>*<VB.*|VBN|VBG>*}
"""

def extract_verb_auxiliaries(sentence, target_verb):
    """Return the verb-like words that share a VP chunk with `target_verb`.

    The sentence is tokenised and POS-tagged with NLTK, then chunked with the
    module-level `grammar`. In the first VP chunk that contains `target_verb`,
    every other word tagged MD / VBZ / VBD / VBG / VBN / VBP is collected.

    Returns the collected words joined by spaces, or '' when no VP chunk
    contains `target_verb`.

    NOTE(review): the plain 'VB' tag is not in the accepted list, so a
    base-form auxiliary would be skipped - confirm this is intended.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    chunk_tree = nltk.RegexpParser(grammar).parse(tagged_tokens)

    accepted_tags = ('MD', 'VBZ', 'VBD', 'VBG', 'VBN', 'VBP')
    for node in chunk_tree:
        if not (isinstance(node, Tree) and node.label() == 'VP'):
            continue
        leaves = node.leaves()
        if target_verb not in [token for token, _tag in leaves]:
            continue
        # Only the first VP chunk holding the target verb is considered.
        return ' '.join(token for token, tag in leaves
                        if tag in accepted_tags and token != target_verb)

    return ''

# Example usage: run the extractor on a sample sentence and print the result.
# NOTE(review): the recorded output is "would been" - "have" is missing,
# presumably because it is tagged 'VB', which the function's tag filter omits.
sentence = "He would have been performing it"
target_verb = "performing"
auxiliary_verbs = extract_verb_auxiliaries(sentence, target_verb)
print("Auxiliary Verbs for '{}':".format(target_verb), auxiliary_verbs)

Auxiliary Verbs for 'performing': would been


In [69]:
def get_sentence_root(sentence):
    """Parse `sentence` with the module-level `nlp` pipeline and return the
    first token whose dependency label is 'ROOT'.

    Returns None when no token carries that label.
    """
    parsed = nlp(sentence)
    return next((token for token in parsed if token.dep_ == 'ROOT'), None)

# Unit Test Agent & Patient

In [70]:
# NLTK resources for the cells below. `nltk` is already imported, and
# 'averaged_perceptron_tagger' / 'omw-1.4' are already downloaded, in the
# setup cell at the top of the notebook; 'wordnet' is the only new download.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

True

In [71]:
def get_verb_lemma(verb):
  """Return the lemma (base form) of `verb` via pattern's `lemma`.

  The first `lemma` call is allowed to fail once: on an ordinary exception
  `import_error()` is invoked and the call is retried. A failure of the
  retry propagates to the caller. KeyboardInterrupt and SystemExit are never
  intercepted, so interrupting the kernel does not trigger a retry.
  """
  try:
    return lemma(verb)
  except Exception:
    # NOTE(review): presumably a workaround for pattern.en failing on its
    # first use in a fresh kernel - confirm.
    import_error()
    return lemma(verb)

def plural_pronoun(pronoun):
    """Classify a personal pronoun as plural (True) or singular (False).

    Matching ignores case. A pronoun outside the two known groups
    (for example 'you') yields None.
    """
    is_plural_by_pronoun = {
        'i': False, 'me': False, 'she': False, 'her': False,
        'he': False, 'him': False, 'it': False,
        'we': True, 'us': True, 'they': True, 'them': True,
    }
    return is_plural_by_pronoun.get(pronoun.lower())


def is_plural(sent, word_idx):
    """Tell whether the token at `word_idx` of `sent` is grammatically plural.

    `sent` is tokenised and POS-tagged with NLTK, and `word_idx` indexes that
    token list.

    Returns True for plural nouns (NNS, NNPS), False for singular nouns
    (NN, NNP), the result of `plural_pronoun` for personal pronouns (PRP),
    and None for any other tag.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    token, tag = tagged_tokens[word_idx][0], tagged_tokens[word_idx][1]

    if tag in ('NNS', 'NNPS'):
        return True
    if tag in ('NN', 'NNP'):
        return False
    if tag == 'PRP':
        return plural_pronoun(token)
    return None

In [72]:
# Smoke test: asks whether token 0 ('troubles') is plural; recorded result is True.
is_plural('troubles want us', 0)

True

In [73]:
# TODO: the do / does discrimination below is a heuristic and needs more work.
def get_q_ext(words, verb_indexes):
  """Pick the do-support auxiliary ('do' / 'does' / 'did') for a question.

  Uses the module-level `nltk` tagger and `inflect` engine.

  Args:
    words: tokens of the sentence.
    verb_indexes: token indexes of the verb span; only the first is used.

  Returns:
    'did' when the verb is tagged VBD or VBN, and for any tag not listed
    below. For VB / VBZ / VBG / VBP / MD: 'does' when
    `inflect.singular_noun` can singularise the verb token (i.e. it looks
    like an -s form), otherwise 'do'. When the verb token cannot be located
    (empty or missing `verb_indexes`), the tag defaults to 'VB' and the
    result is 'do'.
  """
  pos_tags = nltk.pos_tag(words)

  try:
    verb_idx = verb_indexes[0]
    verb_pos = pos_tags[verb_idx][1]
    # Take the verb from the sentence itself rather than from a notebook
    # global, so the function works regardless of surrounding cell state.
    verb_word = words[verb_idx]
  except (IndexError, TypeError):
    verb_pos = 'VB'
    verb_word = None

  if verb_pos in ('VBD', 'VBN'):
    return 'did'
  if verb_pos in ('VB', 'VBZ', 'VBG', 'VBP', 'MD'):
    # singular_noun() returns False when the word is not a plural-looking form.
    if verb_word is None or inflect.singular_noun(verb_word) is False:
      return 'do'
    return 'does'
  return 'did'

In [74]:
def make_final_text(string):
  """Detokenise a space-separated token string into readable text.

  The input is split on whitespace and re-joined with single spaces, except
  that no space is inserted
    * before a token starting with ' , n't , . : ? ! ; - or ) ;
    * after a token made up only of '(' or only of '-' characters;
    * right after an opening double quote and right before a closing one.
  Double quotes are recognised only as standalone '"' tokens; odd-numbered
  ones are treated as opening and even-numbered ones as closing.

  A word is glued to the previous token only when that token is a bare
  bracket or dash, so text that is already detokenised (e.g. "(always
  ending") keeps its spaces.

  Returns the joined text with surrounding whitespace stripped ('' for
  empty input).
  """
  glue_before = ("'", "n't", ',', '.', ':', '?', '!', ';', '-', ')')
  pieces = []
  prev_word = ''
  quote_count = 0
  for word in string.split():
    if word == '"':
      quote_count += 1

    if word.startswith(glue_before):
      attach = True
    elif prev_word and not prev_word.strip('-'):
      # Previous token is a bare hyphen / dash.
      attach = True
    elif prev_word and not prev_word.strip('('):
      # Previous token is a bare opening bracket.
      attach = True
    elif prev_word == '"' and quote_count % 2 == 1:
      # First word inside a quotation.
      attach = True
    elif word == '"' and quote_count % 2 == 0:
      # Closing quote.
      attach = True
    else:
      attach = False

    pieces.append(word if attach else ' ' + word)
    prev_word = word
  return ''.join(pieces).strip()

In [75]:
# Map every cluster index to the de-duplicated list of its mention names.
idx_2_cluster = {
    idx: list(set(cluster_name))
    for idx, cluster_name in enumerate(cluster_names)
}

In [76]:
def get_pronoun(pns, simple=True):
  """Choose how to refer to an entity, given all the names of its cluster.

  Relies on the module-level list `pronouns` (lower-case basic pronouns);
  names are compared against it case-insensitively.

  Args:
    pns: list of candidate names / pronouns for the entity.
    simple: when truthy, prefer a basic pronoun; when falsy, prefer a
      descriptive (non-pronoun) name.

  Returns:
    The chosen name, lower-cased. With `simple` truthy: the first entry of
    `pns` that is a basic pronoun, else a random entry. With `simple` falsy:
    a random non-pronoun entry, or a random entry when `pns` holds nothing
    but pronouns.

  Raises:
    ValueError: if `pns` is empty (raised by `random.sample`).
  """
  def is_basic_pronoun(name):
    # 'He' and 'he' are the same pronoun.
    return name.lower() in pronouns

  if simple:
    for pn in pns:
      if is_basic_pronoun(pn):
        return pn.lower()
    pool = pns
  else:
    # Fall back to the full list rather than sampling from an empty one.
    pool = [pn for pn in pns if not is_basic_pronoun(pn)] or pns

  return random.sample(pool, k=1)[0].lower()

In [77]:
def get_aux(doc, verb_idx):
    """Collect the auxiliaries and negation attached to the verb at `verb_idx`.

    Walks the direct dependency children of `doc[verb_idx]` in order:
      * children labelled 'aux' / 'auxpass' are kept as separate words;
      * a contracted negation (text containing "n't") is glued onto the
        preceding auxiliary, or kept as its own word when there is none;
      * a standalone 'not' is kept as its own word.

    Returns the collected words joined by single spaces, or '' when the verb
    has no auxiliary or negation child.
    """
    aux_verbs = []
    for child in doc[verb_idx].children:
        if child.dep_ in ('aux', 'auxpass'):
            aux_verbs.append(child.text)
        elif child.dep_ == 'neg':
            if "n't" in child.text:
                if aux_verbs:
                    aux_verbs[-1] += child.text
                else:
                    # No auxiliary precedes the contraction (e.g. the verb
                    # itself is the copula): avoid indexing an empty list.
                    aux_verbs.append(child.text)
            elif child.text == 'not':
                aux_verbs.append(child.text)

    return ' '.join(aux_verbs)

In [78]:
# Sample sentence (and its spaCy parse) used to try out `get_aux` by hand.
# NOTE(review): this rebinds the names `sent` / `sent_doc`, which the main
# question-generation loop also assigns.
sent = '''Chuck Lorre originally intended Johnny Galecki to play the role, but Galecki thought he would be "better suited" for the character of Leonard.'''
sent_doc = nlp(sent)

In [79]:
# Auxiliaries of the token at index 3 (presumably "intended" - confirm the
# tokenisation); the recorded result is '' (no auxiliary found).
get_aux(sent_doc, 3)

''

In [80]:
# Agent & patient question generation.
# For every sentence and every predicate found by the semantic-role labeller,
# build a "<wh> <aux> <agent> <verb> ...?" question whose answer is the
# patient (ARG1). Results are accumulated in three lists:
#   agent_list        -> [agent text, agent cluster index] pairs
#   Qs                -> the question parts, re-used later to rebuild questions
#   Agent_Patient_Qs  -> the final question / answer plus debugging details
agent_list = []
Qs = []

Agent_Patient_Qs = []
nidx = 0  # running number of the generated questions (used in the printout)

for sent_idx, sent in enumerate(sents):
  # Redundant re-assignment: the loop variable already holds sents[sent_idx].
  sent = sents[sent_idx]
  sent_semantic = sents_semantic_roles[sent_idx]
  words = sent_semantic['words']
  entities = extract_entities(sent)
  all_ents = NER_predictor.predict(sent)['tags']  # per-token NER tags
  sent_doc = nlp(sent)

  # One iteration per predicate (verb frame) of the sentence.
  for idx in range(len(sent_semantic['verbs'])):
    semantic_tags = sent_semantic['verbs'][idx]['tags']
    # NOTE(review): `pos_tags` is not read again inside this loop.
    pos_tags = nltk.pos_tag(words)

    # Locate every semantic role of this predicate. Each find_arg result is
    # unpacked as (text, found-flag, token indexes).
    ext, ext_exists, ext_indexes = find_arg(words, semantic_tags, 'EXT')
    time, time_exists, time_indexes = find_arg(words, semantic_tags, 'TMP')
    manner, manner_exists, manner_indexes  = find_arg(words, semantic_tags, 'MNR')
    agent, agent_exists, agent_indexes = find_arg(words, semantic_tags, 'ARG0')
    patient, patient_exists, patient_indexes = find_arg(words, semantic_tags, 'ARG1')
    state, state_exists, state_indexes = find_arg(words, semantic_tags, 'ARG2')
    arg3, arg3_exists, arg3_indexes = find_arg(words, semantic_tags, 'ARG3')
    arg4, arg4_exists, arg4_indexes = find_arg(words, semantic_tags, 'ARG4')
    cause, cause_exists, cause_indexes = find_arg(words, semantic_tags, 'CAU')
    loc, loc_exists, loc_indexes = find_arg(words, semantic_tags, 'LOC')
    adv, adv_exists, adv_indexes = find_arg(words, semantic_tags, 'ADV')
    verb, verb_exists, verb_indexes = find_arg(words, semantic_tags, '-V')

    # Look up the coreference cluster (if any) that each role belongs to.
    ext_corefs, ext_cluster_idx = find_corresponding_coref(ext_indexes, sent_idx)
    time_corefs, time_cluster_idx = find_corresponding_coref(time_indexes, sent_idx)
    manner_corefs, manner_cluster_idx = find_corresponding_coref(manner_indexes, sent_idx)
    agent_corefs, agent_cluster_idx = find_corresponding_coref(agent_indexes, sent_idx)
    patient_corefs, patient_cluster_idx = find_corresponding_coref(patient_indexes, sent_idx)
    state_corefs, state_cluster_idx = find_corresponding_coref(state_indexes, sent_idx)
    cause_corefs, cause_cluster_idx = find_corresponding_coref(cause_indexes, sent_idx)
    loc_corefs, loc_cluster_idx = find_corresponding_coref(loc_indexes, sent_idx)

    # Nothing can be asked about a frame without a verb.
    if len(verb_indexes) == 0:
      continue

    verb_index = verb_indexes[0]


    # Entity types of each role, in the same order as the index lists passed in.
    semantic_role_ent_type = find_semantic_ent_type(entities,
                             [ext_indexes,
                              time_indexes,
                              manner_indexes,
                              agent_indexes,
                              patient_indexes,
                              state_indexes,
                              cause_indexes,
                              loc_indexes])


    ext_ent_types, time_ent_types, manner_ent_types, agent_ent_types, \
    patient_ent_types, state_ent_types, cause_ent_types, loc_ent_types = semantic_role_ent_type

    # Basic pronouns and forms of "to be". `pronouns` is a module-level name
    # here and is also read by `get_pronoun`.
    pronouns = ['i', 'you', 'he', 'she', 'we', 'they']
    to_be_s = ['am', 'are', 'is', 'was', 'were']

    extra_verb = find_extra_verb(sent, verb_index)
    verb_pos = sent_doc[verb_index].tag_
    old_verb = verb  # surface form, kept before `verb` is lemmatised below

    q_ext = ''

    # Only frames that have both an agent and a patient produce a question.
    if agent_exists and patient_exists:

      agent_index = agent_indexes[0]
      patient_index = patient_indexes[0]

      # Auxiliaries / negation attached to the verb in the dependency parse.
      aux = get_aux(sent_doc, verb_index)

      # Skip verbs whose only auxiliary is "to".
      if aux == 'to':
        continue

      # Choose the do-support auxiliary from the verb tag and lemmatise the verb.
      if verb_pos == 'VBD':
        q_ext = 'did'
        verb = get_verb_lemma(verb)
      elif verb_pos in ['VB', 'VBP', 'VBZ']:
        if is_plural(sent, agent_index):
          q_ext = 'do'
          verb = get_verb_lemma(verb)
        else:
          q_ext = 'does'
          verb = get_verb_lemma(verb)
      # -ing forms are skipped unless the verb text contains one of the
      # `to_be_s` strings (this is a substring test on `verb`).
      elif verb_pos == 'VBG' and not any([to_be in verb for to_be in to_be_s]):
        continue
      elif not verb_pos.startswith('V'):
        continue

      # An auxiliary found in the sentence takes precedence over do-support.
      q_ext = aux if aux != '' else q_ext


      # Question filtering.
      # NOTE(review): the values computed in this group are not read again
      # in this cell.
      ent_threshold = 0
      agent_ent_num = find_num_ent(entities, agent_indexes)
      patient_ent_num = find_num_ent(entities, patient_indexes)
      ent_sum = agent_ent_num + patient_ent_num

      max_q_thresh = 500


      # First wh-word guess from the raw NER tags; it is recomputed below
      # from the coref-adjusted tags before being used.
      q_wh = find_wh_question(patient_ent_types, 'patient', all_ents[patient_indexes[0]: patient_indexes[1] + 1], patient)
      q_wh = extra_check_person(patient, q_wh)


      # Replace a bare pronoun answer with a descriptive mention of its cluster.
      # NOTE(review): `patient_cluster_idx` is subscripted with [0] just below,
      # so it looks like a sequence and this `is not None` test is presumably
      # always true - confirm against `find_corresponding_coref`.
      if simple_pronoun(patient) and patient_cluster_idx is not None:
        patient = get_non_trivial_coref(patient_corefs)

      if patient == '':
        continue


      # Entity tags for the patient, with person corefs marked as PER.
      Y = give_new_ents(patient, patient_indexes, patient_cluster_idx[0], all_ents)

      q_wh = find_wh_question(patient_ent_types, 'patient', Y, patient)
      q_wh = extra_check_person(patient, q_wh)

      # A patient whose syntactic root is a verb (a clause) is asked with "what".
      # NOTE(review): `get_sentence_root` can return None, in which case the
      # attribute access below would raise AttributeError.
      root_word = get_sentence_root(patient)
      if root_word.pos_ == 'VERB':
        q_wh = 'what'

      question = get_sent(q_wh, q_ext, agent, verb, state, arg3, arg4, time, loc, '?')


      # The same question split around the agent.
      q_detail = {
        'bef': get_sent(q_wh, q_ext),
        'agent': agent,
        'after': get_sent(extra_verb, verb, state, arg3, arg4, time, loc, '?')
      }


      answer = patient
      # Skip answers that are bare demonstratives or wh-words.
      if answer == 'this' or answer == 'that' or answer == 'when' or answer == 'where':
        continue
      question, answer = clean_text(question), clean_text(answer)

      # Question parts, stored before detokenisation.
      # NOTE(review): the 'agent' key appears twice in this literal; the later
      # entry wins (both carry the same value).
      Qs.append({
          'q_wh': q_wh,
          'q_ext': q_ext,
          'agent': agent,
          'verb': verb,
          'state': state,
          'arg3': arg3,
          'arg4': arg4,
          'time': time,
          'loc': loc,
          'answer': answer,
          'agent': agent,
          'agent_cluster_idx': agent_cluster_idx,
      })

      question = make_final_text(question)
      answer = make_final_text(answer)

      QA = get_sent(question, answer)
      print(nidx, '-', QA)

      agent_list.append([agent, agent_cluster_idx])

      # Final record with the intermediate values kept for inspection.
      # NOTE(review): the 'q_ext' key appears twice in this literal; the later
      # entry wins (both carry the same value).
      Agent_Patient_Qs.append({
          'question': question,
          'answer': answer,
          'agent': agent,
          'q_ext': q_ext,
          'patient': patient,
          'sent': sent,
          'old_verb': old_verb,
          'verb': verb,
          'aux': aux,
          'root': root_word.text,
          'root_pos': root_word.pos_,
          'pt': semantic_role_ent_type,
          'ents': entities,
          'extra_verb': extra_verb,
          'verb_indexes': verb_indexes,
          'verb_pos': verb_pos,
          'q_wh': q_wh,
          'q_ext': q_ext,
          'agent_cluster': agent_cluster_idx,
          'patient_cluster': patient_cluster_idx,
          'agent_in_question': True,
          'q_detail': q_detail,
          'sent_idx': sent_idx
      })

      nidx += 1



0 - what did Parsons win For his portrayal? four Primetime Emmy Awards, a Golden Globe Award, a TCA Award, and two Critics' Choice Television Awards
1 - what does He have? a genius-level IQ of 187
2 - what does Leonard have? an IQ of 173
3 - what did he state when Sheldon was studying at home and was commanded to answer the phone In Young Sheldon Season 7 Episode 7? that he is treated like a receptionist at home, despite having an IQ of 187, directly confirming the number
4 - what does he display? a fundamental lack of social skills, a tenuous understanding of humor (always ending with "bazinga"), and difficulty recognizing irony and sarcasm in other people
5 - what does he himself employ often? irony and sarcasm
6 - what does He exhibit? highly idiosyncratic behavior and a general lack of humility, empathy, and toleration
7 - what does These characteristics provide? the majority of the humor involving him, which are credited with making him the show's breakout character
8 - what have 

In [None]:
# Group the stored question parts by the coreference cluster of their agent
# (the first element of 'agent_cluster_idx' is the grouping key).
idx_2_list = dict()
for Q in Qs:
  cluster_id = Q['agent_cluster_idx'][0]
  idx_2_list.setdefault(cluster_id, []).append(Q)

In [82]:
# Replay the questions cluster by cluster: the first turn about an entity
# names it with a descriptive mention, later turns use a simple pronoun.
qidx = 1
for k, v in idx_2_list.items():
  for turn_idx, turn in enumerate(v):
    agent_cluster_idx = turn['agent_cluster_idx'][0]
    if agent_cluster_idx is None:
      # No coreference cluster: keep the agent text as it is.
      pn = turn['agent']
    else:
      pns = idx_2_cluster[agent_cluster_idx]
      pn = get_pronoun(pns, simple=(turn_idx != 0))

    # turn['time'] is deliberately left out of the rebuilt question.
    tail_parts = [turn[key] for key in ('verb', 'state', 'arg3', 'arg4', 'loc')]
    question = get_sent(turn['q_wh'], turn['q_ext'], pn, *tail_parts, '?')
    question, answer = clean_text(question), clean_text(turn['answer'])
    question = make_final_text(question)
    answer = make_final_text(answer)
    print(f'{qidx}-', question, answer)
    qidx += 1

1- what did parsons win For his portrayal? four Primetime Emmy Awards, a Golden Globe Award, a TCA Award, and two Critics' Choice Television Awards
2- what has he said? that in his opinion, Sheldon "couldn't display more facets" of Asperger's syndrome
3- what does the role have? a genius-level IQ of 187
4- what did he state In Young Sheldon Season 7 Episode 7? that he is treated like a receptionist at home, despite having an IQ of 187, directly confirming the number
5- what does he display? a fundamental lack of social skills, a tenuous understanding of humor (always ending with "bazinga"), and difficulty recognizing irony and sarcasm in other people
6- what does he employ? irony and sarcasm
7- what does he exhibit? highly idiosyncratic behavior and a general lack of humility, empathy, and toleration
8- what couldn't he display? more facets "of Asperger's syndrome
9- what did he get? his father fired
10- what did he cherish? his maternal grandfather
11- what did he call "Pop-Pop"? his 