In [None]:
# Mount Google Drive at /content/drive/ (forcing a remount) so the notebook
# can read and write files under that path.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# Allen NLP install & import

In [None]:
pip install allennlp==2.1.0 allennlp-models==2.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

In [None]:
# Archive of the pretrained coreference model (coref-spanbert-large, 2021.03.10).
COREF_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
predictor = Predictor.from_path(COREF_MODEL_ARCHIVE)

# Setup

In [None]:
import pandas as pd

# Column layout of the summary table: one row per sentence category and model.
col_names = [
    'Category',
    '%Name1',
    '%Name2',
    '%Missing',
    '%Both',
    '%Other',
    'Model',
]
# Start empty; each evaluation section appends its own row.
df_all = pd.DataFrame(columns=col_names)

In [None]:
# Root data folder on the mounted Drive. Note: no trailing slash, so any path
# built from it needs an explicit separator.
data_path = "/content/drive/My Drive/AmbiCoref/AmbiCoref/Data"

# Run Model

Reference implementation of the coreference model used below: https://github.com/allenai/allennlp-models/blob/main/allennlp_models/coref/models/coref.py

## Type ECO

In [None]:
# return values:
# 1: pos1
# 2: pos2
# -1: missing
# 3: both
# 4: other

def run_coref(s):
  """Classify which name the model links to the pronoun that follows "that".

  The sentence is split on whitespace and three spans are located by position,
  each as an inclusive (start, end) pair of word indices:
    * name1   - the first word, or the first two words if the sentence starts with "The";
    * name2   - the word just before "that", widened by one word if preceded by "the";
    * pronoun - the word right after "that".
  NOTE(review): this assumes whitespace word positions line up with the token
  indices used in the predictor's clusters - confirm for sentences that
  contain punctuation before the pronoun.

  Args:
    s: one sentence, typically a line read from a sentence file (a trailing
      line terminator is tolerated).

  Returns:
    1 if the pronoun's cluster contains name1 only, 2 if name2 only, 3 if both,
    4 if it contains neither name but more than one other span, and -1
    otherwise (the pronoun is in no cluster, or no case above applied).

  Raises:
    ValueError: if the sentence contains no standalone word "that".
  """
  # Drop only the line terminator. Slicing off the last character would eat
  # real text when a line (e.g. the last one in a file) has no trailing newline.
  output = predictor.predict(document=s.rstrip("\r\n"))
  words = s.split()
  word_index_that = words.index("that")

  name1 = (0, 1) if words[0] == 'The' else (0, 0)

  # Guard the look-back so an early "that" cannot wrap around to the list's end.
  if word_index_that >= 2 and words[word_index_that - 2] == 'the':
    name2 = (word_index_that - 2, word_index_that - 1)
  else:
    name2 = (word_index_that - 1, word_index_that - 1)

  pronoun = (word_index_that + 1, word_index_that + 1)

  for c in output['clusters']:
    spans = {tuple(span) for span in c}
    if pronoun not in spans:
      continue

    # Everything the pronoun is clustered with, excluding the pronoun itself.
    ref = spans - {pronoun}
    has_name1 = name1 in ref
    has_name2 = name2 in ref

    if has_name1 and has_name2:
      return 3
    if has_name1:
      return 1
    if has_name2:
      return 2
    # NOTE(review): a cluster with exactly one non-name antecedent falls
    # through and ends up reported as missing (-1) - confirm this is intended.
    if len(ref) > 1:
      return 4

  return -1

### unambiguous ECO-1 active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECO-1_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

-------------
%Name1 = 0.575423429781228
%Name2 = 0.12041284403669725
%Missing = 0.2837861679604799
%Both =  0.02037755822159492
%Other = 0.0


In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECO-1_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### ambiguous ECO-1 active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECO-1_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

-------------
%Name1 = 0.5665137614678899
%Name2 = 0.18216302046577276
%Missing = 0.21912491178546226
%Both =  0.03219830628087509
%Other = 0.0


In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECO-1_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### unambiguous ECO-2 active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECO-2_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECO-2_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### ambiguous ECO-2 active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECO-2_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECO-2_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

## Type ECS

In [None]:
def run_coref(s):
  """Classify which name the model links to the sentence-final pronoun.

  The sentence is split on whitespace and three spans are located by position,
  each as an inclusive (start, end) pair of word indices:
    * name1   - the first word, or the first two words if the sentence starts with "The";
    * name2   - the word just before "that", widened by one word if preceded by "the";
    * pronoun - the last whitespace-separated word of the sentence.
  NOTE(review): this assumes whitespace word positions line up with the token
  indices used in the predictor's clusters - confirm for sentences that
  contain punctuation before the pronoun.

  Args:
    s: one sentence, typically a line read from a sentence file (a trailing
      line terminator is tolerated).

  Returns:
    1 if the pronoun's cluster contains name1 only, 2 if name2 only, 3 if both,
    4 if it contains neither name but more than one other span (that set of
    spans is printed), and -1 otherwise (the pronoun is in no cluster, or no
    case above applied).

  Raises:
    ValueError: if the sentence contains no standalone word "that".
  """
  # Drop only the line terminator. Slicing off the last character would eat
  # real text when a line (e.g. the last one in a file) has no trailing newline.
  output = predictor.predict(document=s.rstrip("\r\n"))
  words = s.split()
  word_index_that = words.index("that")

  name1 = (0, 1) if words[0] == 'The' else (0, 0)

  # Guard the look-back so an early "that" cannot wrap around to the list's end.
  if word_index_that >= 2 and words[word_index_that - 2] == 'the':
    name2 = (word_index_that - 2, word_index_that - 1)
  else:
    name2 = (word_index_that - 1, word_index_that - 1)

  last_index = len(words) - 1
  pronoun = (last_index, last_index)

  for c in output['clusters']:
    spans = {tuple(span) for span in c}
    if pronoun not in spans:
      continue

    # Everything the pronoun is clustered with, excluding the pronoun itself.
    ref = spans - {pronoun}
    has_name1 = name1 in ref
    has_name2 = name2 in ref

    if has_name1 and has_name2:
      return 3
    if has_name1:
      return 1
    if has_name2:
      return 2
    # NOTE(review): a cluster with exactly one non-name antecedent falls
    # through and ends up reported as missing (-1) - confirm this is intended.
    if len(ref) > 1:
      print(ref)
      return 4

  return -1

### unambiguous ECS-1 active


In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECS-1_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

-------------
%Name1 = 0.2737030411449016
%Name2 = 0.28600178890876565
%Missing = 0.43067978533094814
%Both =  0.009615384615384616
%Other = 0.0


In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECS-1_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])


### ambiguous ECS-1 active


In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECS-1_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECS-1_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### unambiguous ECS-2 active


In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECS-2_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECS-2_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### ambiguous ECS-2 active


In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/ECS-2_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="ECS-2_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

## Type IC

In [None]:
# return values:
# 1: pos1
# 2: pos2
# -1: missing
# 3: both
# 4: other

def run_coref(s):
  """Classify which name the model links to the pronoun that follows "because".

  The sentence is split on whitespace and three spans are located by position,
  each as an inclusive (start, end) pair of word indices:
    * name1   - the first word, or the first two words if the sentence starts with "The";
    * name2   - the word just before "because", widened by one word if preceded by "the";
    * pronoun - the word right after "because".
  NOTE(review): this assumes whitespace word positions line up with the token
  indices used in the predictor's clusters - confirm for sentences that
  contain punctuation before the pronoun.

  Args:
    s: one sentence, typically a line read from a sentence file (a trailing
      line terminator is tolerated).

  Returns:
    1 if the pronoun's cluster contains name1 only, 2 if name2 only, 3 if both,
    4 if it contains neither name but more than one other span (that set of
    spans is printed), and -1 otherwise (the pronoun is in no cluster, or no
    case above applied).

  Raises:
    ValueError: if the sentence contains no standalone word "because".
  """
  # Drop only the line terminator. Slicing off the last character would eat
  # real text when a line (e.g. the last one in a file) has no trailing newline.
  output = predictor.predict(document=s.rstrip("\r\n"))
  words = s.split()
  # Name kept from the other sentence types; here it is the index of "because".
  word_index_that = words.index("because")

  name1 = (0, 1) if words[0] == 'The' else (0, 0)

  # Guard the look-back so an early "because" cannot wrap around to the list's end.
  if word_index_that >= 2 and words[word_index_that - 2] == 'the':
    name2 = (word_index_that - 2, word_index_that - 1)
  else:
    name2 = (word_index_that - 1, word_index_that - 1)

  pronoun = (word_index_that + 1, word_index_that + 1)

  for c in output['clusters']:
    spans = {tuple(span) for span in c}
    if pronoun not in spans:
      continue

    # Everything the pronoun is clustered with, excluding the pronoun itself.
    ref = spans - {pronoun}
    has_name1 = name1 in ref
    has_name2 = name2 in ref

    if has_name1 and has_name2:
      return 3
    if has_name1:
      return 1
    if has_name2:
      return 2
    # NOTE(review): a cluster with exactly one non-name antecedent falls
    # through and ends up reported as missing (-1) - confirm this is intended.
    if len(ref) > 1:
      print(ref)
      return 4

  return -1

### unambiguous active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/IC_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="IC_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### ambiguous active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/IC_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="IC_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

## Type TOP

In [None]:
# return values:
# 1: pos1
# 2: pos2
# -1: missing
# 3: both
# 4: other

def run_coref(s):
  """Classify which name the model links to the pronoun "she"/"he".

  The sentence is split on whitespace and three spans are located by position,
  each as an inclusive (start, end) pair of word indices:
    * pronoun - the first "she" if present, otherwise the first "he";
    * name1   - the first word, or the first two words if the sentence starts with "The";
    * name2   - starts at word 2 (word 3 after a leading "The"), or right after
                the first "by" when the sentence contains one; it spans two
                words when it begins with "the".
  NOTE(review): the fixed start offsets presumably assume a one-word verb
  between the two names, and the span indices assume whitespace word positions
  line up with the predictor's token indices - confirm both against the data.

  Args:
    s: one sentence, typically a line read from a sentence file (a trailing
      line terminator is tolerated).

  Returns:
    1 if the pronoun's cluster contains name1 only, 2 if name2 only, 3 if both,
    4 if it contains neither name but more than one other span (that set of
    spans is printed), and -1 otherwise (the pronoun is in no cluster, or no
    case above applied).

  Raises:
    ValueError: if the sentence contains neither "she" nor "he" as a word.
    IndexError: if the computed start of name2 lies beyond the last word.
  """
  # Run the model once per sentence (the lookup used to be duplicated, which
  # doubled the inference cost). Drop only the line terminator: slicing off the
  # last character would eat real text when a line has no trailing newline.
  output = predictor.predict(document=s.rstrip("\r\n"))
  words = s.split()

  pronoun_index = words.index("she") if "she" in words else words.index("he")
  pronoun = (pronoun_index, pronoun_index)

  if words[0] == 'The':
    name1 = (0, 1)
    name2_index_start = 3
  else:
    name1 = (0, 0)
    name2_index_start = 2

  # When the sentence contains "by", the second name follows it instead.
  if "by" in words:
    name2_index_start = words.index("by") + 1

  if words[name2_index_start] == 'the':
    name2 = (name2_index_start, name2_index_start + 1)
  else:
    name2 = (name2_index_start, name2_index_start)

  for c in output['clusters']:
    spans = {tuple(span) for span in c}
    if pronoun not in spans:
      continue

    # Everything the pronoun is clustered with, excluding the pronoun itself.
    ref = spans - {pronoun}
    has_name1 = name1 in ref
    has_name2 = name2 in ref

    if has_name1 and has_name2:
      return 3
    if has_name1:
      return 1
    if has_name2:
      return 2
    # NOTE(review): a cluster with exactly one non-name antecedent falls
    # through and ends up reported as missing (-1) - confirm this is intended.
    if len(ref) > 1:
      print(ref)
      return 4

  return -1

### unambiguous active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/TOP_unambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

-------------
%Name1 = 0.20845204178537513
%Name2 = 0.636633428300095
%Missing = 0.13437796771130103
%Both =  0.018399810066476733
%Other = 0.002136752136752137


In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="TOP_unambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

### ambiguous active

In [None]:
# Read one test sentence per line; the context manager closes the file handle
# instead of leaving it open for the rest of the session.
with open(r"/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/sentences/TOP_ambiguous.txt","r") as file:
  sentences = file.readlines()

In [None]:
# Reset every outcome tally before scoring this sentence file.
count_pos1 = count_pos2 = count_total = count_other = count_both = 0

In [None]:
# Score every sentence and tally its outcome code; any other code (e.g. -1)
# only increases the total.
for i, s in enumerate(sentences):
  count_total += 1
  result = run_coref(s)
  if result == 1:
    count_pos1 += 1
  elif result == 2:
    count_pos2 += 1
  elif result == 3:
    count_both += 1
  elif result == 4:
    count_other += 1


#### Results

In [None]:
# Fraction of sentences per outcome; "missing" is the remainder no tally claimed.
name1_perc = count_pos1 / count_total
name2_perc = count_pos2 / count_total
both_perc = count_both / count_total
other_perc = count_other / count_total
missing_perc = (count_total - (count_pos1 + count_pos2 + count_other + count_both)) / count_total

print("-------------")
print(f"%Name1 = {name1_perc}")
print(f"%Name2 = {name2_perc}")
print(f"%Missing = {missing_perc}")
print(f"%Both =  {both_perc}")
print(f"%Other = {other_perc}")

In [None]:
# Wrap this run's fractions in a one-row frame, tag it, and append it to df_all.
result_temp = [(name1_perc, name2_perc, missing_perc, both_perc, other_perc)]
df_temp = pd.DataFrame(
    result_temp,
    columns=['%Name1', '%Name2', '%Missing', '%Both', '%Other'],
).assign(Category="TOP_ambiguous", Model="Allen_NLP")
df_all = pd.concat([df_all, df_temp])

# Final output

In [None]:
import os

# data_path has no trailing slash, so plain string concatenation produced
# ".../DataResults/results_Allen_NLP.csv"; join the parts with a separator
# so the file lands in the "Results" folder under data_path.
results_dir = os.path.join(data_path, "Results")
os.makedirs(results_dir, exist_ok=True)  # to_csv does not create missing folders
df_all.to_csv(os.path.join(results_dir, "results_Allen_NLP.csv"))