In [None]:
# Mount Google Drive at /content/drive/ so the Drive paths used below resolve.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
# Base directory on the mounted Drive; the CSV reads below are built by appending to it.
data_path = '/content/drive/My Drive/AmbiCoref/AmbiCoref/Data/'

# Data

## Gendered names and noun phrases 


*   20 gendered phrases: 
  * partially from https://github.com/uclanlp/corefBias/blob/master/WinoBias/wino/extra_gendered_words.txt
*   20 gendered names: 
  * https://www.ssa.gov/oact/babynames/decades/names1970s.html
  * most popular names from the past 5 decades; 4 names sampled for each decade.







In [None]:
# Read the name table and the gendered-NP table from the NPs/ folder under data_path.
names_df = pd.read_csv(f"{data_path}NPs/Names.csv")
NPs_df = pd.read_csv(f"{data_path}NPs/Gendered_NPs.csv")
# Flat list of every name (male column first, then female); grammar() tests membership in it.
all_names = [*names_df['Male'].to_list(), *names_df['Female'].to_list()]

In [None]:
# NP table whose 'Female' and 'Male' columns are turned into lists in the next cell.
df = pd.read_csv(data_path + 'NPs/NP_mixed.csv')

In [None]:
# Per-gender Python lists of NPs taken from the 'Female' and 'Male' columns.
f_nps, m_nps = (df[col].to_list() for col in ('Female', 'Male'))

In [None]:
# Show how many NPs each gender list holds.
len(f_nps), len(m_nps)

(40, 40)

## Permutations

### (first time) Generate NP pairs

In [None]:
# iteration_utilities provides random_combination, which the pair-generation cells below call.
!pip install iteration-utilities

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import itertools
import random
import re

import numpy as np
from iteration_utilities import random_combination

In [None]:
# Threshold that the `count` counter in each generation loop below is compared against.
num_np_pairs = 26

In [None]:
def get_N3(n1, n2, nps):
  """Pick a random NP for the N3 position that differs from both n1 and n2.

  Args:
    n1: NP already used in the first position.
    n2: NP already used in the second position.
    nps: sequence of NPs to draw from.

  Returns:
    An entry of nps that is equal to neither n1 nor n2, drawn uniformly from
    the eligible entries.

  Raises:
    IndexError: if nps contains no entry other than n1 and n2 (including when
      nps is empty). Retrying random draws in that situation would never end.
  """
  # Filter first so the draw always succeeds in a single call.
  candidates = [cand for cand in nps if cand != n1 and cand != n2]
  if not candidates:
    raise IndexError("get_N3: nps has no entry that differs from both n1 and n2")
  return random.choice(candidates)

In [None]:
# Three-female-NP tuples: draw a pair (f1, f2) that has not been drawn before, add
# a third NP f3 that differs from both, and store the tuple in both orders of the
# first two NPs.
all_2f = set()   # female pairs drawn so far; later cells keep adding to this set
nps_3f = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    f1, f2 = random_combination(f_nps, r=2)
    if (f1, f2) not in all_2f:
      break
  all_2f.add((f1, f2))
  f3 = get_N3(f1, f2, f_nps)
  nps_3f.extend([(f1, f2, f3), (f2, f1, f3)])
  count += 2

In [None]:
# Three-male-NP tuples: draw a pair (m1, m2) that has not been drawn before, add a
# third NP m3 that differs from both, and store the tuple in both orders of the
# first two NPs.
all_2m = set()   # male pairs drawn so far; later cells keep adding to this set
nps_3m = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    m1, m2 = random_combination(m_nps, r=2)
    if (m1, m2) not in all_2m:
      break
  all_2m.add((m1, m2))
  m3 = get_N3(m1, m2, m_nps)
  nps_3m.extend([(m1, m2, m3), (m2, m1, m3)])
  count += 2

In [None]:
# Two female NPs plus one male NP: the female pair must not already be in all_2f
# (which is not reset here), and the male NP is drawn freely from m_nps.
nps_2f_m = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    f1, f2 = random_combination(f_nps, r=2)
    if (f1, f2) not in all_2f:
      break
  all_2f.add((f1, f2))
  m3 = random.choice(m_nps)
  nps_2f_m.extend([(f1, f2, m3), (f2, f1, m3)])
  count += 2


In [None]:
# Two male NPs plus one female NP: the male pair must not already be in all_2m
# (which is not reset here), and the female NP is drawn freely from f_nps.
nps_2m_f = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    m1, m2 = random_combination(m_nps, r=2)
    if (m1, m2) not in all_2m:
      break
  all_2m.add((m1, m2))
  f3 = random.choice(f_nps)
  nps_2m_f.extend([(m1, m2, f3), (m2, m1, f3)])
  count += 2


In [None]:
# Two-male-NP pairs, each stored in both orders; pairs already in all_2m are skipped.
# NOTE(review): count advances by 1 per drawn pair although two tuples are appended,
# so nps_2m ends with 2 * num_np_pairs entries, whereas the 3-NP cells advance
# count by 2 - confirm this difference is intended.
nps_2m = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    m1, m2 = random_combination(m_nps, r=2)
    if (m1, m2) not in all_2m:
      break
  all_2m.add((m1, m2))
  nps_2m.extend([(m1, m2), (m2, m1)])
  count += 1

In [None]:
# Two-female-NP pairs, each stored in both orders; pairs already in all_2f are skipped.
# NOTE(review): count advances by 1 per drawn pair although two tuples are appended,
# so nps_2f ends with 2 * num_np_pairs entries, whereas the 3-NP cells advance
# count by 2 - confirm this difference is intended.
nps_2f = []
count = 0
while count < num_np_pairs:
  # redraw until the pair is new
  while True:
    f1, f2 = random_combination(f_nps, r=2)
    if (f1, f2) not in all_2f:
      break
  all_2f.add((f1, f2))
  nps_2f.extend([(f1, f2), (f2, f1)])
  count += 1

### (first time) Save the generated NP pairs - For keeping track

In [None]:
# for the purpose of keeping track - making sure we are using the same sets of NP pairs during sentence generation
# The generated lists do not all have the same length (the two-NP lists hold twice
# as many entries as the three-NP lists), and zip() would cut every column down to
# the shortest list. Building the frame from one Series per list keeps every
# generated tuple and pads the shorter columns with NaN instead.
np_pair_lists = {
    'nps_3m': nps_3m,
    'nps_3f': nps_3f,
    'nps_2m_f': nps_2m_f,
    'nps_2f_m': nps_2f_m,
    'nps_2m': nps_2m,
    'nps_2f': nps_2f,
}
np_pairs_df = pd.DataFrame(
    {col: pd.Series(pairs, dtype=object) for col, pairs in np_pair_lists.items()}
)
np_pairs_df.to_csv(data_path+'NPs/All_generated_NP_pairs.csv')

In [None]:
# One DataFrame per generated list, with one named column per NP slot.
nps_3m_df, nps_3f_df, nps_2m_f_df, nps_2f_m_df, nps_2m_df, nps_2f_df = (
    pd.DataFrame(rows, columns=cols)
    for rows, cols in (
        (nps_3m, ['m1','m2','m3']),
        (nps_3f, ['f1','f2','f3']),
        (nps_2m_f, ['m1','m2','f']),
        (nps_2f_m, ['f1','f2','m']),
        (nps_2m, ['m1','m2']),
        (nps_2f, ['f1','f2']),
    )
)

In [None]:
# Write each per-list DataFrame to NPs/<name>.csv under data_path.
nps_dfs = [nps_3m_df, nps_3f_df, nps_2m_f_df, nps_2f_m_df, nps_2m_df, nps_2f_df]
nps_dfs_names = ['nps_3m', 'nps_3f', 'nps_2m_f', 'nps_2f_m', 'nps_2m', 'nps_2f']
for out_df, out_name in zip(nps_dfs, nps_dfs_names):
  out_df.to_csv(data_path+'NPs/'+out_name+'.csv')

### Retrieve the NP pairs

In [None]:
# nps_3m_df = pd.read_csv(data_path+'NPs/nps_3m.csv')
# nps_3f_df = pd.read_csv(data_path+'NPs/nps_3f.csv')
# nps_2m_f_df = pd.read_csv(data_path+'NPs/nps_2m_f.csv')
# nps_2f_m_df = pd.read_csv(data_path+'NPs/nps_2f_m.csv')
# nps_2m_df = pd.read_csv(data_path+'NPs/nps_2m.csv')
# nps_2f_df = pd.read_csv(data_path+'NPs/nps_2f.csv')


## Verbs


In [None]:
# Type ECO
ECO_all = pd.read_csv(f"{data_path}verb_phrases/ECO.csv")
# 'bored' column feeds the ambiguous ECO cells, 'saw' the unambiguous ones; empty entries are dropped.
amuse_verbs, see_verbs = (ECO_all[col].dropna() for col in ('bored', 'saw'))

In [None]:
# Type ECS
ECS_all = pd.read_csv(f"{data_path}verb_phrases/ECS.csv")
# 'liked' column feeds the ambiguous ECS cells, 'met-with' the unambiguous ones; empty entries are dropped.
admire_verbs, meet_verbs = (ECS_all[col].dropna() for col in ('liked', 'met-with'))

In [None]:
#Type IC
IC_all = pd.read_csv(f"{data_path}verb_phrases/IC.csv")
# Verbs plus the two sets of "because ..." reasons used by the IC cells; empty entries are dropped.
call_verbs, reasons_amb, reasons_unamb = (
    IC_all[col].dropna() for col in ('called', 'reason_ambig', 'reason_unambig')
)

In [None]:
# Type TOP
# The TOP cells read the 'passed', 'DO', 'prep', 'event_unambig' and 'event_ambig' columns of each row.
TOP_all = pd.read_csv(data_path+"verb_phrases/TOP.csv")

def male_to_female(text):
  """Return text with lowercase masculine pronouns swapped for feminine ones.

  Replaces the whole words "he" -> "she", "him" -> "her", "his" -> "her" and
  "himself" -> "herself". Matching is on word boundaries, so words that merely
  contain those letters (e.g. "history", "the") are left untouched, and a
  pronoun next to punctuation or at the start/end of the text is still swapped.

  Args:
    text: the phrase to convert (a str).

  Returns:
    A new str with the pronouns replaced.
  """
  swaps = {"he": "she", "him": "her", "his": "her", "himself": "herself"}
  # A single pass, so replaced text is never rescanned.
  return re.sub(r"\b(himself|him|his|he)\b", lambda hit: swaps[hit.group(1)], text)

# Type ECO

In [None]:
def grammar(np_pair):
  """Add a definite article to every entry of np_pair that is not a name.

  Entries found in all_names are kept unchanged. Any other entry is prefixed
  with "The " when it is the first element and with "the " otherwise.

  Args:
    np_pair: sequence of NP strings (the callers pass 2- or 3-tuples).

  Returns:
    A new list with one string per entry of np_pair, in the same order.
  """
  def with_article(pos, noun):
    if noun in all_names:
      return noun
    # only the sentence-initial NP gets a capitalised article
    article = "The " if pos == 0 else "the "
    return article + noun

  return [with_article(pos, noun) for pos, noun in enumerate(np_pair)]

## Unambiguous ECO-1: [Name A] told [Name B] that [pronoun] [saw] [Name C].


In [None]:
import random
import itertools

# Write one sentence per (verb, NP tuple) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECO-1_unambiguous.txt","w+") as file_a:
  for s in see_verbs:
    #f1 told f2 that she saw f3 (f3!=f1, f3!=f2, f1!=f2)
    for p in nps_3f:
      f1, f2, f3 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + s + " " + f3  +  ".\n")

    #f1 told f2 that she saw m
    for p in nps_2f_m:
      f1, f2, m = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + s + " " + m  +  ".\n")

    #m1 told m2 that he saw m3 (m3!=m1, m3!=m2, m1!=m2)
    for p in nps_3m:
      m1, m2, m3 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + s + " " + m3  +  ".\n")

    #m1 told m2 that he saw f
    for p in nps_2m_f:
      m1, m2, f = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + s + " " + f  +  ".\n")


## Ambiguous ECO-1: [Name A] told [Name B] that [pronoun] [bored] [Name C].

In [None]:
import random

# Write one sentence per (verb, NP tuple) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECO-1_ambiguous.txt","w+") as file_a:
  for s in amuse_verbs:
    #f1 told f2 that she bored f3 (f3!=f1, f3!=f2, f1!=f2)
    for p in nps_3f:
      f1,f2,f3 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + s + " " + f3  +  ".\n")

    #f1 told f2 that she bored m
    for p in nps_2f_m:
      f1,f2,m = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + s + " " + m  +  ".\n")

    #m1 told m2 that he bored m3 (m3!=m1, m3!=m2, m1!=m2)
    for p in nps_3m:
      m1,m2,m3 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + s + " " + m3  +  ".\n")

    #m1 told m2 that he bored f
    for p in nps_2m_f:
      m1,m2,f = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + s + " " + f  +  ".\n")

## Unambiguous ECO-2: [Name A] told [Name B] that [pronoun] [saw] the client.


In [None]:
import random

# Write one sentence per (verb, NP pair) combination. The with-block closes the
# file even when a write fails part-way through.
# NOTE(review): this file name ends in "_.txt", unlike the other output files - confirm it is intended.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECO-2_unambiguous_.txt","w+") as file_a:
  for s in see_verbs:
    #f1 told f2 that she saw the client
    for p in nps_2f:
      f1,f2 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + s + " the client.\n")

    #m1 told m2 that he saw the client
    for p in nps_2m:
      m1,m2 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + s + " the client.\n")


## Ambiguous ECO-2: [Name A] told [Name B] that [pronoun] [bored] the client.

In [None]:
import random

# Write one sentence per (verb, NP pair) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECO-2_ambiguous.txt","w+") as file_a:
  for v in amuse_verbs:
    #f1 told f2 that she bored the client
    for p in nps_2f:
      f1,f2 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that she " + v + " the client.\n")

    #m1 told m2 that he bored the client
    for p in nps_2m:
      m1,m2 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that he " + v + " the client.\n")

# Type ECS

## Unambiguous ECS-1: [Name A] [told] [Name B] that [Name C] [met with] [pronoun].


In [None]:
import random

# Write one sentence per (verb, NP tuple) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECS-1_unambiguous.txt","w+") as file_a:
  for s in meet_verbs:
    #f1 told f2 that f3 met with her (f3!=f1, f3!=f2, f1!=f2)
    for p in nps_3f:
      f1,f2,f3 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that " + f3 + " " + s + " her.\n")

    #f1 told f2 that m met with her
    for p in nps_2f_m:
      f1,f2,m = grammar(p)
      file_a.write(f1 + " told " + f2 + " that " + m + " " + s + " her.\n")

    #m1 told m2 that m3 met with him (m3!=m1, m3!=m2, m1!=m2)
    for p in nps_3m:
      m1,m2,m3 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that " + m3 + " " + s + " him.\n")

    #m1 told m2 that f met with him
    for p in nps_2m_f:
      m1,m2,f = grammar(p)
      file_a.write(m1 + " told " + m2 + " that " + f + " " + s + " him.\n")

## Ambiguous ECS-1: [Name A] [told] [Name B] that [Name C] [liked] [pronoun].


In [None]:
import random

# Write one sentence per (verb, NP tuple) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECS-1_ambiguous.txt","w+") as file_a:
  for s in admire_verbs:
    #f1 told f2 that f3 liked her (f3!=f1, f3!=f2, f1!=f2)
    for p in nps_3f:
      f1,f2,f3 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that " + f3 + " " + s + " her.\n")

    #f1 told f2 that m liked her
    for p in nps_2f_m:
      f1,f2,m = grammar(p)
      file_a.write(f1 + " told " + f2 + " that " + m + " " + s + " her.\n")

    #m1 told m2 that m3 liked him (m3!=m1, m3!=m2, m1!=m2)
    for p in nps_3m:
      # unpack into the names the write below uses, so each tuple yields its own sentence
      m1,m2,m3 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that " + m3 + " " + s + " him.\n")

    #m1 told m2 that f liked him
    for p in nps_2m_f:
      m1,m2,f = grammar(p)
      file_a.write(m1 + " told " + m2 + " that " + f + " " + s + " him.\n")

## Unambiguous ECS-2: [Name A] [told] [Name B] that the client [met with] [pronoun].


In [None]:
import random

# Write one sentence per (verb, NP pair) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECS-2_unambiguous.txt","w+") as file_a:
  for s in meet_verbs:

    #f1 told f2 that the client met with her (f1!=f2)
    for p in nps_2f:
      f1, f2 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that the client " + s + " her.\n")

    #m1 told m2 that the client met with him
    for p in nps_2m:
      m1, m2 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that the client " + s + " him.\n")

## Ambiguous ECS-2: [Name A] [told] [Name B] that the client [liked] [pronoun].


In [None]:
import random

# Write one sentence per (verb, NP pair) combination. The with-block closes the
# file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/ECS-2_ambiguous_active.txt","w+") as file_a:
  for s in admire_verbs:

    #f1 told f2 that the client liked her (f1!=f2)
    for p in nps_2f:
      f1, f2 = grammar(p)
      file_a.write(f1 + " told " + f2 + " that the client " + s + " her.\n")

    #m1 told m2 that the client liked him
    for p in nps_2m:
      m1, m2 = grammar(p)
      file_a.write(m1 + " told " + m2 + " that the client " + s + " him.\n")

# Type IC

## Unambiguous

In [None]:
import random

# Sentence template: "<NP1> <verb> <NP2> because he/she <reason>."
# The with-block closes the file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/IC_unambiguous.txt","w+") as file_a:
  # Iterate the verbs directly: call_verbs is a Series after dropna(), so an
  # integer lookup such as call_verbs[i] goes by index label and fails if
  # dropna() removed a row before the end.
  for c in call_verbs:
    for r in reasons_unamb:
      for p in nps_2m:
        m1, m2 = grammar(p)
        file_a.write(m1 + " " + c + " " + m2 + " because he " + r + ".\n")

      for p in nps_2f:
        f1, f2 = grammar(p)
        file_a.write(f1 + " " + c + " " + f2 + " because she " + r + ".\n")

## Ambiguous

In [None]:
import random

# Sentence template: "<NP1> <verb> <NP2> because he/she <reason>."
# The with-block closes the file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/data_ealc/sentences/IC_ambiguous.txt","w+") as file_a:
  # Iterate the verbs directly: call_verbs is a Series after dropna(), so an
  # integer lookup such as call_verbs[i] goes by index label and fails if
  # dropna() removed a row before the end.
  for c in call_verbs:
    for r in reasons_amb:
      for p in nps_2m:
        m1, m2 = grammar(p)
        file_a.write(m1 + " " + c + " " + m2 + " because he " + r + ".\n")

      for p in nps_2f:
        f1, f2 = grammar(p)
        file_a.write(f1 + " " + c + " " + f2 + " because she " + r + ".\n")

# Type TOP

## Unambiguous

In [None]:
import random

# Sentence template: "<NP1> <passed> <NP2> <DO> <prep> he/she <event>."
# The with-block closes the file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/Data/sentences/TOP_unambiguous.txt","w+") as file_a:
  for i,row in TOP_all.iterrows():
    verb_m = row['event_unambig']
    # feminine variant of the event phrase, used in the sentences with "she"
    verb_f = male_to_female(verb_m)

    for p in nps_2f:
      f1,f2 = grammar(p)
      file_a.write(f1 + " " + row['passed'] + " " + f2 + " " + row['DO'] + " " + row['prep'] + " she " + verb_f + ".\n")

    for p in nps_2m:
      m1,m2 = grammar(p)
      file_a.write(m1 + " "+ row['passed'] + " " + m2 + " " + row['DO'] + " " + row['prep'] + " he " + verb_m + ".\n")


## Ambiguous

In [None]:
import random

# Sentence template: "<NP1> <passed> <NP2> <DO> <prep> he/she <event>."
# The with-block closes the file even when a write fails part-way through.
with open(r"/content/drive/My Drive/AmbiCoref/Data/sentences/TOP_ambiguous.txt","w+") as file_a:
  for i,row in TOP_all.iterrows():
    verb_m = row['event_ambig']
    # feminine variant of the event phrase, used in the sentences with "she"
    verb_f = male_to_female(verb_m)

    for p in nps_2f:
      f1,f2 = grammar(p)
      file_a.write(f1 + " "+ row['passed'] + " " + f2 + " " + row['DO'] + " " + row['prep'] + " she " + verb_f + ".\n")

    for p in nps_2m:
      m1,m2 = grammar(p)
      file_a.write(m1 + " "+ row['passed'] + " " + m2 + " " + row['DO'] + " " + row['prep'] + " he " + verb_m + ".\n")