# Gender Encryption

### Installation

https://github.com/zalandoresearch/flair

https://github.com/huggingface/neuralcoref

In [None]:
#!pip install flair
#!pip uninstall spacy
#!pip install -U spacy==2.1.0
##!python -m spacy download en
#!python -m spacy download en_core_web_lg
##!python -m spacy download en_core_web_md
#!pip install neuralcoref --no-binary neuralcoref

### Creating Resources

In [2]:
# Parallel pronoun tables: position i in every list is the same grammatical slot.
# A trailing "$" marks the form that is looked up for tokens tagged <PRP$>.
male_pronouns = [
    "he",
    "him",
    "his$",
    "his",
    "himself",
]
female_pronouns = [
    "she",
    "her",
    "her$",
    "hers",
    "herself",
]
neutral_pronouns = [
    "zie",
    "zim",
    "zir",
    "zis",
    "zieself",
]
merged_pronouns = [
    "he/she",
    "him/her",
    "his/her",
    "his/hers",
    "himself/herself",
]

In [3]:
# Lookup tables filled by the loops in the following cells:
# every pronoun / honorific spelling maps to the record of its row.
gender_pronouns_dict, gender_honorific_dict = dict(), dict()

In [4]:
# Index every pronoun spelling (male, female, neutral, merged) to one shared
# record per row, so any form can be translated into any other.
for forms in zip(male_pronouns, female_pronouns, neutral_pronouns, merged_pronouns):
    record = dict(zip(("male", "female", "neutral", "merged"), forms))
    for form in forms:
        gender_pronouns_dict[form] = record

In [5]:
# Parallel honorific tables: position i in every list is the same title,
# with and without the trailing period where both spellings exist.
male_hons = [
    "Mr.", "Mr",
    "Md.", "Md",
    "Sir", "Lord", "Mister",
]
female_hons = [
    "Ms.", "Ms",
    "Mst.", "Mst",
    "Madam", "Lady", "Miss",
]
neutral_hons = [
    "Mx.", "Mx",
    "Mx.", "Mx",
    "Sir/Madam", "Lord/Lady", "Mister/Miss",
]
married_hons = [
    "Mrs.", "Mrs",
    "Mst.", "Mst",
    "Madam", "Lady", "Mis'ess",
]
merged_hons = [
    "Mr./Ms.", "Mr/Ms",
    "Md./Mst.", "Md/Mst",
    "Sir/Madam", "Lord/Lady", "Mister/Miss",
]

In [6]:
# Index every honorific spelling to the record of its row.
# The same spelling can occur in more than one row: "Mx." and "Mx" are repeated
# in neutral_hons for the "Md." rows. With plain assignment the later row wins,
# which re-points "Mx."/"Mx" at the "Md./Mst." record; here the first row that
# mentions a spelling keeps it, so "Mx." resolves to the "Mr./Ms." record.
assigned_hons = set()
for (h1,h2,h3,h4,h5) in zip(male_hons, female_hons, neutral_hons, married_hons, merged_hons):
    element = {"male": h1, "female":h2, "neutral":h3, "married_fem":h4, "merged":h5}
    for hon in (h1, h2, h3, h4, h5):
        if hon not in assigned_hons:
            gender_honorific_dict[hon] = element
            assigned_hons.add(hon)

### Imports

In [7]:
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single

In [8]:
from keras.models import load_model

Using TensorFlow backend.


In [9]:
import spacy
import neuralcoref

### Loading Essentials

In [10]:
import json

# Character -> index table; name2vectorTest() uses it to encode names.
with open('char2idx.json', 'r') as char2idx_file:
    char2idx = json.load(char2idx_file)

# Second table loaded from idx2char.json (not read by any cell shown in this notebook).
with open('idx2char.json', 'r') as idx2char_file:
    idx2char = json.load(idx2char_file)

In [11]:
# Keras model loaded from disk; name2gender() and isMale() feed it padded
# character-index sequences through model.predict_classes().
model = load_model('char_rnn_hsc_model_0.h5')

2019-11-08 10:14:03,133 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

2019-11-08 10:14:03,172 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

2019-11-08 10:14:03,196 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

2019-11-08 10:14:03,330 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

2019-11-08 10:14:03,337 From c:\users\hp\appdata\local\programs\python

In [12]:
# spaCy English pipeline with the neuralcoref component attached. Later cells
# read token._.in_coref and token._.coref_clusters from the documents it produces.
nlp = spacy.load('en_core_web_lg') # en_core_web_sm #en_core_web_md
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x14835964358>

In [13]:
# flair sequence taggers loaded by model name ('ner' and 'pos'); the POS-NER
# cell runs both on the same Sentence and reads the combined tag strings.
tagger_ner = SequenceTagger.load('ner')
tagger_pos = SequenceTagger.load('pos')

2019-11-08 10:14:17,137 loading file C:\Users\Hp\.flair\models\en-ner-conll03-v0.4.pt
2019-11-08 10:14:36,227 loading file C:\Users\Hp\.flair\models\en-pos-ontonotes-v0.2.pt


### Name2Gender

In [68]:
import numpy as np
from keras.preprocessing import sequence

# Converts a name into vector
def name2vectorTest(name):
    """Encode a name as a 1-D array of character indices.

    The name is lower-cased; characters that are not keys of the global
    ``char2idx`` table are dropped, the rest are mapped through it in order.
    """
    indices = [char2idx[ch] for ch in name.lower() if ch in char2idx]
    return np.array(indices)

# Converts names to fixed size tensor
def names2tensorTest(names, maxlen=25):
    """Encode several names and pad them to a common length.

    Args:
        names: iterable of name strings.
        maxlen: target length forwarded to ``sequence.pad_sequences``.

    Returns:
        The result of ``sequence.pad_sequences`` for the encoded names.
    """
    namelist = [name2vectorTest(name) for name in names]
    # Pass the plain list of per-name vectors. Wrapping it in np.array() first
    # only works when every name encodes to the same length; names of different
    # lengths produce a ragged array, which recent NumPy rejects outright.
    return sequence.pad_sequences(namelist, maxlen=maxlen)

def name2gender(name):
  """Return "male" or "female" for a name, based on the model's predicted class.

  The lower-cased name is encoded with names2tensorTest() and passed to
  model.predict_classes(); a truthy class means "male".
  """
  encoded = np.array(names2tensorTest([name.lower()]))
  predicted = model.predict_classes(encoded)[0][0]
  return "male" if predicted else "female"
  
def isMale(name):
  """Return the raw class predicted by the model for a name (truthy means male)."""
  encoded = np.array(names2tensorTest([name.lower()]))
  return model.predict_classes(encoded)[0][0]

In [69]:
def store(name, name_found, Name2Key, Key2Name, num_keys):
    """Register a person mention and return the placeholder key of its person.

    Args:
        name: the mention as it appears in the text.
        name_found: the canonical name the mention resolves to; equal to
            ``name`` when the mention is not an alias.
        Name2Key: dict, mutated in place. Maps a canonical name to its full
            record and an alias to a small record pointing at the canonical name.
        Key2Name: dict, mutated in place. Maps "PER_<n>" keys to the canonical
            record (the same dict object that is stored in ``Name2Key``).
        num_keys: number of keys issued so far.

    Returns:
        ``(key, num_keys)``: the key of ``name_found`` and the updated counter.
    """
    if name_found not in Name2Key:
        num_keys += 1
        key = "PER_" + str(num_keys)
        record = {
            # The canonical record carries the canonical name. Storing `name`
            # here loses the full name whenever the first mention seen is an
            # alias, because "alias" is set to that same surface form below.
            "name": name_found,
            "key": key,
            "gender": name2gender(name_found),
            "alias": None,
            "is_alias": False,
            "alias_to": None,
        }
        Name2Key[name_found] = record
        Key2Name[key] = record

    if name not in Name2Key:
        # First time this surface form is seen: record it as an alias and
        # remember it on the canonical record (a later alias replaces it).
        Name2Key[name] = {"name": name, "is_alias": True, "alias_to": name_found}
        Name2Key[name_found]["alias"] = name

    return Name2Key[name_found]["key"], num_keys

### Input Sentence

In [70]:
# Sample input 1. The two cells below assign `s` again, so whichever of these
# cells ran last decides what the pipeline processes.
s = '''Biden said he would beat the President "like a drum" and that Trump "knows it" because he has spent "a lot of money to make sure I'm not" the nominee.'''

In [26]:
# Sample input 2. Only the quoted first sentence is assigned to `s`; the rest
# of the passage sits in the trailing comment on the same line.
s = "Morshed Khan told newsmen that 'BNP is a massive party with huge popularity and public acceptance.'"# This party is now running its operations over Skype. He (Tarique) is operating from London over Skype and that too with only selected leaders. It has turned into the ‘Bangladesh Nationalist Skype Party’. This is painful. I believe the next generation must assume leadership within the party — that would be my recommendation.”"77

In [46]:
# Sample input 3: an honorific, a full name, and a shorter later mention of the same name.
s = "Mr Masum Hasan is not a doctor. Masum is an engineer."

### POS-NER

In [47]:
# Collapse every run of whitespace in the input to a single space.
s = ' '.join(s.split())
doc = nlp(s)
# Re-join spaCy's tokens with single spaces and tag that string with flair.
# Later cells index doc[i] and tokens_pos[i] with the same i, so this
# presumably relies on flair splitting the string back into the same tokens
# -- TODO confirm for the flair version in use.
tokenized_text = ' '.join([token.text for token in doc])
oracle = []        # one integer code per token, filled in the loop below
coref2name = {}    # filled by the coreference cell

# POS TAG
sent = Sentence(tokenized_text)
tagger_ner.predict(sent)
tagger_pos.predict(sent)
# The loop below reads the tagged string as alternating "token <tag>" items,
# i.e. it assumes every token is followed by exactly one tag string.
tagged_list = sent.to_tagged_string().split()
tokens_pos = []    # token texts, in order
pos = []           # tag string of each token, e.g. '<B-PER/NNP>' or '<VBZ>'
count = 0          # incremented per token; not read by any cell shown here
for i in range(0,len(tagged_list),2):
  tokens_pos.append(tagged_list[i])
  count = count+1
  pos.append(tagged_list[i+1])

  # Oracle codes assigned here:
  #   2 = gendered pronoun (lower-cased token is in male_pronouns or female_pronouns)
  #   4 = token tagged as part of a person name with POS NNP
  #   0 = anything else
  if tagged_list[i].lower() in male_pronouns+female_pronouns:
      oracle.append(2)
  elif tagged_list[i+1] in ['<B-PER/NNP>', '<I-PER/NNP>', '<E-PER/NNP>', '<S-PER/NNP>']:
      oracle.append(4)
  else:
      oracle.append(0)


In [56]:
# Debug view: one line per token with its tag string and oracle code.
for word, tag, code in zip(tokens_pos, pos, oracle):
    print(word, tag, code, sep=" \t ")

Mr 	 <NNP> 	 1
Masum 	 <B-PER/NNP> 	 5
Hasan 	 <E-PER/NNP> 	 5
is 	 <VBZ> 	 0
not 	 <RB> 	 0
a 	 <DT> 	 0
doctor 	 <NN> 	 0
. 	 <.> 	 0
Masum 	 <S-PER/NNP> 	 5
is 	 <VBZ> 	 0
an 	 <DT> 	 0
engineer 	 <NN> 	 0
. 	 <.> 	 0


### Coreference

In [49]:
# Display the spaCy document built in the POS-NER cell.
doc

Mr Masum Hasan is not a doctor. Masum is an engineer.

In [50]:
# Display the coreference clusters that neuralcoref attached to the document.
doc._.coref_clusters

[Mr Masum Hasan: [Mr Masum Hasan, Masum]]

In [54]:
# Token 8 is the second "Masum" when sample input 3 is loaded; the index is specific to that sentence.
doc[8]

Masum

In [53]:
# Check whether token 8 belongs to a coreference cluster (the index is specific to sample input 3).
doc[8]._.in_coref

True

In [55]:
# COREFERENCE RESOLUTION
# Walk the tokens and, for every run of consecutive tokens that neuralcoref
# marks as being in a coreference cluster, map the text of the whole run to
# the person-name tokens found inside it (coref2name[run_text] = name_text).
# Tokens inside a run also get their oracle code incremented by one
# (0 -> 1, 2 -> 3, 4 -> 5), which the encryption cell uses to tell them apart.
# NOTE: `oracle` is modified in place, so running this cell a second time
# without re-running the POS-NER cell shifts the codes again.
coref_stack = []   # tokens of the run currently being read
name_stack = []    # person-name tokens (oracle code 4) inside that run
for i in range(len(doc)):
  token = doc[i]
  if token._.in_coref:
      coref_stack.append(tokens_pos[i])
      if oracle[i] == 4:
          name_stack.append(tokens_pos[i])
      oracle[i] += 1
  else:
      # A token outside any cluster ends the current run.
      if len(name_stack) > 0:
          name = ' '.join(name_stack)
          coref = ' '.join(coref_stack)
          coref2name[coref] = name
          name_stack.clear()
      coref_stack.clear()

# IF THE SENTENCE DOES NOT END WITH A PERIOD OR SPECIAL CHARACTER
# A run that reaches the last token is never closed by the loop, so flush it
# here into the same coref2name table that the loop fills. (This used to write
# to `name2coref`, which is not defined anywhere and raised NameError.)
if len(name_stack) > 0:
  name = ' '.join(name_stack)
  coref = ' '.join(coref_stack)
  coref2name[coref] = name
  name_stack.clear()
coref_stack.clear()

**Assumption: the input text ends with a period or another special (punctuation) token.**

In [70]:
# Build the "encrypted" token list: gendered pronouns become their merged form
# (e.g. "he/she"), person names become PER_<n> keys, honorifics before a name
# become their merged form, and every other token is copied unchanged.
#
# Oracle codes as read by this loop:
#   2 = gendered pronoun outside any coref cluster
#   3 = gendered pronoun inside a coref cluster
#   4 = person-name token outside any coref cluster
#   5 = person-name token inside a coref cluster
#   anything else = copied verbatim
Name2Key = {}      # name -> record, filled by store()
Key2Name = {}      # "PER_<n>" -> canonical record, filled by store()
encrypted = []     # output tokens
num_keys = 0       # number of PER_<n> keys issued so far
i = 0
while i<len(tokens_pos): 
  #print("Oracle ", i, tokens_pos[i])
  if oracle[i] == 2:
      pronoun = tokens_pos[i].lower()
      # Tokens tagged <PRP$> are looked up under the "$"-suffixed entry ("his$", "her$").
      if pos[i] == '<PRP$>':
          pronoun+="$"
      encrypted.append(gender_pronouns_dict[pronoun]["merged"])
  elif oracle[i] == 3:
      # presumably the first mention of the first cluster this token belongs to -- confirm against neuralcoref
      coref = doc[i]._.coref_clusters[0][0].text
      pronoun = tokens_pos[i].lower()
      if pos[i] == '<PRP$>':
          pronoun+="$"
      if coref in coref2name:
        name_found = coref2name[coref]
        # NOTE(review): raises KeyError if this pronoun is reached before the
        # name it refers to has been stored in Name2Key.
        key = Name2Key[name_found]["key"]
        encrypted.append("<|coref|>")
        encrypted.append(gender_pronouns_dict[pronoun]["merged"])
        encrypted.append(key)
      else:
        encrypted.append(gender_pronouns_dict[pronoun]["merged"])

  elif oracle[i] in [4,5]:
      # An honorific directly before the name: replace the token that was just
      # emitted with the "<|hons|>" marker plus the merged honorific.
      # NOTE(review): looks up the popped token, so this assumes the previous
      # token was emitted unchanged.
      if i > 0 and tokens_pos[i-1] in gender_honorific_dict:
        hons = encrypted.pop()
        encrypted.append("<|hons|>")
        encrypted.append(gender_honorific_dict[hons]["merged"])
      if pos[i] == '<S-PER/NNP>':
          name = tokens_pos[i]
      elif pos[i] == '<B-PER/NNP>':
          # Multi-token name: join tokens with spaces until the <E-PER/NNP>
          # token, leaving i on that last token.
          # NOTE(review): no bounds check; a B-PER run without an E-PER token
          # ends in IndexError.
          name = ""
          while True:
              #print(i, oracle[i])
              name += tokens_pos[i]
              if pos[i] == '<E-PER/NNP>':
                  break
              name += " "
              i+=1
      # NOTE(review): a token tagged <I-PER/NNP> or <E-PER/NNP> reached here
      # directly sets no name, so `name` keeps whatever value it had before.

      # After a multi-token name, oracle[i] is the code of the name's last token.
      if oracle[i] == 4:
          key, num_keys = store(name, name, Name2Key, Key2Name, num_keys)
          encrypted.append(key)
      else:
          coref = doc[i]._.coref_clusters[0][0].text
          # NOTE(review): raises KeyError when the mention text is not a key of coref2name.
          name_found = coref2name[coref]
          # A surface form that differs from the resolved name is flagged as an alias.
          if name != name_found:
              encrypted.append("<|alias|>")
            
          key, num_keys = store(name, name_found, Name2Key, Key2Name, num_keys)
          encrypted.append(key)
  else:
    encrypted.append(tokens_pos[i])
  i+=1

encrypted_text = ' '.join(encrypted)

print(tokenized_text, Key2Name, encrypted_text)

1 5
2 5
Mr Masum Hasan is not a doctor . Masum is an engineer . {'PER_1': {'name': 'Masum Hasan', 'key': 'PER_1', 'gender': 'male', 'alias': 'Masum', 'is_alias': False, 'alias_to': None}} <|hons|> Mr/Ms PER_1 is not a doctor . <|alias|> PER_1 is an engineer .


In [31]:
# Display the token list produced by the encryption loop.
encrypted

['PER_1',
 'told',
 'newsmen',
 'that',
 "'",
 'BNP',
 'is',
 'a',
 'massive',
 'party',
 'with',
 'huge',
 'popularity',
 'and',
 'public',
 'acceptance',
 '.',
 "'"]

In [65]:
# Display the name -> record table filled by store().
Name2Key

{'Masum Hasan': {'name': 'Masum Hasan',
  'key': 'PER_1',
  'gender': 'Male',
  'alias': 'Masum',
  'is_alias': False,
  'alias_to': None},
 'Masum': {'name': 'Masum', 'is_alias': True, 'alias_to': 'Masum Hasan'}}

In [66]:
# Display the PER_<n> key -> record table filled by store().
Key2Name

{'PER_1': {'name': 'Masum Hasan',
  'key': 'PER_1',
  'gender': 'Male',
  'alias': 'Masum',
  'is_alias': False,
  'alias_to': None}}

In [71]:
# NOTE(review): bare dict literal that is only displayed, not assigned --
# presumably a sketch of a reduced record format; confirm or remove.
{'name': 'Masum Hasan', 'gender': 'male', 'alias': 'Masum'}

{'name': 'Masum Hasan', 'gender': 'male', 'alias': 'Masum'}