# Gender Encryption

### Installation

https://github.com/zalandoresearch/flair

https://github.com/huggingface/neuralcoref

In [None]:
#!pip install flair
#!pip uninstall spacy
#!pip install -U spacy==2.1.0
##!python -m spacy download en
#!python -m spacy download en_core_web_lg
##!python -m spacy download en_core_web_md
#!pip install neuralcoref --no-binary neuralcoref

### Creating Resources

In [1]:
male_pronouns   = ["he",  "him", "his$", "his", "himself"]
female_pronouns = ["she", "her", "her$", "hers", "herself"]
neutral_pronouns= ["zie", "zim", "zir", "zis", "zieself"]
merged_pronouns = ["he/she", "him/her", "his/her", "his/hers", "himself/herself"]

In [2]:
gender_pronouns_dict = {}
gender_honorific_dict = {}

In [3]:
for (g1,g2,g3,g4) in zip(male_pronouns, female_pronouns, neutral_pronouns, merged_pronouns):
    element = {"male": g1, "female":g2, "neutral":g3, "merged":g4}
    gender_pronouns_dict[g1] = gender_pronouns_dict[g2] = gender_pronouns_dict[g3] = gender_pronouns_dict[g4] =element

In [4]:
male_hons   =  ["Mr.", "Mr", "Md.", "Md", "Sir", "Lord", "Mister"]
female_hons =  ["Ms.", "Ms", "Mst.", "Mst", "Madam", "Lady", "Miss"]
neutral_hons = ["Mx.", "Mx", "Mx.", "Mx", "Sir/Madam", "Lord/Lady", "Mister/Miss"]
married_hons = ["Mrs.", "Mrs", "Mst.", "Mst", "Madam", "Lady", "Mis'ess"]
merged_hons =  ["Mr./Ms.", "Mr/Ms", "Md./Mst.", "Md/Mst", "Sir/Madam", "Lord/Lady", "Mister/Miss"]

In [5]:
for (h1,h2,h3,h4,h5) in zip(male_hons, female_hons, neutral_hons, married_hons, merged_hons):
    element = {"male": h1, "female":h2, "neutral":h3, "married_fem":h4, "merged":h5}
    gender_honorific_dict[h1] = gender_honorific_dict[h2] = gender_honorific_dict[h3] = \
    gender_honorific_dict[h4] = gender_honorific_dict[h5] = element

### Imports

In [6]:
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single

In [7]:
from keras.models import load_model

Using TensorFlow backend.


In [8]:
import spacy
import neuralcoref

### Loading Essentials

In [9]:
import json

with open('char2idx.json', 'r') as fp:
    char2idx = json.load(fp)
    
with open('idx2char.json', 'r') as fp:
    idx2char = json.load(fp)

In [10]:
model = load_model('char_rnn_hsc_model_0.h5')

2019-11-02 23:14:32,201 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

2019-11-02 23:14:32,258 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

2019-11-02 23:14:32,278 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

2019-11-02 23:14:32,407 From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

2019-11-02 23:14:32,415 From c:\users\hp\appdata\local\programs\python

In [11]:
nlp = spacy.load('en_core_web_lg') # en_core_web_sm #en_core_web_md
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x1ec7111dbe0>

In [12]:
tagger_ner = SequenceTagger.load('ner')
tagger_pos = SequenceTagger.load('pos')

2019-11-02 23:14:51,085 loading file C:\Users\Hp\.flair\models\en-ner-conll03-v0.4.pt
2019-11-02 23:15:12,914 loading file C:\Users\Hp\.flair\models\en-pos-ontonotes-v0.2.pt


### Name2Gender

In [13]:
import numpy as np
from keras.preprocessing import sequence

# Converts a name into vector
def name2vectorTest(name):
    name = name.lower()
    new_name = ""
    for char in name:
      if char in char2idx:
        new_name += char
    chars = list(new_name)
    vector = [ char2idx[c] for c in chars ]
    return np.array(vector)

# Converts names to fixed size tensor
def names2tensorTest(names, maxlen=25):
    namelist = [name2vectorTest(name) for name in names]
    return sequence.pad_sequences(np.array(namelist), maxlen=maxlen)  # root of all troubles

def name2gender(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  if result:
    print("Male")
  else:
    print("Female")
  
def isMale(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  return result

### Input Sentence

In [46]:
s = """At the same time, in front of a crowd in northeast Mississippi, Trump was mocking the Democratic Party as "completely insane" -- and mocking several of its candidates. He called former Vice President Joe Biden "very slow sleepy Joe." He said he doesn't know who Hawaii Rep. Tulsi Gabbard is. And he said former Texas Rep. Beto O'Rourke, who dropped out of the race Friday, 'quit like a dog.'"""

In [47]:
s = ' '.join(s.split())

### POS-NER

In [48]:
tokens_pos = []
pos = []
oracle = []
coref2name = {}

In [49]:
sent = Sentence(s, use_tokenizer=True)
tagger_ner.predict(sent)
tagger_pos.predict(sent)
tagged_list = sent.to_tagged_string().split()

In [50]:
for i in range(0,len(tagged_list),2):
    tokens_pos.append(tagged_list[i])
    pos.append(tagged_list[i+1])
    
    if tagged_list[i].lower() in male_pronouns+female_pronouns:
        oracle.append(2)
    elif tagged_list[i+1] in ['<B-PER/NNP>', '<I-PER/NNP>', '<E-PER/NNP>', '<S-PER/NNP>']:
        oracle.append(4)
    else:
        oracle.append(0)

### Coreference

In [51]:
doc = nlp(s)

In [52]:
assert len(doc)==len(tokens_pos)

AssertionError: 

In [59]:
spacy_tokens = [token.text for token in doc]

In [64]:
spacy_tokens

['At',
 'the',
 'same',
 'time',
 ',',
 'in',
 'front',
 'of',
 'a',
 'crowd',
 'in',
 'northeast',
 'Mississippi',
 ',',
 'Trump',
 'was',
 'mocking',
 'the',
 'Democratic',
 'Party',
 'as',
 '"',
 'completely',
 'insane',
 '"',
 '--',
 'and',
 'mocking',
 'several',
 'of',
 'its',
 'candidates',
 '.',
 'He',
 'called',
 'former',
 'Vice',
 'President',
 'Joe',
 'Biden',
 '"',
 'very',
 'slow',
 'sleepy',
 'Joe',
 '.',
 '"',
 'He',
 'said',
 'he',
 'does',
 "n't",
 'know',
 'who',
 'Hawaii',
 'Rep.',
 'Tulsi',
 'Gabbard',
 'is',
 '.',
 'And',
 'he',
 'said',
 'former',
 'Texas',
 'Rep.',
 'Beto',
 "O'Rourke",
 ',',
 'who',
 'dropped',
 'out',
 'of',
 'the',
 'race',
 'Friday',
 ',',
 "'",
 'quit',
 'like',
 'a',
 'dog',
 '.',
 "'"]

In [61]:
tokens_pos

['At',
 'the',
 'same',
 'time',
 ',',
 'in',
 'front',
 'of',
 'a',
 'crowd',
 'in',
 'northeast',
 'Mississippi',
 ',',
 'Trump',
 'was',
 'mocking',
 'the',
 'Democratic',
 'Party',
 'as',
 '"',
 'completely',
 'insane',
 '"',
 '--',
 'and',
 'mocking',
 'several',
 'of',
 'its',
 'candidates',
 '.',
 'He',
 'called',
 'former',
 'Vice',
 'President',
 'Joe',
 'Biden',
 '"',
 'very',
 'slow',
 'sleepy',
 'Joe',
 '.',
 '"',
 'He',
 'said',
 'he',
 'does',
 "n't",
 'know',
 'who',
 'Hawaii',
 'Rep',
 '.',
 'Tulsi',
 'Gabbard',
 'is',
 '.',
 'And',
 'he',
 'said',
 'former',
 'Texas',
 'Rep',
 '.',
 'Beto',
 "O'Rourke",
 ',',
 'who',
 'dropped',
 'out',
 'of',
 'the',
 'race',
 'Friday',
 ',',
 "'",
 'quit',
 'like',
 'a',
 'dog',
 '.',
 "'"]

In [57]:
doc.text

'At the same time, in front of a crowd in northeast Mississippi, Trump was mocking the Democratic Party as "completely insane" -- and mocking several of its candidates. He called former Vice President Joe Biden "very slow sleepy Joe." He said he doesn\'t know who Hawaii Rep. Tulsi Gabbard is. And he said former Texas Rep. Beto O\'Rourke, who dropped out of the race Friday, \'quit like a dog.\''

**Assumption: The sentence will end with period or any other special token**

In [None]:
coref_stack = []
name_stack = []
for i in range(len(doc)):
    token = doc[i]
    if token._.in_coref:
        coref_stack.append(tokens_pos[i])
        if oracle[i] == 4:
            name_stack.append(tokens_pos[i])
        oracle[i] += 1
    else:
        if len(name_stack) > 0:
            name = ' '.join(name_stack)
            coref = ' '.join(coref_stack)
            #name2coref[name] = coref
            coref2name[coref] = name
            name_stack.clear()
        coref_stack.clear()

# IF THE SENTENCE DOES NOT END WITH A PERIOD OR SPECIAL CHARACTER
if len(name_stack) > 0:
    name = ' '.join(name_stack)
    name2coref[name] = ' '.join(coref_stack)
    name_stack.clear()
coref_stack.clear()

In [None]:
def store(name, name_found):
    if name_found in Name2Key:
        if name not in Name2Key:
            element = {"name": name, "is_alias": True, "alias_to": name_found}
    else:
        global num_keys
        num_keys+=1
        key = "PER_"+str(num_keys)
        gender = name2gender(name)
        alias = None
        
        if name!=name_found:
            element_alias = {"name": name, "is_alias": True, "alias_to": name_found}
            Name2Key[name] = element_alias
            alias = name
        
        element = {"name": name, "key": key, "gender":gender, "alias":alias, "is_alias": False, "alias_to": None}
        Name2Key[name_found] = element
        Key2Name[key] = element
        
    return Name2Key[name_found]["key"]

In [None]:
Name2Key = {}
Key2Name = {}
encrypted = []
num_keys = 0

i = 0
while i<len(tokens_pos): 
    #print("Oracle ", i, tokens_pos[i])
    if oracle[i] == 0:
        encrypted.append(tokens_pos[i])
    elif oracle[i] == 1:
        if tokens_pos[i] in gender_honorific_dict:
            encrypted.append("<|hons|>")
            encrypted.append(gender_honorific_dict[tokens_pos[i]]["merged"])
        else:
            encrypted.append(tokens_pos[i])
    elif oracle[i] == 2:
        pronoun = tokens_pos[i].lower()
        if pos[i] == '<PRP$>':
            pronoun+="$"
        encrypted.append(gender_pronouns_dict[pronoun]["merged"])
    elif oracle[i] == 3:
        coref = doc[i]._.coref_clusters[0][0].text
        name_found = coref2name[coref]
        key = Name2Key[name_found]["key"]
        encrypted.append("<|coref|>")
        pronoun = tokens_pos[i].lower()
        if pos[i] == '<PRP$>':
            pronoun+="$"
        encrypted.append(gender_pronouns_dict[pronoun]["merged"])
        encrypted.append(key)
        
    elif oracle[i] in [4,5]:
        if pos[i] == '<S-PER/NNP>':
            name = tokens_pos[i]
        else:
            name = ""
            while True:
                name += tokens_pos[i]
                if pos[i] == '<E-PER/NNP>':
                    break
                name += " "
                i+=1
        
        if oracle[i] == 4:
            key = store(name, name)
            encrypted.append(key)
        else:
            coref = doc[i]._.coref_clusters[0][0].text
            name_found = coref2name[coref]
            if name == name_found:
                key = store(name, name_found)
                encrypted.append(key)
            else:
                key = store(name, name_found)
                encrypted.append("<|alias|>")
                encrypted.append(key)
    i+=1

In [None]:
encrypted

In [None]:
Name2Key

In [53]:
Key2Name

{'PER_1': {'name': 'Donald Trump',
  'key': 'PER_1',
  'gender': None,
  'alias': None,
  'is_alias': False,
  'alias_to': None}}