### Imports

In [84]:
from keras.models import load_model

Using TensorFlow backend.


In [151]:
import spacy, nltk
import pandas as pd
import en_core_web_lg
nlp = en_core_web_lg.load()

In [152]:
from spacy.pipeline import EntityRecognizer
ner = EntityRecognizer(nlp.vocab)

In [85]:
import json

with open('char2idx.json', 'r') as fp:
    char2idx = json.load(fp)
    
with open('idx2char.json', 'r') as fp:
    idx2char = json.load(fp)

In [86]:
model = load_model('char_rnn_hsc_model_0.h5')

W1108 15:56:03.132026 20132 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1108 15:56:03.381767 20132 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1108 15:56:03.416742 20132 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1108 15:56:03.586371 20132 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. P

### Methods

In [108]:
import numpy as np
from keras.preprocessing import sequence

# Converts a name into vector
def name2vectorTest(name):
    name = name.lower()
    new_name = ""
    for char in name:
      if char in char2idx:
        new_name += char
    chars = list(new_name)
    vector = [ char2idx[c] for c in chars ]
    return np.array(vector)

# Converts names to fixed size tensor
def names2tensorTest(names, maxlen=25):
    namelist = [name2vectorTest(name) for name in names]
    return sequence.pad_sequences(np.array(namelist), maxlen=maxlen)  # root of all troubles

def name2gender(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  if result:
    return "male"
  else:
    return "female"
  
def isMale(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  return result

In [212]:
NER_FAR_VAL = 99999

def has_ner_gender(doc):
    if not doc.ents:
        return (NER_FAR_VAL, 0, 0)
    
    per_ents = set()
    entity_fist_position = NER_FAR_VAL
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            per_ents.add(ent.text)
            entity_fist_position = ent.start

    if not per_ents:
        return (NER_FAR_VAL, 0, 0)
    
    male_mentions = 0
    female_mentions = 0
    for per in per_ents:
        male = isMale(per)
        if male:
            male_mentions+=1
        else:
            female_mentions+=1
    return (entity_fist_position, male_mentions, female_mentions)

In [213]:
POS_FAR_VAL = 9999
male_pronoun_list = ["he", "his", "him", "himself"]
female_pronoun_list = ["she", "her", "her","herself"]
def pron_pos(doc):
    pron_position = POS_FAR_VAL
    male_mention = 0
    female_mention = 0
    for token in doc:
        if token.tag_ in ['PRP', 'PRP$'] and token.text.lower() in male_pronoun_list+female_pronoun_list:
            if pron_position == POS_FAR_VAL:
                pron_position = token.idx
            if token.text.lower() in male_pronoun_list:
                male_mention += 1
            elif token.text.lower() in female_pronoun_list:
                female_mention += 1
    return (pron_position, male_mention, female_mention)

In [214]:
pron_pos(doc)

(9999, 0, 0)

In [215]:
doc[1].idx

1

In [216]:
doc

“Happy birthday to the most beautiful girl in the world.

## Parsing Gender Associated Sentences

In [248]:
article = """Tanvi Jhumur, second wife of lawmaker Moazzem Hossain Ratan of Sunamganj-1, has been terminated from the post of assistant teacher of Teghoria Government Primary School of Sunamganj Sadar upazila.

The directorate of primary education (DPE) has sacked her from the job after allegations of drawing regular salary and other allowances without teaching for months ware found true against her.

District primary education officer (DPEO) of Sunamganj Md Jillur Rahman confirmed this to media around 3:30pm on Friday.
He said, “Jhumur has been terminated from her job as per the directives of the director general of the DPE.”

A letter with recommendation for departmental actions against the sacked assistant teacher has already been sent to the primary and mass education ministry, Jillur added.

Earlier on the same day, several media reports claimed that Tanvi Jhumur has been absent from her workplace without getting leave for last 10 months but she is drawing her monthly salary on regular basis.

Jhumur took a one-day leave from her office due to illness 10 months ago but she has remained absent from the office since then, according to Teghoria Govt Primary School authorities.

On information, Jhumur is currently living with her husband in a flat of NAM Bhaban on Manik Mia Avenue in the capital.
"""

In [260]:
def get_gender_sents(article, MAX_TOKEN_NUM):
    sent_list = nltk.sent_tokenize(article)
    corpora = []
    sent_stack = []
    stack_count = 0
    male_mention = female_mention = 0

    for sent in sent_list:
        sent = ' '.join(sent.split())
        doc = nlp(sent)
        ner_position, male_mention1, female_mention1 = has_ner_gender(doc)
        pron_position, male_mention2, female_mention2 = pron_pos(doc)

        #print(doc, ner_position, pron_position)
        
        if not sent_stack:
            if ner_position < pron_position and len(doc) <= MAX_TOKEN_NUM:
                sent_stack.append(sent)
                stack_count += len(doc)
                male_mention += male_mention1+male_mention2
                female_mention += female_mention1+female_mention2
        else:
            if (pron_position != POS_FAR_VAL or ner_position != NER_FAR_VAL) and (stack_count+len(doc) <= MAX_TOKEN_NUM):
                sent_stack.append(sent)
                stack_count += len(doc)
                male_mention += male_mention1+male_mention2
                female_mention += female_mention1+female_mention2
            else:
                corpus = ' '.join(sent_stack)
                corpora.append((corpus, male_mention, female_mention))
                sent_stack.clear()
                stack_count = 0
                male_mention = female_mention = 0
                if ner_position < pron_position and len(doc) <= MAX_TOKEN_NUM:
                    sent_stack.append(sent)
                    stack_count += len(doc)
                    male_mention += male_mention1+male_mention2
                    female_mention += female_mention1+female_mention2
    return corpora

## Read & Write Data

In [271]:
df2 = pd.DataFrame(corpora, columns = ['text', 'male_mention', 'female_mention']) 
df2

Unnamed: 0,text,male_mention,female_mention
0,"Tanvi Jhumur, second wife of lawmaker Moazzem ...",1,1
1,District primary education officer (DPEO) of S...,2,0
2,"Earlier on the same day, several media reports...",0,4
3,Jhumur took a one-day leave from her office du...,0,3


In [269]:
df = pd.DataFrame(columns = ['text', 'male_mention', 'female_mention']) 

In [274]:
df = pd.concat([df2,df])

In [275]:
df

Unnamed: 0,text,male_mention,female_mention
0,"Tanvi Jhumur, second wife of lawmaker Moazzem ...",1,1
1,District primary education officer (DPEO) of S...,2,0
2,"Earlier on the same day, several media reports...",0,4
3,Jhumur took a one-day leave from her office du...,0,3
