### Imports

In [2]:
from keras.models import load_model

Using TensorFlow backend.


In [3]:
import spacy, nltk
import pandas as pd
import en_core_web_lg
nlp = en_core_web_lg.load()

In [4]:
from spacy.pipeline import EntityRecognizer
ner = EntityRecognizer(nlp.vocab)

In [5]:
import json

with open('char2idx.json', 'r') as fp:
    char2idx = json.load(fp)
    
with open('idx2char.json', 'r') as fp:
    idx2char = json.load(fp)

In [6]:
model = load_model('char_rnn_hsc_model_0.h5')

W1109 05:08:54.082749 11812 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1109 05:08:54.230531 11812 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1109 05:08:54.269160 11812 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1109 05:08:54.460735 11812 deprecation_wrapper.py:119] From c:\users\hp\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. P

### Methods

In [7]:
import numpy as np
from keras.preprocessing import sequence

# Converts a name into vector
def name2vectorTest(name):
    name = name.lower()
    new_name = ""
    for char in name:
      if char in char2idx:
        new_name += char
    chars = list(new_name)
    vector = [ char2idx[c] for c in chars ]
    return np.array(vector)

# Converts names to fixed size tensor
def names2tensorTest(names, maxlen=25):
    namelist = [name2vectorTest(name) for name in names]
    return sequence.pad_sequences(np.array(namelist), maxlen=maxlen)  # root of all troubles

def name2gender(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  if result:
    return "male"
  else:
    return "female"
  
def isMale(name):
  result = model.predict_classes(np.array(names2tensorTest([name.lower()])))[0][0]
  return result

In [8]:
NER_FAR_VAL = 99999

def has_ner_gender(doc):
    if not doc.ents:
        return (NER_FAR_VAL, 0, 0)
    
    per_ents = set()
    entity_fist_position = NER_FAR_VAL
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            per_ents.add(ent.text)
            entity_fist_position = ent.start

    if not per_ents:
        return (NER_FAR_VAL, 0, 0)
    
    male_mentions = 0
    female_mentions = 0
    for per in per_ents:
        male = isMale(per)
        if male:
            male_mentions+=1
        else:
            female_mentions+=1
    return (entity_fist_position, male_mentions, female_mentions)

In [9]:
POS_FAR_VAL = 9999
male_pronoun_list = ["he", "his", "him", "himself"]
female_pronoun_list = ["she", "her", "her","herself"]
def pron_pos(doc):
    pron_position = POS_FAR_VAL
    male_mention = 0
    female_mention = 0
    for token in doc:
        if token.tag_ in ['PRP', 'PRP$'] and token.text.lower() in male_pronoun_list+female_pronoun_list:
            if pron_position == POS_FAR_VAL:
                pron_position = token.idx
            if token.text.lower() in male_pronoun_list:
                male_mention += 1
            elif token.text.lower() in female_pronoun_list:
                female_mention += 1
    return (pron_position, male_mention, female_mention)

## Parsing Gender Associated Sentences

In [11]:
def get_gender_sents(article, MAX_TOKEN_NUM):
    sent_list = nltk.sent_tokenize(article)
    corpora = []
    sent_stack = []
    stack_count = 0
    male_mention = female_mention = 0

    for sent in sent_list:
        sent = ' '.join(sent.split())
        doc = nlp(sent)
        ner_position, male_mention1, female_mention1 = has_ner_gender(doc)
        pron_position, male_mention2, female_mention2 = pron_pos(doc)

        #print(doc, ner_position, pron_position)
        
        if not sent_stack:
            if ner_position < pron_position and len(doc) <= MAX_TOKEN_NUM:
                sent_stack.append(sent)
                stack_count += len(doc)
                male_mention += male_mention1+male_mention2
                female_mention += female_mention1+female_mention2
        else:
            if (pron_position != POS_FAR_VAL or ner_position != NER_FAR_VAL) and (stack_count+len(doc) <= MAX_TOKEN_NUM):
                sent_stack.append(sent)
                stack_count += len(doc)
                male_mention += male_mention1+male_mention2
                female_mention += female_mention1+female_mention2
            else:
                corpus = ' '.join(sent_stack)
                corpora.append((corpus, male_mention, female_mention))
                sent_stack.clear()
                stack_count = 0
                male_mention = female_mention = 0
                if ner_position < pron_position and len(doc) <= MAX_TOKEN_NUM:
                    sent_stack.append(sent)
                    stack_count += len(doc)
                    male_mention += male_mention1+male_mention2
                    female_mention += female_mention1+female_mention2
    return corpora

## Read & Write Data

In [279]:
df = pd.DataFrame(columns = ['text', 'male_mention', 'female_mention']) 

In [276]:
article = """Tanvi Jhumur, second wife of lawmaker Moazzem Hossain Ratan of Sunamganj-1, has been terminated from the post of assistant teacher of Teghoria Government Primary School of Sunamganj Sadar upazila.

The directorate of primary education (DPE) has sacked her from the job after allegations of drawing regular salary and other allowances without teaching for months ware found true against her.

District primary education officer (DPEO) of Sunamganj Md Jillur Rahman confirmed this to media around 3:30pm on Friday.
He said, “Jhumur has been terminated from her job as per the directives of the director general of the DPE.”

A letter with recommendation for departmental actions against the sacked assistant teacher has already been sent to the primary and mass education ministry, Jillur added.

Earlier on the same day, several media reports claimed that Tanvi Jhumur has been absent from her workplace without getting leave for last 10 months but she is drawing her monthly salary on regular basis.

Jhumur took a one-day leave from her office due to illness 10 months ago but she has remained absent from the office since then, according to Teghoria Govt Primary School authorities.

On information, Jhumur is currently living with her husband in a flat of NAM Bhaban on Manik Mia Avenue in the capital.
"""

In [277]:
corpora = get_gender_sents(article, 40)

In [281]:
df2 = pd.DataFrame(corpora, columns = ['text', 'male_mention', 'female_mention']) 
df2

Unnamed: 0,text,male_mention,female_mention
0,"Tanvi Jhumur, second wife of lawmaker Moazzem ...",1,1
1,District primary education officer (DPEO) of S...,2,0
2,"Earlier on the same day, several media reports...",0,4
3,Jhumur took a one-day leave from her office du...,0,3


In [282]:
df = pd.concat([df2,df])

In [1]:
df

NameError: name 'df' is not defined

## Reading the Dataset

452406 Articles total

1.5M sentences approximately

In [12]:
input_path = "C:/Bangla NMT/tokenized_data/gender_sentences.en.txt"

In [13]:
output_path = "C:/Bangla NMT/tokenized_data/gender_only_sentences.en.txt"

In [24]:
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
      for line in open(self.filename, encoding="utf-8"):
        if line != "<sep>\n":
          yield line

In [25]:
articles = MySentences(input_path)

In [26]:
fout = open(output_path, 'a+', encoding="utf-8")

In [27]:
for x in articles:
    print(x)
    break

The Ant-Corruption Commission (ACC) has summoned 312 lower grades employees of the Mongla Sea Port on allegation of pursuing irregularities in recruitment. These employees submitted fake academic credentials or certificates on quota and hid there ages during the appointment. Ninety percent of the employees work as cook in marine vessels owned by the port authorities and as crane helper at the port’s machinery and power department. Speaking to Bangla Tribune, ACC Khulna office Deputy Director Neel Kamol Pal said, “The employees who got job in the port showing false documents and hiding ages have been summoned for the sake of investigation.” A source at the port’s personnel department told Bangla Tribune, “A total of 312 employs were recruited in 2013 and 2014. “The graft watchdog summoned 28 employees in July this year to retake the job test amid allegation of being incapable for the post surfaced against a majority of the staff,” the sources added. On Thursday (Jul 2), the ACC quizzed 

Saved till 88000

In [37]:
import time
i = 0
df = pd.DataFrame(columns = ['text', 'male_mention', 'female_mention']) 
start = time.time()
for article in articles:
    corpora = get_gender_sents(article, 40)
    df2 = pd.DataFrame(corpora, columns = ['text', 'male_mention', 'female_mention'])
    df = pd.concat([df2,df])
    if i % 1000 == 0:
        df.to_csv("gender_sentences.csv", index=False)
        end = time.time()
        print("Saving...", i, "Num sent: ", len(df), "time:", end-start)
    if i >= 100000:
        break
    #print(i, "Num sent: ", len(corpora), "time:", end-start)
    i+=1

Saving... 0 Num sent:  0 time: 0.13065457344055176
Saving... 1000 Num sent:  3143 time: 182.56134462356567
Saving... 2000 Num sent:  6687 time: 387.2574963569641
Saving... 3000 Num sent:  10139 time: 588.3058552742004
Saving... 4000 Num sent:  13651 time: 787.3686850070953
Saving... 5000 Num sent:  17185 time: 997.0548481941223
Saving... 6000 Num sent:  20849 time: 1225.499365568161
Saving... 7000 Num sent:  24453 time: 1444.9454538822174
Saving... 8000 Num sent:  28092 time: 1670.1265094280243
Saving... 9000 Num sent:  31350 time: 1893.5275020599365
Saving... 10000 Num sent:  34902 time: 2140.434942960739
Saving... 11000 Num sent:  38050 time: 2377.675488471985
Saving... 12000 Num sent:  41423 time: 2619.143359184265
Saving... 13000 Num sent:  44948 time: 2872.474336862564
Saving... 14000 Num sent:  47681 time: 3095.8759858608246
Saving... 15000 Num sent:  50841 time: 3309.9302082061768
Saving... 16000 Num sent:  53585 time: 3495.768751144409
Saving... 17000 Num sent:  58165 time: 378

KeyboardInterrupt: 

In [36]:
len(df)

241

In [33]:
next(articles, 1000)

TypeError: 'MySentences' object is not an iterator

## DEBIASING