In [1]:
import json
import logging
import os
import pandas as pd
from tqdm import tqdm
from nltk import ngrams
import multiprocessing as mp
from math import log

In [2]:
#func
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    str
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises
    # (for example on a decoding error).
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()


def getUpper(word):
    """Return `word` with its first character upper-cased.

    The rest of the string is left untouched. An empty string is returned
    unchanged instead of raising IndexError.
    """
    # Slicing (word[:1]) is safe on an empty string, unlike word[0].
    return word[:1].upper() + word[1:]


# gender pair list taken from GN-Glove
# Each file is read whole and split on '\n', giving one list entry per line
# (a trailing newline in the file yields a final empty-string entry).
# get_cooccurrences and getGenderCounts test tokens for membership in these
# lists. Both files are resolved relative to the current working directory.
DEFAULT_MALE_NOUNS = load_doc('male_word_file.txt').split('\n')
DEFAULT_FEMALE_NOUNS = load_doc('female_word_file.txt').split('\n')


def gender_ratios_m_f(data, maleCount, femaleCount, min_count=None,
                      target_words=None):
    """Score the gender skew of target words from co-occurrence counts.

    Parameters
    ----------
    data : dict
        Maps each word to ``{'m': int, 'f': int}`` co-occurrence counts,
        as produced by ``get_cooccurrences``.
    maleCount, femaleCount : int
        Number of male / female gendered tokens in the document; used to
        normalise the counts for the conditional score.
    min_count : int, optional
        A word is scored only when ``m + f`` is strictly greater than this.
        Defaults to the notebook-level global ``MinCount``.
    target_words : iterable of str, optional
        Words to score. Defaults to the notebook-level global
        ``occupations``.

    Returns
    -------
    scoresP : list of float
        ``abs(log((m + 1) / (f + 1)))`` for each scored word.
    scoresPC : list of float
        ``abs(log((pm + 1e-5) / (pf + 1e-5)))`` where ``pm`` and ``pf`` are
        the counts divided by ``maleCount`` / ``femaleCount``.
    bias_record : dict
        Maps each scored word to the signed ``log((m + 1) / (f + 1))``.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # behave exactly as before.
    if min_count is None:
        min_count = MinCount
    if target_words is None:
        target_words = occupations
    # Built once per call so each membership test is a hash lookup.
    targets = set(target_words)

    scoresP = []
    scoresPC = []
    bias_record = {}
    for word, counts in data.items():
        male_hits = counts['m']
        female_hits = counts['f']
        if male_hits + female_hits <= min_count or word not in targets:
            continue

        # Add-one smoothed log ratio; the sign says which gender dominates.
        log_ratio = log((male_hits + 1) / (female_hits + 1))
        scoresP.append(abs(log_ratio))

        # The small epsilons guard against division by zero and log(0).
        pm = male_hits / (maleCount + 0.00001)
        pf = female_hits / (femaleCount + 0.00001)
        scoresPC.append(abs(log((pm + 0.00001) / (pf + 0.00001))))

        bias_record[word] = log_ratio
    return scoresP, scoresPC, bias_record


# Taken from jtcrammer repo
def get_cooccurrences(file, data, window):
    """Accumulate gendered co-occurrence counts for every token in a file.

    The file is split on whitespace and scanned with a sliding window of
    `window` tokens. Every token seen in any window gets an entry
    ``{"m": 0, "f": 0}`` in `data` if it has none. When the window's focus
    token is in DEFAULT_MALE_NOUNS, every token in that window (the focus
    token included) has its ``'m'`` count incremented; likewise ``'f'``
    for DEFAULT_FEMALE_NOUNS.

    Parameters
    ----------
    file : str or path-like
        UTF-8 text file to scan.
    data : dict
        Word -> ``{"m": int, "f": int}`` mapping; updated in place.
    window : int
        Number of tokens per window.

    Returns
    -------
    dict
        The same `data` object that was passed in.
    """
    with open(file, 'r', encoding='utf-8') as fp:
        tokens = fp.read().split()

    # Sets built once per call: each focus-token check is a hash lookup
    # instead of a scan of the whole noun list.
    male_nouns = set(DEFAULT_MALE_NOUNS)
    female_nouns = set(DEFAULT_FEMALE_NOUNS)

    # The original loop used a position counter that started at 1 and was
    # incremented before being compared with int((window + 1) / 2), so the
    # token at 0-based index i carried position i + 2. The focus token is
    # therefore at the index below (3 for window=10), not the exact middle.
    # NOTE(review): kept as-is to preserve existing results — confirm
    # whether the off-centre focus is intended.
    focus_index = int((window + 1) / 2) - 2

    for gram in ngrams(tokens, window):
        # Register every token of the window, in window order.
        for token in gram:
            if token not in data:
                data[token] = {"m": 0, "f": 0}

        # For very small windows the original counter never matched.
        if focus_index < 0:
            continue

        focus = gram[focus_index]
        if focus in male_nouns:
            for token in gram:
                data[token]['m'] += 1
        if focus in female_nouns:
            for token in gram:
                data[token]['f'] += 1
    return data


def getGenderCounts(file):
    """Count male and female gendered tokens in a text file.

    The file is read as UTF-8 and split on whitespace; a token counts as
    male if it appears in DEFAULT_MALE_NOUNS and as female if it appears in
    DEFAULT_FEMALE_NOUNS (a token present in both lists counts for both).

    Returns
    -------
    tuple of (int, int)
        ``(maleCount, femaleCount)``.
    """
    with open(file, 'r', encoding='utf-8') as fp:
        tokens = fp.read().split()

    # Sets make each membership test a hash lookup rather than a scan of
    # the whole noun list for every token.
    male_nouns = set(DEFAULT_MALE_NOUNS)
    female_nouns = set(DEFAULT_FEMALE_NOUNS)

    maleCount = sum(1 for w in tokens if w in male_nouns)
    femaleCount = sum(1 for w in tokens if w in female_nouns)
    return maleCount, femaleCount

In [3]:
#occu
# Window width (in tokens) passed to get_cooccurrences.
window = 10
# Read as a global by gender_ratios_m_f: a word is scored only when its
# combined male + female co-occurrence count exceeds this value.
MinCount = 10
txtlengths = [12000, 14000, 16000, 18000]

# Read as a global by gender_ratios_m_f: the target words to score.
# Capitalised variants are appended so sentence-initial forms also match.
occupations = load_doc('occupations.txt').split('\n')
occupations += [getUpper(w) for w in occupations if len(w) > 0]

for txtlength in txtlengths:
    # One record per document. Rows are collected in a list and turned into
    # a DataFrame once: DataFrame.append copied the whole frame on every
    # call and was removed in pandas 2.0.
    rows = []
    data_dir = f"./5000-{txtlength}"
    print('\nPresent length:{}'.format(txtlength))

    # gender label written to the CSV: 0 = female directory, 1 = male.
    for gender in [0, 1]:
        if gender == 0:
            txtDir = f'{data_dir}/female/'
        else:
            txtDir = f'{data_dir}/male/'

        print("Processing gender: {}".format(txtDir))
        blogs_gender = os.listdir(txtDir)
        print("Files:", len(blogs_gender))
        for blog_name in tqdm(blogs_gender):
            txt_path = txtDir + blog_name
            # Fresh dict per document so counts never leak between files.
            data = get_cooccurrences(txt_path, {}, window)
            maleCount, femaleCount = getGenderCounts(txt_path)
            scores_p, scores_pc, scores_dict = gender_ratios_m_f(
                data, maleCount, femaleCount)

            rows.append({
                # The epsilon keeps the ratio defined when no gendered
                # token occurs in the document.
                'word ratio': (maleCount + 0.000001) / (
                    maleCount + femaleCount + 0.000001),
                'male': maleCount + 0.000001,
                'female': femaleCount + 0.000001,
                # Mean score; max(..., 1) avoids dividing by zero when no
                # target word passed the MinCount threshold.
                'bias': sum(scores_p) / max(len(scores_p), 1),
                'biasPC': sum(scores_pc) / max(len(scores_pc), 1),
                'scores_dict': scores_dict,
                'gender': gender,
            })

    df_per_txt = pd.DataFrame(rows)
    df_per_txt.to_csv(f'5000-{txtlength}_genderbias-occu.csv', index=False)

  0%|          | 1/2251 [00:00<05:13,  7.17it/s]


Present length:12000
Processing gender: ./5000-12000/female/
Files: 2251


100%|██████████| 2251/2251 [05:15<00:00,  6.93it/s]
  0%|          | 1/2140 [00:00<05:26,  6.55it/s]

Processing gender: ./5000-12000/male/
Files: 2140


100%|██████████| 2140/2140 [04:44<00:00,  7.63it/s]
  0%|          | 1/2213 [00:00<05:50,  6.30it/s]


Present length:14000
Processing gender: ./5000-14000/female/
Files: 2213


100%|██████████| 2213/2213 [05:30<00:00,  6.77it/s]
  0%|          | 1/2083 [00:00<05:11,  6.69it/s]

Processing gender: ./5000-14000/male/
Files: 2083


100%|██████████| 2083/2083 [05:10<00:00,  6.65it/s]
  0%|          | 1/2179 [00:00<06:02,  6.00it/s]


Present length:16000
Processing gender: ./5000-16000/female/
Files: 2179


100%|██████████| 2179/2179 [06:06<00:00,  5.92it/s]
  0%|          | 1/2043 [00:00<05:53,  5.77it/s]

Processing gender: ./5000-16000/male/
Files: 2043


100%|██████████| 2043/2043 [05:54<00:00,  5.44it/s]
  0%|          | 0/2153 [00:00<?, ?it/s]


Present length:18000
Processing gender: ./5000-18000/female/
Files: 2153


100%|██████████| 2153/2153 [06:52<00:00,  4.89it/s]
  0%|          | 0/2003 [00:00<?, ?it/s]

Processing gender: ./5000-18000/male/
Files: 2003


100%|██████████| 2003/2003 [06:34<00:00,  5.11it/s]


In [4]:
#emo
# Window width (in tokens) passed to get_cooccurrences.
window = 10
# Read as a global by gender_ratios_m_f: a word is scored only when its
# combined male + female co-occurrence count exceeds this value.
MinCount = 10

# gender_ratios_m_f reads the global named `occupations` as its target word
# list, so the emotion words are loaded under that name.
# `txtlengths` comes from the occupations cell above.
occupations = load_doc('emotions.txt').split('\n')
occupations += [getUpper(w) for w in occupations if len(w) > 0]

for txtlength in txtlengths:
    # One record per document. Rows are collected in a list and turned into
    # a DataFrame once: DataFrame.append copied the whole frame on every
    # call and was removed in pandas 2.0.
    rows = []
    data_dir = f"./5000-{txtlength}"
    print('\nPresent length:{}'.format(txtlength))

    # gender label written to the CSV: 0 = female directory, 1 = male.
    for gender in [0, 1]:
        if gender == 0:
            txtDir = f'{data_dir}/female/'
        else:
            txtDir = f'{data_dir}/male/'

        print("Processing gender: {}".format(txtDir))
        blogs_gender = os.listdir(txtDir)
        print("Files:", len(blogs_gender))
        for blog_name in tqdm(blogs_gender):
            txt_path = txtDir + blog_name
            # Fresh dict per document so counts never leak between files.
            data = get_cooccurrences(txt_path, {}, window)
            maleCount, femaleCount = getGenderCounts(txt_path)
            scores_p, scores_pc, scores_dict = gender_ratios_m_f(
                data, maleCount, femaleCount)

            rows.append({
                # The epsilon keeps the ratio defined when no gendered
                # token occurs in the document.
                'word ratio': (maleCount + 0.000001) / (
                    maleCount + femaleCount + 0.000001),
                'male': maleCount + 0.000001,
                'female': femaleCount + 0.000001,
                # Mean score; max(..., 1) avoids dividing by zero when no
                # target word passed the MinCount threshold.
                'bias': sum(scores_p) / max(len(scores_p), 1),
                'biasPC': sum(scores_pc) / max(len(scores_pc), 1),
                'scores_dict': scores_dict,
                'gender': gender,
            })

    df_per_txt = pd.DataFrame(rows)
    df_per_txt.to_csv(f'5000-{txtlength}_genderbias-emo.csv', index=False)

  0%|          | 1/2251 [00:00<05:47,  6.48it/s]


Present length:12000
Processing gender: ./5000-12000/female/
Files: 2251


100%|██████████| 2251/2251 [05:06<00:00,  7.51it/s]
  0%|          | 1/2140 [00:00<04:43,  7.56it/s]

Processing gender: ./5000-12000/male/
Files: 2140


100%|██████████| 2140/2140 [04:51<00:00,  7.29it/s]
  0%|          | 1/2213 [00:00<05:55,  6.22it/s]


Present length:14000
Processing gender: ./5000-14000/female/
Files: 2213


100%|██████████| 2213/2213 [05:49<00:00,  6.10it/s]
  0%|          | 1/2083 [00:00<05:36,  6.18it/s]

Processing gender: ./5000-14000/male/
Files: 2083


100%|██████████| 2083/2083 [05:29<00:00,  6.34it/s]
  0%|          | 1/2179 [00:00<06:18,  5.76it/s]


Present length:16000
Processing gender: ./5000-16000/female/
Files: 2179


100%|██████████| 2179/2179 [06:32<00:00,  5.57it/s]
  0%|          | 1/2043 [00:00<06:07,  5.56it/s]

Processing gender: ./5000-16000/male/
Files: 2043


100%|██████████| 2043/2043 [06:07<00:00,  5.61it/s]
  0%|          | 1/2153 [00:00<06:57,  5.16it/s]


Present length:18000
Processing gender: ./5000-18000/female/
Files: 2153


100%|██████████| 2153/2153 [07:10<00:00,  5.11it/s]
  0%|          | 0/2003 [00:00<?, ?it/s]

Processing gender: ./5000-18000/male/
Files: 2003


100%|██████████| 2003/2003 [06:40<00:00,  5.03it/s]


In [5]:
#verb
# Window width (in tokens) passed to get_cooccurrences.
window = 10
# Read as a global by gender_ratios_m_f: a word is scored only when its
# combined male + female co-occurrence count exceeds this value.
MinCount = 10

# gender_ratios_m_f reads the global named `occupations` as its target word
# list, so the verbs are loaded under that name.
# `txtlengths` comes from the occupations cell above.
occupations = load_doc('verb.txt').split('\n')
occupations += [getUpper(w) for w in occupations if len(w) > 0]

for txtlength in txtlengths:
    # One record per document. Rows are collected in a list and turned into
    # a DataFrame once: DataFrame.append copied the whole frame on every
    # call and was removed in pandas 2.0.
    rows = []
    data_dir = f"./5000-{txtlength}"
    print('\nPresent length:{}'.format(txtlength))

    # gender label written to the CSV: 0 = female directory, 1 = male.
    for gender in [0, 1]:
        if gender == 0:
            txtDir = f'{data_dir}/female/'
        else:
            txtDir = f'{data_dir}/male/'

        print("Processing gender: {}".format(txtDir))
        blogs_gender = os.listdir(txtDir)
        print("Files:", len(blogs_gender))
        for blog_name in tqdm(blogs_gender):
            txt_path = txtDir + blog_name
            # Fresh dict per document so counts never leak between files.
            data = get_cooccurrences(txt_path, {}, window)
            maleCount, femaleCount = getGenderCounts(txt_path)
            scores_p, scores_pc, scores_dict = gender_ratios_m_f(
                data, maleCount, femaleCount)

            rows.append({
                # The epsilon keeps the ratio defined when no gendered
                # token occurs in the document.
                'word ratio': (maleCount + 0.000001) / (
                    maleCount + femaleCount + 0.000001),
                'male': maleCount + 0.000001,
                'female': femaleCount + 0.000001,
                # Mean score; max(..., 1) avoids dividing by zero when no
                # target word passed the MinCount threshold.
                'bias': sum(scores_p) / max(len(scores_p), 1),
                'biasPC': sum(scores_pc) / max(len(scores_pc), 1),
                'scores_dict': scores_dict,
                'gender': gender,
            })

    df_per_txt = pd.DataFrame(rows)
    df_per_txt.to_csv(f'5000-{txtlength}_genderbias-verb.csv', index=False)

  0%|          | 1/2251 [00:00<04:57,  7.55it/s]


Present length:12000
Processing gender: ./5000-12000/female/
Files: 2251


100%|██████████| 2251/2251 [05:07<00:00,  7.36it/s]
  0%|          | 1/2140 [00:00<04:51,  7.33it/s]

Processing gender: ./5000-12000/male/
Files: 2140


100%|██████████| 2140/2140 [04:54<00:00,  7.30it/s]
  0%|          | 1/2213 [00:00<05:37,  6.56it/s]


Present length:14000
Processing gender: ./5000-14000/female/
Files: 2213


100%|██████████| 2213/2213 [05:50<00:00,  6.09it/s]
  0%|          | 1/2083 [00:00<05:32,  6.25it/s]

Processing gender: ./5000-14000/male/
Files: 2083


100%|██████████| 2083/2083 [05:24<00:00,  6.39it/s]
  0%|          | 1/2179 [00:00<06:25,  5.65it/s]


Present length:16000
Processing gender: ./5000-16000/female/
Files: 2179


100%|██████████| 2179/2179 [06:27<00:00,  5.82it/s]
  0%|          | 1/2043 [00:00<05:51,  5.80it/s]

Processing gender: ./5000-16000/male/
Files: 2043


100%|██████████| 2043/2043 [06:08<00:00,  5.39it/s]
  0%|          | 0/2153 [00:00<?, ?it/s]


Present length:18000
Processing gender: ./5000-18000/female/
Files: 2153


100%|██████████| 2153/2153 [07:15<00:00,  5.02it/s]
  0%|          | 0/2003 [00:00<?, ?it/s]

Processing gender: ./5000-18000/male/
Files: 2003


100%|██████████| 2003/2003 [06:43<00:00,  4.78it/s]
