# Exploring Gender Biases in Word2vec <br> 

Write a short intro here 


In [2]:
#Imports
import torch
from torch import nn, optim, sigmoid, softmax
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchtext.data.utils import get_tokenizer
import string

## 1- Define Word2Vec Model 

In [None]:
# NOTES:
# Inspiration: https://github.com/OlgaChernytska/word2vec-pytorch
# We use the skip-gram method because it works better on smaller datasets (which we are limited to); source: https://arxiv.org/pdf/1301.3781.pdf
# Skip-gram predicts a context word from a given middle (center) word in the sentence.
# Use the Adam optimizer (better than SGD) - source: _______
# Apply regularization to our embedding layer: constrain each embedding weight vector to |w| <= 1 (via max_norm=1) to help prevent exploding gradients.
  # The norm restriction also helps when comparing words by cosine similarity (which we will use to analyze bias).



**Create Tokens** <br>

Justifications:

- punctuation: removed - source : ______________
- lowercase all words - capitalization is not relevant because we are only looking at biases between male/female

In [40]:
def create_tokens(data, min_freq):
    """
    Tokenize every string in `data` after deleting punctuation and digits.

    Each input string has all characters from ``string.punctuation`` and
    ``string.digits`` removed and is then passed to torchtext's
    "basic_english" tokenizer (which, per the torchtext documentation,
    normalizes/lowercases the text and splits it on whitespace).

    :param data: iterable of strings (one sentence or document per item)
    :param min_freq: currently unused; accepted so existing callers keep working.
        NOTE(review): the earlier docstring described building a vocabulary of
        words above a frequency threshold, but no such filtering happens here.
    :return: a list containing one list of tokens per input string
    """
    # The deletion table and the tokenizer do not depend on the sentence,
    # so build them once instead of on every loop iteration.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    tokenizer = get_tokenizer("basic_english", language="en")

    return [tokenizer(sentence.translate(strip_table)) for sentence in data]
  

**Define word 2 vec model** <br>

- Choosing the top N most frequent words - why? So that it's easier to train, and we don't waste time training on words that appear only once in a text sequence. 

In [42]:
# Sanity check: fit a gensim Word2Vec model on a tiny hand-written corpus.
from gensim.models import Word2Vec
# Corpus format expected by Word2Vec here: a list of sentences, each a list of string tokens.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# min_count=1 so that words occurring only once in this toy corpus are not dropped.
model = Word2Vec(sentences, min_count=1)

In [46]:
# Same sanity check as before, but with the corpus produced by create_tokens.
from gensim.models import Word2Vec

min_freq = 50 # hyperparameter; passed to create_tokens, which does not currently use it

# Tokenize two sample strings; the result is one token list per input string.
sentences = create_tokens(["hey my name is", "you are coool"], min_freq)
print(sentences)
# min_count=1 so that every token of this tiny corpus is kept by gensim.
model = Word2Vec(sentences, min_count=1)

[['hey', 'my', 'name', 'is'], ['you', 'are', 'coool']]


## 2 - Create Data sets with wiki and ArXiv from the Pile

In [None]:
# Credit: https://github.com/noanabeshima/wikipedia-downloader
!pip install tensorflow==2.2.0
#2.8.0
#2.2.0
!pip install tensorflow-datasets=4.3
# !pip install tfds-nightly==3.1.0.dev202007060105
!pip install fire==0.3.1
!pip install tqdm==4.47.0
!pip install joblib==0.15.1
!pip install apache-beam==2.22.0


Collecting tensorflow==2.2.0
  Downloading tensorflow-2.2.0-cp37-cp37m-manylinux2010_x86_64.whl (516.2 MB)
[K     |████████████████████████████████| 516.2 MB 4.2 kB/s 
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorboard<2.3.0,>=2.2.0
  Downloading tensorboard-2.2.2-py3-none-any.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 63.5 MB/s 
Collecting tensorflow-estimator<2.3.0,>=2.2.0
  Downloading tensorflow_estimator-2.2.0-py2.py3-none-any.whl (454 kB)
[K     |████████████████████████████████| 454 kB 59.5 MB/s 
Collecting h5py<2.11.0,>=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 44.5 MB/s 
Installing collected packages: tensorflow-estimator, tensorboard, h5py, gast, tensorflow
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 2.8.0
    Uninstalling tensorflow-estimator-2.8.0:
      Successfully 

Collecting joblib==0.15.1
  Downloading joblib-0.15.1-py3-none-any.whl (298 kB)
[?25l[K     |█                               | 10 kB 21.0 MB/s eta 0:00:01[K     |██▏                             | 20 kB 27.4 MB/s eta 0:00:01[K     |███▎                            | 30 kB 21.6 MB/s eta 0:00:01[K     |████▍                           | 40 kB 17.1 MB/s eta 0:00:01[K     |█████▌                          | 51 kB 6.5 MB/s eta 0:00:01[K     |██████▋                         | 61 kB 7.5 MB/s eta 0:00:01[K     |███████▊                        | 71 kB 8.5 MB/s eta 0:00:01[K     |████████▊                       | 81 kB 9.2 MB/s eta 0:00:01[K     |█████████▉                      | 92 kB 10.1 MB/s eta 0:00:01[K     |███████████                     | 102 kB 8.4 MB/s eta 0:00:01[K     |████████████                    | 112 kB 8.4 MB/s eta 0:00:01[K     |█████████████▏                  | 122 kB 8.4 MB/s eta 0:00:01[K     |██████████████▎                 | 133 kB 8.4 MB/s eta 0:

In [None]:
import json
import os

import fire
import tensorflow_datasets as tfds
from joblib import Parallel, delayed
from tqdm import tqdm
# import tensorflow as tf
# !pip install git+https://github.com/cehorn/GLRM.git
# https://github.com/google/python-fire

def process_article(article):
    """Flatten one article record into a single string.

    Reads ``article['title']`` and ``article['text']``, calls ``.numpy()`` on
    each and decodes the result as UTF-8 (presumably these are TensorFlow
    string tensors coming from tensorflow_datasets -- confirm against caller).

    :param article: mapping with 'title' and 'text' entries
    :return: the decoded title and text separated by one blank line
    """
    decoded = [article[field].numpy().decode('UTF-8') for field in ('title', 'text')]
    return "\n\n".join(decoded)

def main(n_jobs: int = 1):
    """Download English Wikipedia with tensorflow_datasets into 10 JSON files.

    The train split of 'wikipedia/20200301.en' is loaded in ten consecutive
    10% slices. Every article of a slice is converted with `process_article`
    and the resulting list of strings is written as JSON to
    ``output/wikipedia-en-<interval>.json``. A slice whose output file already
    exists is skipped, so an interrupted run can be resumed.

    :param n_jobs: number of joblib workers used to convert the articles
    """
    out_dir = 'output'
    # exist_ok replaces the former bare `except: pass`, which also hid real
    # failures (e.g. permission errors) until a later call broke.
    os.makedirs(out_dir, exist_ok=True)

    for interval in range(10):
        out_path = os.path.join(out_dir, f'wikipedia-en-{interval}.json')
        if os.path.exists(out_path):
            # This slice was already written by a previous run.
            continue

        # Slice spec looks like 'train[00%:10%]', 'train[10%:20%]', ...
        ds = tfds.load('wikipedia/20200301.en', split=f'train[{interval}0%:{interval + 1}0%]')

        result = Parallel(n_jobs=n_jobs)(delayed(process_article)(article) for article in tqdm(ds))

        # Serialize before opening the file so a failed dump does not leave an
        # empty file behind; `with` guarantees the handle is closed.
        payload = json.dumps(result)
        with open(out_path, "w") as file:
            file.write(payload)

# Entry point: hand `main` to python-fire, which exposes it as a command-line
# interface (so `n_jobs` can be supplied as an argument).
# NOTE(review): inside a notebook `__name__` is also '__main__', so this runs
# when the cell executes -- confirm fire copes with the kernel's argv.
if __name__ == '__main__':
    fire.Fire(main)

[1mDownloading and preparing dataset wikipedia/20200301.en/1.0.0 (download: 16.73 GiB, generated: 17.05 GiB, total: 33.77 GiB) to /root/tensorflow_datasets/wikipedia/20200301.en/1.0.0...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



Dl Completed...:   0%|          | 0/258 [00:00<?, ? file/s]

## 3 - Debiasing Techniques


### 3.1 - Zhao et al.'s Method Using an Objective Function with Categorized Words


### 3.2 - Bolukbasi et al.'s Soft-debiasing Method

### 3.3 - Savani et al.'s First Order Optimization

## 4 - Measuring Bias

### 4.1 - Direct Bias

### 4.2 - Indirect Bias


### 4.3 - WEAT Metric

## 5 - Comparison and Conclusions