In [4]:
import numpy as np
from numpy.linalg import norm
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import gdown
import random
import datetime

# Load GloVe 

In [5]:
def loadWordVecs(model_str, expected_dim=300):
    """Load word vectors from ``<model_str>_wiki_vectors.txt`` into a dict.

    Each record is expected to look like ``<token> <v1> <v2> ...`` with single
    spaces as separators and one record per ``\\n``-terminated line.

    Parameters
    ----------
    model_str : str
        Prefix of the vector file; ``'_wiki_vectors.txt'`` is appended to it.
    expected_dim : int, optional
        Vector length considered normal (default 300). The first token whose
        vector has a different length is printed once as a warning; it is
        still stored.

    Returns
    -------
    dict
        Maps every token to a 1-D ``float32`` numpy array.

    Notes
    -----
    Lines that do not contain a space (no vector part) are skipped and their
    1-based line number is printed.
    """
    input_file_destination = model_str + '_wiki_vectors.txt'
    word_dictionary = {}
    reported_bad_dim = False

    # newline='\n' makes '\n' the only record boundary. Tokens may contain
    # Unicode line separators (U+2028, U+0085, ...); treating those as line
    # ends would cut a record in two and lose the word.
    # The context manager guarantees the handle is closed, even on error.
    with open(input_file_destination, 'r', encoding='utf-8', newline='\n') as f:
        for count, line in enumerate(f, start=1):
            parts = line.split(" ", 1)
            if len(parts) != 2:
                # Malformed record: report its line number and move on.
                print(count)
                continue

            key, values = parts
            vector = np.fromstring(values, dtype="float32", sep=" ")
            word_dictionary[key] = vector

            # Report only the first vector of unexpected length.
            if vector.shape[0] != expected_dim and not reported_bad_dim:
                print(key, vector.shape)
                reported_bad_dim = True

    return word_dictionary

# Load the vectors from 'glove_wiki_vectors.txt' (loadWordVecs appends the suffix).
orig_glove = loadWordVecs('glove')
# Number of loaded tokens; displayed as the cell output.
len(orig_glove)

322636

# Select set Z

In [6]:
def select_word_of_desired_pos(word, pos):
    """Return the entries of ``word`` whose POS tag is contained in ``pos``.

    Every entry is tagged on its own (``nltk.pos_tag`` receives a one-element
    list), so no sentence context is used. Entries of length 1 are dropped
    without being tagged.

    Parameters
    ----------
    word : iterable of str
        Candidate words.
    pos : container of str
        Accepted POS tags.

    Returns
    -------
    list of str
        The accepted words, in input order.
    """
    # no single letter word
    candidates = (w for w in word if len(w) != 1)
    # pos_tag returns [(token, tag)]; take the single pair for each candidate.
    tagged = (nltk.pos_tag([w])[0] for w in candidates)
    return [token for token, label in tagged if label in pos]

In [7]:
# POS tags accepted by the word filter. Only adjectives are enabled; the other
# candidates are kept here, disabled, for reference:
#   'NN'  - singular noun -> plural 'NNS'
#   'NNP' - proper noun singular 'Harrison' -> proper noun plural 'NNPS'
#   'PRP' - personal pronoun 'he' -> possessive pronoun 'his' 'PRP$'
#   'VB'  - verb base form 'take' -> 'VBD' took, 'VBG' taking, 'VBN' taken, 'VBZ' takes
#   'VBP' - single present, non-3rd person 'take'
desired_pos = ['JJ']  # adjective base form -> comparative + superlative + adverb

# Tag used for nouns.
noun_pos = ['NN']

In [8]:
def gender_dist(word, gender_mat):
    """Mean dot product between the unit-normalised ``word`` and ``gender_mat``.

    Parameters
    ----------
    word : array-like
        Word vector; it is divided by its norm before projecting.
    gender_mat : numpy.ndarray
        Matrix the normalised vector is multiplied with
        (``word.dot(gender_mat)``).

    Returns
    -------
    numpy floating scalar
        Mean of the resulting products.
    """
    vec = np.array(word)
    unit_vec = vec / norm(vec)
    return np.mean(unit_vec.dot(gender_mat))

In [13]:
def select_word_of_pos_verbose(all_words, desired_pos):
    """Filter ``all_words`` by POS tag in chunks of 10000, printing progress.

    Each chunk is handed to ``select_word_of_desired_pos``; the per-chunk
    results are concatenated in order. A progress line is printed for every
    chunk whose start offset is greater than 10000.

    Parameters
    ----------
    all_words : sequence of str
        Words to scan; must support ``len`` and slicing.
    desired_pos : container of str
        Accepted POS tags, passed through unchanged.

    Returns
    -------
    list of str
        All accepted words, in input order.
    """
    chunk = 10000
    total = len(all_words)
    picked = []
    for offset in range(0, total, chunk):
        if offset > chunk:
            print('Scanning words %d/%d'%(offset // chunk, total // chunk))
        # Slicing clamps at the end of the sequence, so the last chunk may be short.
        picked.extend(select_word_of_desired_pos(all_words[offset:offset + chunk], desired_pos))
    return picked

### Load gendered words

In [14]:
# Seed word lists: each CSV keeps its words in the column named '0'.
female_words = list(pd.read_csv('./data/A_female.csv')['0'])
male_words = list(pd.read_csv('./data/A_male.csv')['0'])

# Male words come first, followed by the female words.
gender_words = male_words + female_words

# One row per gendered word, scaled to unit norm row by row, then transposed
# so that every column holds one normalised word vector.
gender_mat = np.array([orig_glove[w] for w in gender_words])
row_norms = norm(gender_mat, axis=1, keepdims=True)
norm_gender_mat = (gender_mat / row_norms).T

### Construct set Z and save the rest as set X.

In [15]:
# Start the run with an empty log file; the summary is appended to it later.
with open('./screening/output.txt', 'w', encoding='utf-8'):
    pass

s = '90'
# load words
from_file = './screening/X'+s+'_names.txt'

# The word list is only read, so open it read-only ('r+' would also demand
# write permission on the file).
with open(from_file, 'r', encoding='utf-8') as f_in:
    all_words = [line.replace('\n', '') for line in f_in]
print('Processing file '+ from_file)

# delete corrupted ones: words that have no vector in orig_glove.
# A single filtering pass keeps the original order and avoids the quadratic
# cost of calling list.remove once per missing word.
to_del = [w for w in all_words if w not in orig_glove]  # kept for inspection
all_words = [w for w in all_words if w in orig_glove]

# filter out adj and nouns
adj_words = select_word_of_pos_verbose(all_words, desired_pos)
# Every remaining word is scored; noun_words is just another name for all_words.
noun_words = all_words

# compute cosine similarity
# For each word: mean dot product of its unit vector with the normalised
# gendered-word matrix (see gender_dist).
n_words = len(noun_words)
word_dist = []
for done, word in enumerate(noun_words, start=1):
    word_dist.append(gender_dist(orig_glove[word], norm_gender_mat))
    if done % 20000 == 0:
        # Progress is reported against the list being iterated; the previous
        # total came from `word_df`, which is not defined in this notebook.
        print(datetime.datetime.now(), "  ", done, ' / ', n_words)
word_dist = np.array(word_dist)
std = word_dist.std()


def _write_matrix_csv(frame, path):
    """Write ``frame`` to ``path`` as UTF-8 CSV with index, header and '\\n' rows."""
    with open(path, 'w', encoding='utf-8') as file:
        try:
            # Keyword name since pandas 1.5; the old spelling was removed in 2.0.
            frame.to_csv(file, index=True, header=True, lineterminator='\n')
        except TypeError:
            # Older pandas only accepts the previous keyword name.
            frame.to_csv(file, index=True, header=True, line_terminator='\n')


# NOTE(review): the extracted set is hard-coded to the second word of
# all_words (this needs at least two words) - confirm this is intended.
extracted_words = [all_words[1]]
extracted_mat = np.array([orig_glove[w] for w in extracted_words]).T
if len(extracted_words) > 0:
    extracted_df = pd.DataFrame(extracted_mat, columns=extracted_words)
else:
    # NOTE(review): `dummy_df` is not defined in the visible notebook; this
    # branch cannot be reached while extracted_words is built as above.
    extracted_df = dummy_df
_write_matrix_csv(extracted_df, './screening/E_mat_'+s+'.csv')

# save the rest to X
# Use an ordered, de-duplicated list rather than a set: the column order of
# the CSV is then the same on every run, and a set is not a valid `columns`
# argument for the DataFrame constructor in recent pandas.
extracted_lookup = set(extracted_words)
words_rest = [w for w in dict.fromkeys(all_words) if w not in extracted_lookup]
mat_rest = np.array([orig_glove[w] for w in words_rest]).T
df_rest = pd.DataFrame(mat_rest, columns=words_rest)
_write_matrix_csv(df_rest, './screening/X_mat_'+s+'.csv')

# The figures below describe extracted_df, which is stored in E_mat_<s>.csv,
# so the log line names that file.
with open('./screening/output.txt', 'a', encoding='utf-8') as file:
    file.write('Length of E_mat_'+s+'.csv is '+str(len(extracted_df.columns))+' out of '+str(len(all_words))
               +', accounting to '+str(round(len(extracted_df.columns)/len(all_words)*100))+'%.\n')
    file.write('\n')

Processing file ./screening/X90_names.txt
