In [2]:
import numpy as np
import pandas as pd
import pickle
import cvxpy as cp

import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()
%matplotlib inline

In [3]:
# Load the raw embedding file into memory, one string per line.
# Source model: Russian National Corpus (НКРЯ) skip-gram embeddings.
with open('ruscorpora_upos_skipgram_300_5_2018.vec', encoding='utf-8') as vec_file:
    lines = list(vec_file)

In [4]:
# Number of lines read from the embedding file.
# The first line is skipped by the cells that build the vocabulary and the matrix
# (presumably the format header - confirm).
len(lines)

195072

In [5]:
# Vocabulary tokens: the first whitespace-separated field of every line after the first one.
words = [row.split()[0] for row in lines[1:]]

In [6]:
# Load the table of chosen professions (taken from the "ГАРАНТ system").
# Later cells read its 'профессии' ("professions") column.

df_prof = pd.read_csv('./professions.csv')

In [None]:
# Professions from professions.csv for which the corpus has a word embedding.
# Vocabulary tokens are matched as '<profession>_NOUN'.
# The tagged names are collected into a set once, instead of rebuilding
# set(df_prof[...]) and scanning it for every vocabulary word; the result keeps
# the vocabulary order of `words`.
profession_tokens = {prof + '_NOUN' for prof in df_prof['профессии']}
professions_list = [word for word in words if word in profession_tokens]

In [None]:
# Turn the text lines into a numeric matrix.
# Every line after the first is split once at the first space:
# the left part is the token, the right part is the text of its vector.
words_list = []
lines_formatrix = []
for row in lines[1:]:
    parts = row.split(' ', 1)
    words_list.append(parts[0])
    lines_formatrix.append(parts[1])

# E: matrix with all word embeddings of the RNC model, one row per entry of words_list
E = np.loadtxt(lines_formatrix)

In [None]:
# Row indices (into words_list / E) of the professions found in the corpus.
# Membership is tested against a set so the scan over the vocabulary is linear
# instead of re-searching the professions list for every word.
profession_lookup = set(professions_list)
ind_prof = [idx for idx, token in enumerate(words_list) if token in profession_lookup]

In [None]:
# P: one row of E per profession, in the order given by ind_prof
P = E[ind_prof]

# sanity check: (number of selected rows, number of columns of E)
P.shape

In [None]:
# Try to build a gender axis from male-female word pairs.
# word_pairs.txt is treated as alternating lines: a male word followed by its
# female counterpart (the pairing below relies on that layout).
from sklearn.decomposition import PCA

with open('word_pairs.txt','r', encoding = 'utf-8') as of:
    word_pairs = [line.strip() for line in of]
# The first entry is overwritten explicitly.
# NOTE(review): presumably works around stray characters (e.g. a BOM) on the first line - confirm.
word_pairs[0] = 'мужчина_NOUN'

# Map every pair word to its first row in the embedding matrix.
pair_lookup = set(word_pairs)
row_of = {}
for idx, token in enumerate(words_list):
    if token in pair_lookup and token not in row_of:
        row_of[token] = idx

# Separate indexes for male and female nouns, following the order of the FILE.
# Collecting matches in vocabulary order and then splitting them by even/odd
# position does not keep the male/female alternation, so the two matrices would
# not contain matching pairs. Blank lines are ignored, and a pair is kept only
# when both of its words have an embedding, so the rows stay aligned.
pair_words = [token for token in word_pairs if token]
ind_male_noun = []
ind_female_noun = []
for male_word, female_word in zip(pair_words[0::2], pair_words[1::2]):
    if male_word in row_of and female_word in row_of:
        ind_male_noun.append(row_of[male_word])
        ind_female_noun.append(row_of[female_word])

# interleaved male/female indices, kept under the original name
ind_nouns = [idx for pair in zip(ind_male_noun, ind_female_noun) for idx in pair]

# make male and female matrix (row k of each belongs to the k-th kept pair)
male_matrix = E[ind_male_noun,:]
female_matrix = E[ind_female_noun,:]

# difference matrix (one male-minus-female vector per pair); PCA on it gives
# the main principal component as a candidate gender axis
difference_matrix = male_matrix - female_matrix
pca = PCA(n_components = 10)
pca_forprojection = pca.fit(difference_matrix)
gender_axis_pca = pca_forprojection.components_[0]

# check the singular value decay
singular_values = pca_forprojection.singular_values_
print(singular_values)
# the singular value decay is not good, so we will use just man-woman vector for gender axis

In [None]:
# Rows of E that hold the embeddings of 'мужчина' (man) and 'женщина' (woman)
ind_man, ind_woman = (words_list.index(token) for token in ('мужчина_NOUN', 'женщина_NOUN'))

man = E[ind_man]
woman = E[ind_woman]
# gender axis b: the difference vector man - woman
b = man - woman

In [None]:
# find projections of professions onto gender axis

def find_proj(P, b):
    """Project every row of P onto the direction of b.

    Parameters
    ----------
    P : 2-D array, indexed as P[i, :]; one vector per row.
    b : vector defining the axis; it is divided by its Euclidean norm before use.

    Returns
    -------
    list
        One scalar per row of P (in row order): the dot product of that row
        with the normalized axis.
    """
    unit_axis = b / np.linalg.norm(b)
    return [np.dot(P[row, :], unit_axis) for row in range(P.shape[0])]

In [None]:
# plot distribution of projections

# projection of every profession vector onto the man-woman axis
projections = find_proj(P, b)

# histogram of the projections; negative values lie towards 'woman', positive towards 'man'
fig, ax = plt.subplots()
ax.hist(projections)
ax.set_title('Projections of professions onto man-woman direction', size =14 )
ax.set_xlabel(' '*10 + '<---- woman' + ' '*20 + ' ----> man', size = 14)


In [None]:
# Cut-offs for "extreme" professions, measured in standard deviations from the mean projection.
# NOTE(review): the upper cut-off uses 1.5 std while the lower one uses 1 std - confirm the asymmetry is intended.
proj_mean = np.mean(projections)
proj_std = np.std(projections)
right_bound = proj_mean + 1.5*proj_std
left_bound = proj_mean - 1*proj_std

In [None]:
# Positions of the professions projected beyond the right bound (the 'man' end of the axis)
ind_var_male = [idx for idx, value in enumerate(projections) if value > right_bound]

# print their names
for idx in ind_var_male:
    print(professions_list[idx])

In [None]:
# Positions of the professions that fall outside [left_bound, right_bound],
# i.e. the extreme ones at both the 'woman' and the 'man' end of the axis
ind_var = [
    idx
    for idx, value in enumerate(projections)
    if value < left_bound or value > right_bound
]

# print their names
for idx in ind_var:
    print(professions_list[idx])

In [None]:
# Write the extreme professions to a file, one per line.
# No explicit close(): the with-block already closes the file on exit.
with open('professions_for_selection','w',encoding = 'utf-8') as of:
    of.writelines(professions_list[i] + '\n' for i in ind_var)

# Read the list of professions after selection.
# NOTE: this is a different file from the one written above; no cell shown in this
# notebook creates it, so it has to exist before this cell is run.
with open('professions_after_selection','r', encoding = 'utf-8') as of:
    slist_rev = of.readlines()

In [None]:
# Cleaned list of professions after selection (63 professions per the original note).
# Every line is stripped and blank lines are dropped. Cutting off the last element
# unconditionally would lose a real entry whenever the file has no trailing blank
# line, and would remove one more profession each time this cell is re-run;
# filtering blanks gives the same list no matter how often the cell runs.
slist_rev = [entry.strip() for entry in slist_rev if entry.strip()]

In [None]:
# display the list of selected professions
slist_rev

In [None]:
# Indices in words_list (rows of E) of the chosen professions, in vocabulary order.
# A set is used for the membership test so the scan over the vocabulary stays linear.
selected_lookup = set(slist_rev)
ind_prof_rev = [idx for idx, token in enumerate(words_list) if token in selected_lookup]

In [None]:
# A: the embedding matrix without the rows of the selected professions
# and without the 'man' and 'woman' rows
rows_to_drop = ind_prof_rev + [ind_man, ind_woman]
A = np.delete(E, rows_to_drop, axis=0)

In [None]:
# shape of A: (rows of E left after the deletion, number of columns of E)
A.shape

In [None]:
# P_rev: rows of E for the selected professions, in the order of ind_prof_rev
P_rev = E[ind_prof_rev]

In [None]:
# shape of P_rev: (number of selected rows, number of columns of E)
P_rev.shape

In [None]:
# Reduced SVD of A (full_matrices = False), i.e. A = u @ diag(s) @ vt.
# Only s and vt are used by the later cells shown in this notebook.
u, s, vt = np.linalg.svd(A, full_matrices = False)

In [None]:
# Persist the SVD factors s and vt, each pickled into its own file
for file_name, matrix in (('matrix_s', s), ('matrix_vt', vt)):
    with open(file_name, 'wb') as out_file:
        pickle.dump(matrix, out_file)

In [None]:
# If there is not enough memory to compute the SVD, the saved factors can be loaded instead.
# Caution: unpickling can execute arbitrary code - only load files you produced yourself.
with open('matrix_s', 'rb') as s_file:
    s = pickle.load(s_file)

with open('matrix_vt', 'rb') as vt_file:
    vt = pickle.load(vt_file)

In [None]:
# gender axis b as a 1 x 300 row matrix, the form used in the optimization objective
B_reshaped = np.reshape(b, (1, 300))

In [None]:
# OPTIMIZATION
#
# Solve for a matrix X:
#
#   minimize    ||S vt (X - I) vt^T S||_F^2  +  lam * ||P_rev X b^T||_F^2
#   subject to  X symmetric positive semidefinite
#
# Every product in the objective is a MATRIX product and is therefore written
# with `@`. With `*`, the leading `S * vt` is evaluated by NumPy as an
# element-wise product (both operands are plain arrays), and `*` between CVXPY
# expressions is deprecated as a spelling of matrix multiplication.

lam = 0.2                  # weight of the gender-projection penalty
n_dims = vt.shape[1]       # number of embedding dimensions (columns of vt)
S = np.diag(s)
I = np.eye(n_dims)

# symmetric=True: `X >> 0` alone does not force X to be symmetric, while the
# Cholesky factorization applied to the solution assumes a symmetric matrix.
X = cp.Variable((n_dims, n_dims), symmetric=True)

constraints = [X >> 0]

preserve_term = cp.norm(S @ vt @ (X - I) @ vt.T @ S, "fro")**2
debias_term = cp.norm(P_rev @ X @ B_reshaped.T, 'fro')**2
obj = cp.Minimize(preserve_term + lam*debias_term)


prob = cp.Problem(obj, constraints)
result = prob.solve(verbose=True, max_iters = 300)

In [None]:
# numeric value of the optimization variable X after prob.solve()
X_res = X.value

In [None]:
# Cholesky decomposition of X_res to obtain the transformation matrix T
# (np.linalg.cholesky returns the lower-triangular factor, X_res = T @ T.T).
# NOTE(review): the factorization needs a positive-definite input; a solver result that is
# only semidefinite up to numerical error may raise LinAlgError here - confirm.
T = np.linalg.cholesky(X_res)   

In [None]:
# Result validation: spread of the selected professions along the gender axis
# before and after the transformation. Professions are mapped as rows (p @ T)
# and the axis as T.T @ b; find_proj normalizes the axis before projecting.
b_after_T = T.T @ b.T

# the printed quantity is np.std, i.e. the standard deviation
print('standard deviation of extreme professions before transformation: ', np.round(np.std(find_proj(P_rev, b)),2))
print('standard deviation of extreme professions after transformation: ', np.round(np.std(find_proj(P_rev @ T, b_after_T)),2))

In [None]:
# THE REST OF THE CODE IS FOR GRAPHS FOR THE PRESENTATION

In [None]:
# Vocabulary tokens of the professions shown in the presentation charts
# (governor, manager, chef, teacher, cook, librarian)
wforValid = ['губернатор_NOUN','менеджер_NOUN','шеф-повар_NOUN',
             'учитель_NOUN','повар_NOUN','библиотекарь_NOUN']

In [None]:
# rows of E (positions in words_list) of the validation words
ind_biased = list(map(words_list.index, wforValid))

In [None]:
# original (untransformed) embeddings of the validation words
P_biased = E[ind_biased]

In [None]:
# projections of the untransformed validation words onto the original gender axis b
projections_biased = find_proj(P_biased, b)

In [None]:
# display the projections before the transformation
projections_biased

In [None]:
# Rows of P_rev that hold the validation words.
# P_rev is built as E[ind_prof_rev], so its rows follow vocabulary order: the row of a
# word is the position of its vocabulary index inside ind_prof_rev. The word's position
# in slist_rev matches that row only while the selection file happens to list the
# professions in vocabulary order with every entry present in the vocabulary.
ind_debiased = [ind_prof_rev.index(words_list.index(word)) for word in wforValid]

In [None]:
# transformed embeddings (rows of P_rev @ T) of the validation words
P_debiased = (P_rev @ T)[ind_debiased]

In [None]:
# projections of the transformed validation words onto the transformed gender axis
projections_debiased = find_proj(P_debiased, b_after_T)

In [None]:
# display the projections after the transformation
projections_debiased