In [2]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
# NOTE(review): hardcoded absolute path to one user's checkout; the notebook will not
# import `pipelines` on another machine unless this directory exists there.
# Presumably this is the project root that contains the `pipelines` package - confirm.
sys.path.append('/home/nauel/bert_gender_bias')
import warnings
# Silences every warning category for the rest of the session, including
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [3]:
# Locate the two pickled tables inside the interim data directory.
gendered_words_path = os.path.join(INTERIM_DATA_DIR, 'gender_binary_words_TOKEN.pkl')
occupations_path = os.path.join(INTERIM_DATA_DIR, 'occupations_TOKEN.pkl')

# NOTE: unpickling executes arbitrary code, so these files must come from a trusted source.
gendered_words = pd.read_pickle(gendered_words_path)
jobs_df = pd.read_pickle(occupations_path)

In [4]:
# Remove, in place, every row that has a missing value in any column,
# then report the remaining (rows, columns).
gendered_words.dropna(axis=0, how="any", inplace=True)
print(gendered_words.shape)

# Request up to 1000 rows (more than the frame holds in the recorded run), so the
# notebook's truncated display shows both the start and the end of the table.
gendered_words.head(n=1000)

(431, 5)


Unnamed: 0,word,gender_binary,bert_token,word2vec_token,word_count
0,abbot,0,"[-0.39571184, -0.093838364, 0.06868138, 0.1723...","[0.40039062, 0.41015625, 0.36523438, 0.2207031...",1
1,abbots,0,"[-0.18790531, -0.077521764, -0.3876859, 0.1816...","[0.30664062, 0.33398438, 0.1953125, 0.50390625...",1
2,adulterer,0,"[-0.45352724, -0.38397712, -0.25277817, -0.266...","[0.42773438, -0.28515625, -0.0625, 0.020263672...",1
3,adulterers,0,"[-0.17249976, -0.15101261, -0.3717648, -0.0279...","[-0.14160156, -0.22558594, -0.042236328, 0.208...",1
4,airman,0,"[-0.20044291, -0.020562049, -0.276353, -0.0372...","[0.49023438, -0.15917969, -0.022460938, -0.024...",1
...,...,...,...,...,...
510,woman,1,"[-0.34769166, -0.20875905, 0.12518345, 0.07620...","[0.24316406, -0.07714844, -0.103027344, -0.107...",1
511,womankind,1,"[-0.24836135, -0.283562, -0.19828826, 0.071805...","[-0.096191406, -0.055908203, 0.014404297, 0.22...",1
512,womanly,1,"[-0.3582225, -0.31381592, -0.16148914, 0.21661...","[0.01965332, -0.059570312, 0.119628906, 0.1474...",1
513,womanpower,1,"[-0.042981803, -0.32953677, 0.07738724, 0.1452...","[-0.0041503906, -0.06225586, 0.044921875, 0.13...",1


In [5]:
# Remove, in place, every occupation row that has a missing value in any column,
# then report the remaining (rows, columns).
jobs_df.dropna(inplace=True)
print(jobs_df.shape)

# Preview the frame that was just cleaned (this cell previously displayed
# `gendered_words` by mistake).
jobs_df.head()

(340, 4)


Unnamed: 0,Occupations,bert_token,word2vec_token,word_count
0,Accountant,"[0.044247508, -0.062065974, -0.28353956, 0.019...","[0.059326172, -0.1328125, -0.16796875, -0.1816...",1
6,Actor,"[0.104604356, -0.23240899, -0.5950049, -0.1360...","[0.33789062, -0.028442383, 0.111328125, -0.181...",1
7,Actress,"[0.044713546, -0.2707646, -0.26443234, -0.0969...","[0.24023438, -0.20214844, 0.16894531, -0.20410...",1
8,Actuary,"[-0.102212645, -0.09090995, -0.14266734, -0.08...","[0.032714844, -0.41210938, -0.14550781, 0.3691...",1
9,Acupuncturist,"[0.05669831, -0.22730672, -0.20539528, -0.2271...","[0.16894531, -0.29296875, 0.007507324, -0.1503...",1


In [6]:
import sklearn

from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Embedding columns to probe, in the order they are reported.
token_cols = ['word2vec_token','bert_token', ]

# The target is the same for every embedding column, so build it once.
y = np.array(gendered_words.gender_binary.tolist())

for token_col in token_cols:
    print(token_col)

    # One embedding vector per row of `gendered_words`.
    X = gendered_words[token_col].tolist()

    # Standardise each embedding dimension, then fit a linear SVM on gender_binary.
    clf = make_pipeline(
        StandardScaler(),
        LinearSVC(random_state=0, tol=1e-5),
    )
    clf.fit(X, y)

    # In-sample score: the classifier is evaluated on the same rows it was fitted on.
    print("Accuracy: ", clf.score(X, y))
    

word2vec_token
Accuracy:  0.9976798143851509
bert_token
Accuracy:  0.9976798143851509


In [7]:
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

# Fit an L1-regularised logistic regression on each embedding column and report how
# sparse the solution is. After the loop, `logit`, `X` and `y` refer to the LAST entry
# of `token_cols`; later cells that read `logit` therefore use that model.
for token_col in token_cols:
    print(token_col)

    X = gendered_words[token_col].values.tolist()
    y = np.array(gendered_words.gender_binary.tolist())

    # The liblinear solver shuffles the data internally, so a fixed seed is required
    # for the set of non-zero coefficients to be reproducible between runs.
    logit = LogisticRegression(solver="liblinear", penalty="l1", C=0.1, random_state=0)
    logit.fit(X, y)

    # In-sample score: evaluated on the same rows the model was fitted on.
    print("Accuracy: ", logit.score(X, y))
    print("Number of non-zero coefficients: ", np.sum(logit.coef_ != 0))
    print("Number of zero coefficients: ", np.sum(logit.coef_ == 0))
    print("Number of coefficients: ", len(logit.coef_[0]))

word2vec_token
Accuracy:  0.877030162412993
Number of non-zero coefficients:  8
Number of zero coefficients:  292
Number of coefficients:  300
bert_token
Accuracy:  0.8074245939675174
Number of non-zero coefficients:  11
Number of zero coefficients:  757
Number of coefficients:  768


In [14]:
# Coefficient vector of `logit`, i.e. the model left over from the last iteration of
# the fitting loop (the last entry of `token_cols`).
coefficients = logit.coef_[0]
coefficients_abs = np.abs(coefficients)

# One row per embedding dimension, strongest first, keeping only the dimensions the
# L1 penalty did not shrink to exactly zero.
logit_coef = (
    pd.DataFrame({
        'feature': range(len(coefficients)),
        'coefficient': coefficients,
        'absolute_coefficient': coefficients_abs,
    })
    .sort_values(by='absolute_coefficient', ascending=False)
    .loc[lambda frame: frame.absolute_coefficient > 0]
)
logit_coef.head(20)


Unnamed: 0,feature,coefficient,absolute_coefficient
52,52,1.106613,1.106613
1,1,-0.726055,0.726055
586,586,0.60946,0.60946
74,74,-0.556978,0.556978
374,374,-0.436251,0.436251
668,668,-0.385063,0.385063
765,765,-0.31059,0.31059
104,104,-0.271868,0.271868
394,394,0.05343,0.05343
14,14,-0.008283,0.008283


In [16]:
# Indices of the embedding dimensions with a non-zero logistic coefficient,
# in the order they appear in `logit_coef`.
selected_features = logit_coef['feature'].values


def _pick_selected(embedding):
    """Return the entries of `embedding` at `selected_features`, as a list."""
    return [embedding[idx] for idx in selected_features]


# Reduce every occupation's BERT vector to the selected dimensions only.
jobs_df['selected_tokens'] = jobs_df['bert_token'].apply(_pick_selected)
jobs_df.head()

Unnamed: 0,Occupations,bert_token,word2vec_token,word_count,selected_tokens,pc1,pc2,pc1_norm,pc2_norm
0,Accountant,"[0.044247508, -0.062065974, -0.28353956, 0.019...","[0.059326172, -0.1328125, -0.16796875, -0.1816...",1,"[-0.19978876, -0.062065974, -0.2217183, 0.1216...",-0.328126,-0.068276,0.11833,0.405408
6,Actor,"[0.104604356, -0.23240899, -0.5950049, -0.1360...","[0.33789062, -0.028442383, 0.111328125, -0.181...",1,"[0.37949157, -0.23240899, -0.285422, 0.2991309...",0.032785,-0.284343,0.188251,0.257535
7,Actress,"[0.044713546, -0.2707646, -0.26443234, -0.0969...","[0.24023438, -0.20214844, 0.16894531, -0.20410...",1,"[0.4146122, -0.2707646, -0.2680458, 0.26214847...",-0.149352,-0.011998,0.152965,0.443923
8,Actuary,"[-0.102212645, -0.09090995, -0.14266734, -0.08...","[0.032714844, -0.41210938, -0.14550781, 0.3691...",1,"[0.016106425, -0.09090995, 0.0009229609, 0.196...",0.164899,-0.10801,0.213846,0.378215
9,Acupuncturist,"[0.05669831, -0.22730672, -0.20539528, -0.2271...","[0.16894531, -0.29296875, 0.007507324, -0.1503...",1,"[-0.15688932, -0.22730672, 0.46686733, -0.1121...",1.803289,0.458955,0.531259,0.766236


In [18]:
from sklearn.decomposition import PCA

# Project the selected embedding dimensions of every occupation onto two components.
pca = PCA(n_components=2)

X = jobs_df['selected_tokens'].tolist()
principal_components = pca.fit_transform(X)
print("Explained variance: ", pca.explained_variance_ratio_)

# Store the raw component scores first, then the rescaled ones, so that new columns
# are appended in the order pc1, pc2, pc1_norm, pc2_norm.
pc_columns = ('pc1', 'pc2')
for component_idx, pc_column in enumerate(pc_columns):
    jobs_df[pc_column] = principal_components[:, component_idx]

# Min-max rescale each component to the [0, 1] interval.
for pc_column in pc_columns:
    scores = jobs_df[pc_column]
    jobs_df[f'{pc_column}_norm'] = (scores - scores.min()) / (scores.max() - scores.min())

jobs_df.head()

Explained variance:  [0.6598749  0.06972593]


Unnamed: 0,Occupations,bert_token,word2vec_token,word_count,selected_tokens,pc1,pc2,pc1_norm,pc2_norm
0,Accountant,"[0.044247508, -0.062065974, -0.28353956, 0.019...","[0.059326172, -0.1328125, -0.16796875, -0.1816...",1,"[-0.19978876, -0.062065974, -0.2217183, 0.1216...",-0.328126,-0.068276,0.11833,0.405408
6,Actor,"[0.104604356, -0.23240899, -0.5950049, -0.1360...","[0.33789062, -0.028442383, 0.111328125, -0.181...",1,"[0.37949157, -0.23240899, -0.285422, 0.2991309...",0.032785,-0.284343,0.188251,0.257535
7,Actress,"[0.044713546, -0.2707646, -0.26443234, -0.0969...","[0.24023438, -0.20214844, 0.16894531, -0.20410...",1,"[0.4146122, -0.2707646, -0.2680458, 0.26214847...",-0.149352,-0.011998,0.152965,0.443923
8,Actuary,"[-0.102212645, -0.09090995, -0.14266734, -0.08...","[0.032714844, -0.41210938, -0.14550781, 0.3691...",1,"[0.016106425, -0.09090995, 0.0009229609, 0.196...",0.164899,-0.10801,0.213846,0.378215
9,Acupuncturist,"[0.05669831, -0.22730672, -0.20539528, -0.2271...","[0.16894531, -0.29296875, 0.007507324, -0.1503...",1,"[-0.15688932, -0.22730672, 0.46686733, -0.1121...",1.803289,0.458955,0.531259,0.766236


In [11]:
import plotly.express as px
pcs = ['pc1', 'pc2']

# Scatter of the occupations in the plane of the two principal components; hovering a
# point shows its occupation name. The title is set once here (previously a second,
# differently worded title in update_layout overrode the one passed to px.scatter),
# and `labels` supplies the axis titles and hover labels.
fig = px.scatter(
    jobs_df,
    x=pcs[0],
    y=pcs[1],
    hover_name='Occupations',
    title='Interactive Job Titles Map',
    labels={pcs[0]: 'Principal Component 1', pcs[1]: 'Principal Component 2'},
)

# Show the hover box for the single point closest to the cursor.
fig.update_layout(hovermode='closest')

# Show the plot
fig.show()

In [12]:
# Number of occupations to list at each end of each principal component.
n = 10

# Select each extreme once and read both the names and the scores from the same rows.
largest_pc1 = jobs_df.nlargest(n, pcs[0])
smallest_pc1 = jobs_df.nsmallest(n, pcs[0])
largest_pc2 = jobs_df.nlargest(n, pcs[1])
smallest_pc2 = jobs_df.nsmallest(n, pcs[1])

top_n_desc_pc1 = largest_pc1.Occupations.values
top_n_asc_pc1 = smallest_pc1.Occupations.values
top_n_desc_pc2 = largest_pc2.Occupations.values
top_n_asc_pc2 = smallest_pc2.Occupations.values

top_n_desc_pc1_values = largest_pc1[pcs[0]].values
top_n_asc_pc1_values = smallest_pc1[pcs[0]].values
top_n_desc_pc2_values = largest_pc2[pcs[1]].values
top_n_asc_pc2_values = smallest_pc2[pcs[1]].values

# Column headers are derived from `n` so they always match the number of rows shown
# (they were previously hard-coded as "Top 20" while n was 10).
table = pd.DataFrame({
    f'Top {n} Descending PC1': top_n_desc_pc1,
    'PC1_desc': top_n_desc_pc1_values,
    f'Top {n} Ascending PC1': top_n_asc_pc1,
    'PC1_asc': top_n_asc_pc1_values,
    f'Top {n} Descending PC2': top_n_desc_pc2,
    'PC2_desc': top_n_desc_pc2_values,
    f'Top {n} Ascending PC2': top_n_asc_pc2,
    'PC2_asc': top_n_asc_pc2_values
})

table

Unnamed: 0,Top 20 Descending PC1,PC1_desc,Top 20 Ascending PC1,PC1_asc,Top 20 Descending PC2,PC2_desc,Top 20 Ascending PC2,PC2_asc
0,Orthopaedic,4.222789,Miner,-0.938913,Chambermaid,0.800524,Showman,-0.660645
1,Physiotherapist,3.695267,Proprietor,-0.897602,Thatcher,0.67116,Merchandiser,-0.61638
2,Paediatrician,2.994359,Regulator,-0.880869,Nanny,0.655017,Footballer,-0.581848
3,Hypnotherapist,2.706143,Chartered,-0.873734,Psychoanalyst,0.626943,Importer,-0.553037
4,Anaesthetist,2.542445,Shepherd,-0.869951,Maid,0.595882,Treasurer,-0.530108
5,Palaeontologist,2.214234,Operations,-0.867494,Midwife,0.594708,Balloonist,-0.521696
6,Paramedic,2.209218,Doctor,-0.83072,Matron,0.546732,Metallurgist,-0.49568
7,Psychotherapist,2.204556,Dealer,-0.820622,Housewife,0.545894,Merchant,-0.476741
8,Restaurateur,2.192577,Employee,-0.820273,Potter,0.513818,Negotiator,-0.473036
9,Osteopath,2.053368,Attendant,-0.817364,Hairdresser,0.509636,Undertaker,-0.462981
