In [251]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')
import warnings
warnings.filterwarnings("ignore")


from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [304]:
gendered_words = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'gender_binary_words.csv'), sep="|")
jobs_df = pd.read_csv(os.path.join(EXTERNAL_DATA_DIR, 'occupations.csv'), sep="|")
print(jobs_df.head())
print(gendered_words.head())

          Occupations
0          Accountant
1  Accounts Assistant
2      Accounts Clerk
3    Accounts Manager
4      Accounts Staff
         word  gender_binary
0       abbot              0
1      abbots              0
2   adulterer              0
3  adulterers              0
4      airman              0


# Tokenize Gendered Words

In [252]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




In [253]:
gendered_words

Unnamed: 0,word,gender_binary,bert_token
0,abbot,0,"[-0.39571184, -0.093838364, 0.06868138, 0.1723..."
1,abbots,0,"[-0.18790531, -0.077521764, -0.3876859, 0.1816..."
2,adulterer,0,"[-0.45352724, -0.38397712, -0.25277817, -0.266..."
3,adulterers,0,"[-0.17249976, -0.15101261, -0.3717648, -0.0279..."
4,airman,0,"[-0.20044291, -0.020562049, -0.276353, -0.0372..."
...,...,...,...
510,woman,1,"[-0.34769166, -0.20875905, 0.12518345, 0.07620..."
511,womankind,1,"[-0.24836135, -0.283562, -0.19828826, 0.071805..."
512,womanly,1,"[-0.3582225, -0.31381592, -0.16148914, 0.21661..."
513,womanpower,1,"[-0.042981803, -0.32953677, 0.07738724, 0.1452..."


# Linear SVM 

In [254]:
import sklearn

from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(preprocessing.normalize(gendered_words.bert_token.tolist(), norm='l2'))
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)

print("Accuracy: ", clf.score(X, y))


Accuracy:  0.9980582524271845


In [281]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    svm_coef = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return svm_coef

svm_coef = get_top_features(clf)
svm_coef.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
286,286,0.093813,0.093813
251,251,0.090118,0.090118
521,521,0.080613,0.080613
636,636,-0.077488,0.077488
592,592,0.072606,0.072606
500,500,-0.072015,0.072015
464,464,0.070462,0.070462
708,708,-0.069187,0.069187
404,404,0.068837,0.068837
525,525,-0.068009,0.068009


In [282]:
def predict_gender(test_word, tokenizer, model, clf):
    test_word_encoded = tokenizer(test_word, return_tensors='pt', padding=True, truncation=True)
    
    with torch.no_grad():
        output = model(**test_word_encoded)
        
    test_word_embedding = output.last_hidden_state.mean(dim=1).numpy()
    prediction = clf.predict(test_word_embedding)
    
    prediction_dict = {0: 'male', 1: 'female'}
    print("Word: ", test_word)
    print("Prediction: ", prediction_dict[prediction[0]])

predict_gender('aerobics instructor', tokenizer, model, clf)


Word:  aerobics instructor
Prediction:  male


In [283]:
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

# Fit logistic regression with L1 regularization
logit = LogisticRegression(solver="liblinear", penalty="l1")
logit.fit(X, y)
print("Accuracy: ", logit.score(X, y))

Accuracy:  0.8679611650485437


In [286]:
# Get the coefficients
coefficients = logit.coef_[0]
coefficients_abs = np.abs(coefficients)

logit_coef = pd.DataFrame({'feature': range(len(coefficients)), 'coefficient': coefficients, 'absolute_coefficient': coefficients_abs})
logit_coef = logit_coef.sort_values(by='absolute_coefficient', ascending=False)
logit_coef = logit_coef[logit_coef.absolute_coefficient > 0]
logit_coef.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
52,52,14.650919,14.650919
1,1,-11.008606,11.008606
74,74,-9.811957,9.811957
668,668,-5.637709,5.637709
653,653,5.520532,5.520532
394,394,5.160087,5.160087
141,141,4.268275,4.268275
765,765,-3.395684,3.395684
586,586,3.061048,3.061048
374,374,-2.782938,2.782938


In [285]:
def predict_gender(test_word, tokenizer, model, logit):
    test_word_encoded = tokenizer(test_word, return_tensors='pt', padding=True, truncation=True)
    
    with torch.no_grad():
        output = model(**test_word_encoded)
        
    test_word_embedding = output.last_hidden_state.mean(dim=1).numpy()
    prediction = logit.predict(test_word_embedding)
    
    prediction_dict = {0: 'male', 1: 'female'}
    
    predict_prob = logit.predict_proba(test_word_embedding)
    
    print("Word: ", test_word)
    print("Maleness coefficient: ", predict_prob[0][0])
    print("Femaleness coefficient: ", predict_prob[0][1])

predict_gender('student', tokenizer, model, logit)

Word:  student
Maleness coefficient:  0.8801195657480964
Femaleness coefficient:  0.11988043425190362


## Using SVM coefficients

In [331]:
# n_features = 50
# selected_features = svm_coef.feature[:n_features].values
# selected_features

array([286, 251, 521, 636, 592, 500, 464, 708, 404, 525, 417, 276, 237,
        52, 342, 373, 141,  15, 253, 258, 670, 394,  74, 374, 741, 478,
       138, 618, 248, 247, 102, 756,  90, 710, 353, 575, 702, 319, 260,
       443,  67, 481, 406, 448, 532, 208, 514, 393, 503, 492])

## Using Logistic Regression Coefficients

In [366]:
selected_features = logit_coef.feature.values
selected_features

array([ 52,   1,  74, 668, 653, 394, 141, 765, 586, 374,  41,  77, 104,
       493, 138, 308, 549, 351])

# Tokenize Jobs

In [367]:
text = jobs_df.Occupations.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

n_feature_embeddings = word_embeddings[:, selected_features]
n_feature_embeddings

array([[-0.19978876, -0.06206597,  0.12169666, ..., -4.4832196 ,
        -0.30024964,  0.36842126],
       [ 0.04419399, -0.10641553, -0.12715694, ..., -4.3804145 ,
        -0.48359773,  0.38138357],
       [ 0.09784252, -0.05605094, -0.1385598 , ..., -4.178088  ,
        -0.25966644,  0.4798153 ],
       ...,
       [-0.1922184 , -0.17913511,  0.23922016, ..., -4.672367  ,
        -0.37988734,  0.21258233],
       [-0.39185643, -0.03170935,  0.17734289, ..., -4.952965  ,
        -0.15553771,  0.30577588],
       [-0.16156787,  0.09780957,  0.25877026, ..., -4.044263  ,
        -0.2726462 ,  0.622368  ]], dtype=float32)

In [368]:
jobs_df['bert_token'] = list(word_embeddings)
jobs_df['n_feature_embeddings'] = list(n_feature_embeddings)

# Perform PCA

In [395]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1'] = principal_components[:, 0]
jobs_df['pc2'] = principal_components[:, 1]

jobs_df.head()

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,single_component
0,Accountant,"[0.044247508, -0...","[-0.19978876, -0...",-0.239164,0.248409,-0.239163
1,Accounts Assistant,"[0.06723147, -0....","[0.044193994, -0...",-0.108442,0.405987,-0.108442
2,Accounts Clerk,"[-0.02651554, -0...","[0.09784252, -0....",0.107383,0.319914,0.107381
3,Accounts Manager,"[-0.05724597, -0...","[-0.15328938, -0...",-0.013595,0.344182,-0.013595
4,Accounts Staff,"[0.07755995, 0.1...","[0.06362797, 0.1...",-0.074016,0.440088,-0.074019


In [396]:
# Performe SVD to reduce to 2 dimensions
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2)
principal_components = svd.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1_svd'] = principal_components[:, 0]
jobs_df['pc2_svd'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1_svd', ascending=True).head(20)

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,single_component,pc1_svd,pc2_svd
629,Orthopaedic,"[0.046270955, 0....","[-0.22345452, 0....",4.423442,0.096471,4.423438,-0.015504,0.71078
675,Physiotherapist,"[0.017191887, 0....","[0.026696831, 0....",3.880497,0.345569,3.880497,0.623109,0.922384
636,Paediatrician,"[-0.042912923, -...","[-0.24091673, -0...",3.135082,-0.061152,3.135082,1.327702,0.99776
632,Outdoor Pursuits,"[-0.50267744, 0....","[0.5155558, 0.28...",2.798404,0.039536,2.798404,1.480638,0.132884
1003,Technical Co-ord...,"[-0.10452112, -0...","[-0.46959028, -0...",2.820446,-0.113952,2.820448,1.608433,0.777323
378,Hypnotherapist,"[-0.07441852, 0....","[-0.08048147, 0....",2.913041,0.092675,2.913041,1.677707,1.110435
40,Anaesthetist,"[-0.15028015, 0....","[-0.09870565, 0....",2.66778,-0.229752,2.667776,1.715765,0.624642
444,Law Clerk,"[0.05829737, -0....","[-0.11488726, -0...",2.588254,0.30137,2.588255,1.746751,0.258873
749,Project Co-ordin...,"[-0.25341398, -0...","[-0.4698851, -0....",2.642978,-0.278272,2.642978,1.808709,0.822278
1062,Training Co-ordi...,"[-0.031270064, -...","[-0.52596426, -0...",2.532621,-0.2562,2.532621,1.92417,0.935504


In [404]:
from sklearn.manifold import Isomap

isomap = Isomap(n_components=2)
principal_components = isomap.fit_transform(n_feature_embeddings)

jobs_df['pc1_isomap'] = principal_components[:, 0]
jobs_df['pc2_isomap'] = principal_components[:, 1]

jobs_df.head()


Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,single_component,pc1_svd,pc2_svd,pc1_tsne,pc2_tsne,pc1_isomap,pc2_isomap
0,Accountant,"[0.044247508, -0...","[-0.19978876, -0...",-0.239164,0.248409,-0.239163,4.537879,-0.090603,-6.045249,20.296789,-0.22704,0.867494
1,Accounts Assistant,"[0.06723147, -0....","[0.044193994, -0...",-0.108442,0.405987,-0.108442,4.441882,0.064759,-2.344534,14.962835,-0.350741,0.697521
2,Accounts Clerk,"[-0.02651554, -0...","[0.09784252, -0....",0.107383,0.319914,0.107381,4.225747,0.112226,-2.241383,16.208851,0.065738,0.641266
3,Accounts Manager,"[-0.05724597, -0...","[-0.15328938, -0...",-0.013595,0.344182,-0.013595,4.364539,0.124592,-1.981782,15.283423,-0.190069,0.868234
4,Accounts Staff,"[0.07755995, 0.1...","[0.06362797, 0.1...",-0.074016,0.440088,-0.074019,4.388669,-0.021977,-3.004037,15.687756,-0.385267,0.796602


In [405]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
principal_components = tsne.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1_tsne'] = principal_components[:, 0]
jobs_df['pc2_tsne'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1_tsne', ascending=True).head(20)

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,single_component,pc1_svd,pc2_svd,pc1_tsne,pc2_tsne,pc1_isomap,pc2_isomap
125,Brewer,"[-0.20677981, 0....","[0.14855385, 0.2...",-0.508354,-0.118541,-0.508357,4.74898,-0.15373,-46.925892,-3.865533,-2.319584,-0.026867
717,Potter,"[-0.16275418, 0....","[-0.114286214, 0...",-0.647138,-0.051848,-0.647138,4.893104,-0.124187,-46.793495,-5.083681,-1.905435,0.004119
711,Porter,"[-0.116864845, 0...","[-0.1459898, 0.0...",-0.646137,-0.067348,-0.64614,4.931566,-0.069645,-46.371677,-5.913837,-2.229104,-0.269318
86,Baker,"[-0.13666531, 0....","[0.03630696, 0.1...",-0.518459,-0.080683,-0.518464,4.780116,-0.188738,-46.319221,-4.194896,-1.887623,0.043608
858,Saw Miller,"[0.085206546, 0....","[-0.052450217, 0...",-0.799317,-0.066832,-0.799324,5.032082,-0.233603,-46.138741,-6.887968,-1.749146,-0.337027
287,Gardener,"[-0.030585885, 0...","[0.08608325, 0.0...",-0.572441,0.520028,-0.572442,4.752797,-0.459971,-45.842972,5.805202,-0.884309,0.853499
591,Nanny,"[0.02293092, -0....","[0.21600929, -0....",-0.258633,0.68377,-0.258633,4.422135,-0.362191,-45.599995,6.865522,-0.341183,0.95593
150,Butler,"[-0.27370402, 0....","[-0.058345884, 0...",-0.63027,0.077361,-0.630272,4.82591,-0.394094,-45.299191,-4.723615,-1.868098,0.113091
104,Bill Poster,"[0.22004102, 0.0...","[-0.25430095, 0....",-0.660824,-0.035492,-0.660828,4.852094,-0.40699,-45.113091,-6.012775,-1.92652,0.105984
883,Shepherd,"[-0.08332963, 0....","[0.045657, 0.130...",-0.790265,0.289838,-0.790269,5.020138,-0.398599,-45.077026,-4.006342,-1.868978,-0.135112


In [409]:
from sklearn.manifold import MDS

mds = MDS(n_components=2, random_state=42)
principal_components = mds.fit_transform(n_feature_embeddings)

jobs_df['pc1_mds'] = principal_components[:, 0]
jobs_df['pc2_mds'] = principal_components[:, 1]

jobs_df.head()


Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,single_component,pc1_svd,pc2_svd,pc1_tsne,pc2_tsne,pc1_isomap,pc2_isomap,pc1_mds,pc2_mds
0,Accountant,"[0.044247508, -0...","[-0.19978876, -0...",-0.239164,0.248409,-0.239163,4.537879,-0.090603,-6.045249,20.296789,-0.22704,0.867494,0.198056,-0.278411
1,Accounts Assistant,"[0.06723147, -0....","[0.044193994, -0...",-0.108442,0.405987,-0.108442,4.441882,0.064759,-2.344534,14.962835,-0.350741,0.697521,0.608928,-0.241264
2,Accounts Clerk,"[-0.02651554, -0...","[0.09784252, -0....",0.107383,0.319914,0.107381,4.225747,0.112226,-2.241383,16.208851,0.065738,0.641266,0.563273,0.099349
3,Accounts Manager,"[-0.05724597, -0...","[-0.15328938, -0...",-0.013595,0.344182,-0.013595,4.364539,0.124592,-1.981782,15.283423,-0.190069,0.868234,0.669417,-0.119278
4,Accounts Staff,"[0.07755995, 0.1...","[0.06362797, 0.1...",-0.074016,0.440088,-0.074019,4.388669,-0.021977,-3.004037,15.687756,-0.385267,0.796602,0.901786,-0.33744


In [410]:
import plotly.express as px
# pcs = ['pc1_tsne', 'pc2_tsne']
# pcs = ['pc1_isomap', 'pc2_isomap']
# pcs = ['pc1_svd', 'pc2_svd']
# pcs = ['pc1', 'pc2']
pcs = ['pc1_mds', 'pc2_mds']

fig = px.scatter(jobs_df, x=pcs[0], y=pcs[1], hover_name='Occupations',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()

In [411]:
n = 10

top_n_desc_pc1 = jobs_df.nlargest(n, pcs[0]).Occupations.values
top_n_asc_pc1 = jobs_df.nsmallest(n, pcs[0]).Occupations.values
top_n_desc_pc2 = jobs_df.nlargest(n, pcs[1]).Occupations.values
top_n_asc_pc2 = jobs_df.nsmallest(n, pcs[1]).Occupations.values

top_n_desc_pc1_values = jobs_df.nlargest(n, pcs[0])[f'{pcs[0]}'].values
top_n_asc_pc1_values = jobs_df.nsmallest(n, pcs[0])[f'{pcs[0]}'].values
top_n_desc_pc2_values = jobs_df.nlargest(n, pcs[1])[f'{pcs[1]}'].values
top_n_asc_pc2_values = jobs_df.nsmallest(n, pcs[1])[f'{pcs[1]}'].values

table = pd.DataFrame({
    'Top 20 Descending PC1': top_n_desc_pc1,
    'PC1_desc': top_n_desc_pc1_values,
    'Top 20 Ascending PC1': top_n_asc_pc1,
    'PC1_asc': top_n_asc_pc1_values,
    'Top 20 Descending PC2': top_n_desc_pc2,
    'PC2_desc': top_n_desc_pc2_values,
    'Top 20 Ascending PC2': top_n_asc_pc2,
    'PC2_asc': top_n_asc_pc2_values
})

table

Unnamed: 0,Top 20 Descending PC1,PC1_desc,Top 20 Ascending PC1,PC1_asc,Top 20 Descending PC2,PC2_desc,Top 20 Ascending PC2,PC2_asc
0,Physiotherapist,1.770481,Nanny,-1.245327,Orthopaedic,4.474294,Van Driver,-1.247745
1,Travel Represent...,1.757049,Lavatory Attendant,-1.143542,Physiotherapist,3.567505,Shepherd,-1.239356
2,Local Government,1.521821,Hot Foil Printer,-1.128218,Outdoor Pursuits,3.178159,Driver,-1.229689
3,Special Needs,1.506704,Lampshade Maker,-1.093377,Paediatrician,3.038462,Tanner,-1.220778
4,Remedial Therapist,1.480227,Manufacturing,-1.062038,Technical Co-ord...,2.901367,Recreational,-1.20379
5,Hypnotherapist,1.429451,Joinery Consultant,-1.046688,Project Co-ordin...,2.842088,Gardener,-1.198772
6,Building Estimator,1.401315,Retired,-1.035588,Hypnotherapist,2.777588,Saw Miller,-1.17403
7,Sub-Postmistress,1.398579,Rigger,-1.033398,Anaesthetist,2.767946,Miner,-1.171901
8,Home Economist,1.366468,Partition Erector,-1.03293,Training Co-ordi...,2.717052,Nun,-1.168702
9,Palaeobotanist,1.347426,Temperature Time,-1.029732,Locum Pharmacist,2.572482,Bill Poster,-1.151454


In [375]:
from gensim.models import Word2Vec

# Load the pre-trained Word2Vec model
word2vec_model = Word2Vec.load('GoogleNews-vectors-negative300.bin', binary=True)

# Tokenize Gendered Words
text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

# Convert word embeddings to Word2Vec format
word2vec_embeddings = []
for word_embedding in word_embeddings:
    word2vec_embeddings.append(word2vec_model.wv[word_embedding])

gendered_words['word2vec_embedding'] = word2vec_embeddings

# Normalize the Word2Vec embeddings
x = preprocessing.normalize(gendered_words.word2vec_embedding.tolist(), norm='l1')
x

TypeError: load() got an unexpected keyword argument 'binary'