In [475]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')
import warnings
warnings.filterwarnings("ignore")


from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [476]:
gendered_words = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'gender_binary_words.csv'), sep="|")
jobs_df = pd.read_csv(os.path.join(EXTERNAL_DATA_DIR, 'occupations.csv'), sep="|")
print(jobs_df.head())
print(gendered_words.head())

          Occupations
0          Accountant
1  Accounts Assistant
2      Accounts Clerk
3    Accounts Manager
4      Accounts Staff
         word  gender_binary
0       abbot              0
1      abbots              0
2   adulterer              0
3  adulterers              0
4      airman              0


# Tokenize Gendered Words - BERT

In [477]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




# Tokenize gendered words - Word2Vec

In [478]:
from gensim.models import KeyedVectors

# Specify the path to the pre-trained Word2Vec model
MODEL_PATH = '/home/nauel/bert_gender_bias/data/0_external/GoogleNews-vectors-negative300.bin'

# Load the model using KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True)

# Generate tokens with Word2Vec for the jobs dataset
gendered_words['word2vec_token'] = gendered_words['word'].apply(lambda x: word2vec_model[x] if x in word2vec_model else None)

In [479]:
gendered_words = gendered_words.dropna(subset=['word2vec_token'])
gendered_words

Unnamed: 0,word,gender_binary,bert_token,word2vec_token
0,abbot,0,"[-0.39571184, -0...","[0.40039062, 0.4..."
1,abbots,0,"[-0.18790531, -0...","[0.30664062, 0.3..."
2,adulterer,0,"[-0.45352724, -0...","[0.42773438, -0...."
3,adulterers,0,"[-0.17249976, -0...","[-0.14160156, -0..."
4,airman,0,"[-0.20044291, -0...","[0.49023438, -0...."
...,...,...,...,...
510,woman,1,"[-0.34769166, -0...","[0.24316406, -0...."
511,womankind,1,"[-0.24836135, -0...","[-0.096191406, -..."
512,womanly,1,"[-0.3582225, -0....","[0.01965332, -0...."
513,womanpower,1,"[-0.042981803, -...","[-0.0041503906, ..."


# Linear SVM 

In [480]:
import sklearn

from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(gendered_words.word2vec_token.tolist())
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)

print("Accuracy: ", clf.score(X, y))


Accuracy:  0.9976798143851509


In [481]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    svm_coef = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return svm_coef

svm_coef = get_top_features(clf)
svm_coef.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
202,202,0.244879,0.244879
152,152,0.221939,0.221939
243,243,-0.221295,0.221295
149,149,-0.211997,0.211997
282,282,-0.2081,0.2081
178,178,0.200955,0.200955
119,119,-0.1975,0.1975
101,101,-0.194454,0.194454
160,160,0.193501,0.193501
256,256,-0.191224,0.191224


In [482]:
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

# Fit logistic regression with L1 regularization
logit = LogisticRegression(solver="liblinear", penalty="l1")
logit.fit(X, y)
print("Accuracy: ", logit.score(X, y))

Accuracy:  0.9651972157772621


In [483]:
# Get the coefficients
coefficients = logit.coef_[0]
coefficients_abs = np.abs(coefficients)

logit_coef = pd.DataFrame({'feature': range(len(coefficients)), 'coefficient': coefficients, 'absolute_coefficient': coefficients_abs})
logit_coef = logit_coef.sort_values(by='absolute_coefficient', ascending=False)
logit_coef = logit_coef[logit_coef.absolute_coefficient > 0]
logit_coef.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
152,152,4.21372,4.21372
192,192,3.231172,3.231172
221,221,3.157689,3.157689
139,139,-2.488401,2.488401
154,154,2.474056,2.474056
149,149,-2.465861,2.465861
282,282,-2.337576,2.337576
79,79,2.214688,2.214688
137,137,-1.948616,1.948616
155,155,-1.647881,1.647881


In [484]:
def predict_gender(test_word, tokenizer, model, logit):
    test_word_encoded = tokenizer(test_word, return_tensors='pt', padding=True, truncation=True)
    
    with torch.no_grad():
        output = model(**test_word_encoded)
        
    test_word_embedding = output.last_hidden_state.mean(dim=1).numpy()
    prediction = logit.predict(test_word_embedding)
    
    prediction_dict = {0: 'male', 1: 'female'}
    
    predict_prob = logit.predict_proba(test_word_embedding)
    
    print("Word: ", test_word)
    print("Maleness coefficient: ", predict_prob[0][0])
    print("Femaleness coefficient: ", predict_prob[0][1])

predict_gender('student', tokenizer, model, logit)

ValueError: X has 768 features, but LogisticRegression is expecting 300 features as input.

## Using SVM coefficients

In [None]:
# n_features = 50
# selected_features = svm_coef.feature[:n_features].values
# selected_features

array([286, 251, 521, 636, 592, 500, 464, 708, 404, 525, 417, 276, 237,
        52, 342, 373, 141,  15, 253, 258, 670, 394,  74, 374, 741, 478,
       138, 618, 248, 247, 102, 756,  90, 710, 353, 575, 702, 319, 260,
       443,  67, 481, 406, 448, 532, 208, 514, 393, 503, 492])

## Using Logistic Regression Coefficients

In [495]:
selected_features = logit_coef.feature.values
selected_features

array([152, 192, 221, 139, 154, 149, 282,  79, 137, 155, 289, 294, 133,
       125, 243,  80, 201,  40, 202, 119, 261, 188, 242, 296, 107, 298,
        33,  99,  91,  96,  50, 297,   9, 168,  54, 214, 218, 220, 117,
        74,  85, 275,  66,  49, 114, 167, 113,  58, 122,  38, 115, 162,
       213,  31,  30, 272, 299, 210,  14,  75, 239, 145, 291, 170,  19,
       147])

# Tokenize Jobs

In [496]:
text = jobs_df.Occupations.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

n_feature_embeddings = word_embeddings[:, selected_features]
n_feature_embeddings

array([[-0.3130405 , -0.6442018 , -0.1719227 , ..., -0.4128251 ,
        -0.16746786,  0.28646576],
       [-0.17342341, -0.23936778, -0.11509642, ..., -0.43037054,
        -0.33319804,  0.25351226],
       [-0.01937636, -0.30707565, -0.1506607 , ..., -0.36524117,
        -0.3494278 ,  0.27791062],
       ...,
       [-0.20818092, -0.19345118,  0.14640152, ..., -0.5612403 ,
        -0.10042609,  0.10388439],
       [-0.18364595, -0.37938988,  0.04717118, ..., -0.5459751 ,
        -0.05600512,  0.22418225],
       [-0.38795102, -0.4015171 ,  0.03545552, ..., -0.52952063,
         0.1475778 ,  0.17352837]], dtype=float32)

In [497]:
jobs_df['bert_token'] = list(word_embeddings)
jobs_df['n_feature_embeddings'] = list(n_feature_embeddings)

# Perform PCA

In [498]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1'] = principal_components[:, 0]
jobs_df['pc2'] = principal_components[:, 1]

jobs_df.head()

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,pc1_svd,pc2_svd,pc1_isomap,pc2_isomap,pc1_tsne,pc2_tsne,pc1_mds,pc2_mds
0,Accountant,"[0.044247508, -0...","[-0.3130405, -0....",-0.667979,0.442955,4.537879,-0.090671,-0.22704,0.867494,-6.045249,20.296789,0.198056,-0.278411
1,Accounts Assistant,"[0.06723147, -0....","[-0.17342341, -0...",-0.224419,0.246803,4.441882,0.064551,-0.350742,0.697521,-2.344534,14.962835,0.608928,-0.241264
2,Accounts Clerk,"[-0.02651554, -0...","[-0.019376364, -...",-0.137285,0.398781,4.225747,0.111996,0.065738,0.641266,-2.241383,16.208851,0.563273,0.099349
3,Accounts Manager,"[-0.05724597, -0...","[-0.14583607, -0...",-0.031491,0.222525,4.364539,0.124321,-0.190069,0.868234,-1.981782,15.283423,0.669417,-0.119278
4,Accounts Staff,"[0.07755995, 0.1...","[-0.03824952, 0....",-0.572433,0.110937,4.388669,-0.022252,-0.385267,0.796602,-3.004037,15.687756,0.901786,-0.33744


In [499]:
# Performe SVD to reduce to 2 dimensions
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2)
principal_components = svd.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1_svd'] = principal_components[:, 0]
jobs_df['pc2_svd'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1_svd', ascending=True).head(20)

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,pc1_svd,pc2_svd,pc1_isomap,pc2_isomap,pc1_tsne,pc2_tsne,pc1_mds,pc2_mds
982,Tanner,"[-0.32705253, 0....","[-0.084228225, -...",-0.285781,-0.883754,0.654928,-0.431208,-1.080973,0.538974,-43.81813,2.15044,0.386667,-1.220778
902,Shot Blaster,"[-0.110063165, -...","[0.03911496, -0....",0.587934,-0.621423,0.686889,0.454566,-0.545594,-0.454357,-10.770896,-22.890379,-0.886715,-0.02829
433,Lampshade Maker,"[0.32886472, 0.0...","[-0.17451307, -0...",0.714483,-0.920511,0.696025,0.571863,-0.659023,0.162491,-8.546111,29.727943,-1.093377,0.130801
345,Home Help,"[0.14974697, -0....","[0.025889888, 0....",-0.172455,-0.949016,0.729951,-0.322465,-1.016315,-0.018735,-43.195755,-1.125274,0.097459,-0.768333
343,Hod Carrier,"[-0.021728573, 0...","[0.16973181, 0.1...",-0.222385,-0.983164,0.774932,-0.369473,0.702018,0.362211,32.718037,-9.75327,0.695858,0.547964
559,Meter Reader,"[-0.13549629, -0...","[-0.12808928, -0...",0.212122,-0.738937,0.827411,0.089508,-1.193058,-0.329026,-29.838167,-13.255314,-0.743664,-0.834017
710,Pools Collector,"[0.02780397, -0....","[-0.24109717, 0....",0.020352,-0.909422,0.830561,-0.105688,-1.508064,-0.093383,-39.519886,-5.505763,-0.278006,-1.007023
1101,Tyre Inspector,"[-0.1899545, -0....","[-0.26284605, 0....",-0.551932,-0.915412,0.84153,-0.683537,-1.554792,-0.134367,-32.144917,-2.989563,-0.266211,-1.080422
552,Messenger,"[0.11227638, -0....","[-0.3295644, -0....",-0.12695,-0.71027,0.845177,-0.251795,-1.25228,0.456063,-39.693432,2.069089,0.597562,-0.66426
525,Masseuse,"[-0.34479845, -0...","[-0.09635216, -0...",0.318682,-0.73663,0.846825,0.186995,-0.043533,0.356007,-42.387512,8.209256,0.97438,-0.383119


In [500]:
from sklearn.manifold import Isomap

isomap = Isomap(n_components=2)
principal_components = isomap.fit_transform(n_feature_embeddings)

jobs_df['pc1_isomap'] = principal_components[:, 0]
jobs_df['pc2_isomap'] = principal_components[:, 1]

jobs_df.head()


Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,pc1_svd,pc2_svd,pc1_isomap,pc2_isomap,pc1_tsne,pc2_tsne,pc1_mds,pc2_mds
0,Accountant,"[0.044247508, -0...","[-0.3130405, -0....",-0.667979,0.442955,2.193111,-0.574635,-2.730256,1.367201,-6.045249,20.296789,0.198056,-0.278411
1,Accounts Assistant,"[0.06723147, -0....","[-0.17342341, -0...",-0.224419,0.246803,1.573782,-0.22091,-1.738606,0.16994,-2.344534,14.962835,0.608928,-0.241264
2,Accounts Clerk,"[-0.02651554, -0...","[-0.019376364, -...",-0.137285,0.398781,1.745842,-0.103433,-0.963522,0.013279,-2.241383,16.208851,0.563273,0.099349
3,Accounts Manager,"[-0.05724597, -0...","[-0.14583607, -0...",-0.031491,0.222525,1.633837,-0.030033,-1.94678,0.9218,-1.981782,15.283423,0.669417,-0.119278
4,Accounts Staff,"[0.07755995, 0.1...","[-0.03824952, 0....",-0.572433,0.110937,1.501214,-0.58471,-1.561509,-0.394172,-3.004037,15.687756,0.901786,-0.33744


In [501]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
principal_components = tsne.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1_tsne'] = principal_components[:, 0]
jobs_df['pc2_tsne'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1_tsne', ascending=True).head(20)

Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,pc1_svd,pc2_svd,pc1_isomap,pc2_isomap,pc1_tsne,pc2_tsne,pc1_mds,pc2_mds
61,Artist,"[0.13974144, 0.1...","[-0.011005317, -...",-0.649953,-0.247619,1.815153,-0.635786,-1.078334,-0.688689,-41.666073,0.70512,0.424989,-0.862018
129,Broadcaster,"[-0.12483895, -0...","[-0.060281347, -...",-0.470038,0.259449,2.111368,-0.407111,-1.493296,-0.950941,-40.078125,1.516454,-0.861523,-0.229423
930,Sportsman,"[-0.10744917, 0....","[-0.09439827, -0...",-0.625401,0.298931,1.970581,-0.562144,-2.153754,-0.908366,-39.806179,-4.545262,0.602369,-0.463362
420,Jockey,"[-0.25613517, -0...","[-0.354601, -0.2...",-0.629705,0.107198,1.964541,-0.582588,-1.266314,-0.931924,-39.265171,-6.232038,-0.644111,-0.676448
755,Promoter,"[-0.0660972, -0....","[-0.36437982, -0...",-0.26534,0.121732,1.816271,-0.244183,-1.197222,-0.321415,-39.236115,2.026346,0.237313,-0.787952
828,Retired,"[-0.27229792, 0....","[-0.08788954, -0...",-0.775697,0.282937,1.636517,-0.756024,-2.387108,-0.416142,-37.960125,-1.11097,-1.035588,-0.625369
625,Organiser,"[-0.09420863, -0...","[-0.03841102, -0...",-0.489068,0.122899,1.68804,-0.481771,-1.579579,-0.369502,-37.398239,2.575751,0.288148,-0.193328
786,Racing Organiser,"[-0.0005795362, ...","[-0.09331068, -0...",-0.103949,-0.127022,1.369631,-0.134569,0.002397,-0.67849,-37.316998,2.442214,0.468343,-0.441936
1009,Technical Liaison,"[0.025975974, -0...","[-0.057822738, -...",-0.234151,0.023094,1.433389,-0.262737,-1.729768,1.047616,-36.968315,3.984808,0.236983,-0.554627
511,Marine Pilot,"[-0.28293496, 0....","[-0.06672393, -0...",-0.284167,0.539173,2.104402,-0.20076,-1.284958,0.959407,-34.761631,-5.813978,0.313714,0.264263


In [502]:
from sklearn.manifold import MDS

mds = MDS(n_components=2, random_state=42)
principal_components = mds.fit_transform(n_feature_embeddings)

jobs_df['pc1_mds'] = principal_components[:, 0]
jobs_df['pc2_mds'] = principal_components[:, 1]

jobs_df.head()


Unnamed: 0,Occupations,bert_token,n_feature_embeddings,pc1,pc2,pc1_svd,pc2_svd,pc1_isomap,pc2_isomap,pc1_tsne,pc2_tsne,pc1_mds,pc2_mds
0,Accountant,"[0.044247508, -0...","[-0.3130405, -0....",-0.667979,0.442955,2.193111,-0.574635,-2.730256,1.367201,-19.408764,29.701332,-1.186377,-0.158196
1,Accounts Assistant,"[0.06723147, -0....","[-0.17342341, -0...",-0.224419,0.246803,1.573782,-0.22091,-1.738606,0.16994,-18.387089,5.144744,-0.769113,0.437347
2,Accounts Clerk,"[-0.02651554, -0...","[-0.019376364, -...",-0.137285,0.398781,1.745842,-0.103433,-0.963522,0.013279,-17.584682,2.039425,-0.545254,0.544086
3,Accounts Manager,"[-0.05724597, -0...","[-0.14583607, -0...",-0.031491,0.222525,1.633837,-0.030033,-1.94678,0.9218,-18.322716,5.878531,-0.700004,0.885252
4,Accounts Staff,"[0.07755995, 0.1...","[-0.03824952, 0....",-0.572433,0.110937,1.501214,-0.58471,-1.561509,-0.394172,-17.264246,4.431545,-1.452816,-0.15325


In [503]:
import plotly.express as px
# pcs = ['pc1_tsne', 'pc2_tsne']
# pcs = ['pc1_isomap', 'pc2_isomap']
# pcs = ['pc1_svd', 'pc2_svd']
pcs = ['pc1', 'pc2']
# pcs = ['pc1_mds', 'pc2_mds']

fig = px.scatter(jobs_df, x=pcs[0], y=pcs[1], hover_name='Occupations',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()

In [504]:
n = 10

top_n_desc_pc1 = jobs_df.nlargest(n, pcs[0]).Occupations.values
top_n_asc_pc1 = jobs_df.nsmallest(n, pcs[0]).Occupations.values
top_n_desc_pc2 = jobs_df.nlargest(n, pcs[1]).Occupations.values
top_n_asc_pc2 = jobs_df.nsmallest(n, pcs[1]).Occupations.values

top_n_desc_pc1_values = jobs_df.nlargest(n, pcs[0])[f'{pcs[0]}'].values
top_n_asc_pc1_values = jobs_df.nsmallest(n, pcs[0])[f'{pcs[0]}'].values
top_n_desc_pc2_values = jobs_df.nlargest(n, pcs[1])[f'{pcs[1]}'].values
top_n_asc_pc2_values = jobs_df.nsmallest(n, pcs[1])[f'{pcs[1]}'].values

table = pd.DataFrame({
    'Top 20 Descending PC1': top_n_desc_pc1,
    'PC1_desc': top_n_desc_pc1_values,
    'Top 20 Ascending PC1': top_n_asc_pc1,
    'PC1_asc': top_n_asc_pc1_values,
    'Top 20 Descending PC2': top_n_desc_pc2,
    'PC2_desc': top_n_desc_pc2_values,
    'Top 20 Ascending PC2': top_n_asc_pc2,
    'PC2_asc': top_n_asc_pc2_values
})

table

Unnamed: 0,Top 20 Descending PC1,PC1_desc,Top 20 Ascending PC1,PC1_asc,Top 20 Descending PC2,PC2_desc,Top 20 Ascending PC2,PC2_asc
0,Locum Pharmacist,1.96014,Operative,-1.204628,Assistant Caretaker,1.366243,Stone Sawyer,-1.090805
1,Physiotherapist,1.677859,Groom,-1.171171,Technical Manager,1.296078,Artexer,-1.046436
2,Orthopaedic,1.645753,Investigator,-1.143287,Revenue Officer,1.242021,Welder,-0.986137
3,Anaesthetist,1.573465,Training Officer,-1.099331,Orthopaedic,1.103055,Spring Maker,-0.985359
4,Hypnotherapist,1.499657,Personnel Officer,-1.068322,Technical Engineer,1.087001,Hod Carrier,-0.983164
5,Paediatrician,1.496329,Warden,-1.047771,Chartered Accoun...,1.078141,Furnace Man,-0.973146
6,Injection Moulder,1.464429,Clergyman,-1.035585,Administration C...,1.054461,Leaflet Distributor,-0.953622
7,Aromatherapist,1.460943,Recreational,-1.022603,Trainee Manager,1.02127,Home Help,-0.949016
8,Palaeobotanist,1.439885,Secretary,-0.994715,Civil Servant,1.012483,Warehousewoman,-0.932252
9,Machine Tool Fitter,1.417398,Cleaner,-0.99306,Anaesthetist,1.009604,Lampshade Maker,-0.920511


In [505]:
from gensim.models import KeyedVectors

# Specify the path to the pre-trained Word2Vec model
MODEL_PATH = '/home/nauel/bert_gender_bias/data/0_external/GoogleNews-vectors-negative300.bin'

# Load the model using KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True)

# Generate tokens with Word2Vec for the jobs dataset
jobs_df['word2vec_token'] = jobs_df['Occupations'].apply(lambda x: word2vec_model[x] if x in word2vec_model else None)


In [None]:
jobs_df['word_count'] = jobs_df['Occupations'].astype(str).apply(lambda x: len(x.split()))
jobs_df[jobs_df['word_count'] == 1].shape

(378, 16)

In [None]:
jobs_df['Occupations'].astype(str).apply(len)

0       10
1       18
2       14
3       16
4       14
        ..
1150    19
1151    12
1152    10
1153    11
1154     9
Name: Occupations, Length: 1155, dtype: int64