In [2]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')

from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [3]:
gendered_words = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'gendered_words.csv'), sep="|")
jobs_df = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'jobs.csv'), sep="|")
print(jobs_df.head())
print(gendered_words.head())

  job_title_clean  n_tokens
0        pressman         1
1          dealer         1
2        animator         1
3          artist         1
4        designer         1
  gender      word  gender_binary
0   male     actor              0
1   male     uncle              0
2   male  stewards              0
3   male   steward              0
4   male  stepsons              0


# Tokenize Gendered Words

In [4]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




In [89]:
gendered_words

Unnamed: 0,gender,word,gender_binary,bert_token
0,male,actor,0,"[0.12366218, -0.082570516, -0.5964365, -0.1320..."
1,male,uncle,0,"[-0.2948148, -0.008699457, 0.24116987, 0.03704..."
2,male,stewards,0,"[0.102615416, -0.08765912, -0.4362369, 0.11619..."
3,male,steward,0,"[-0.16279216, 0.09167243, -0.04716784, 0.15830..."
4,male,stepsons,0,"[-0.298284, -0.037682097, 0.069945924, -0.0896..."
...,...,...,...,...
251,female,wife,1,"[-0.18965541, -0.15518133, 0.27795383, 0.02582..."
252,female,wives,1,"[-0.108947404, -0.061857525, 0.2810563, 0.1402..."
253,female,queen,1,"[-0.25403914, -0.09005203, 0.38148656, -0.0522..."
254,female,widow,1,"[-0.12043834, -0.021472009, 0.2805542, 0.08822..."


In [74]:
from sklearn import preprocessing
x = preprocessing.normalize(gendered_words.bert_token.tolist(), norm='l1')
x

array([[ 7.09431536e-04, -4.73694764e-04, -3.42166747e-03, ...,
        -1.14786315e-04,  2.39949852e-04, -1.87563726e-03],
       [-1.85148244e-03, -5.46339342e-05,  1.51458402e-03, ...,
         2.88693343e-04, -5.18714208e-04,  5.49899619e-04],
       [ 5.74427179e-04, -4.90703865e-04, -2.44199492e-03, ...,
        -9.90043819e-04, -8.27130729e-05,  3.15594287e-04],
       ...,
       [-1.57124018e-03, -5.56974685e-04,  2.35950658e-03, ...,
        -1.02800749e-03, -1.18215474e-04, -2.54283076e-04],
       [-6.69012645e-04, -1.19273030e-04,  1.55842662e-03, ...,
        -1.31851596e-04,  1.36140985e-04, -1.81286383e-04],
       [ 4.56438413e-04,  1.83403128e-04, -1.37154634e-03, ...,
        -1.70031898e-03,  5.48981786e-04, -1.83965095e-03]])

# Linear SVM 

In [55]:
import sklearn

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(preprocessing.normalize(gendered_words.bert_token.tolist(), norm='l2'))
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)




Liblinear failed to converge, increase the number of iterations.



In [76]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    coef_df_sorted = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return coef_df_sorted

coef_df_sorted = get_top_features(clf)
coef_df_sorted.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
17,17,0.060232,0.060232
607,607,-0.059782,0.059782
194,194,-0.059779,0.059779
748,748,-0.057675,0.057675
508,508,-0.055446,0.055446
417,417,-0.051802,0.051802
481,481,0.05031,0.05031
223,223,-0.048984,0.048984
243,243,-0.048155,0.048155
606,606,0.047758,0.047758


In [77]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    coef_df_sorted = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return coef_df_sorted

coef_df_sorted = get_top_features(clf)
coef_df_sorted.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
17,17,0.060232,0.060232
607,607,-0.059782,0.059782
194,194,-0.059779,0.059779
748,748,-0.057675,0.057675
508,508,-0.055446,0.055446
417,417,-0.051802,0.051802
481,481,0.05031,0.05031
223,223,-0.048984,0.048984
243,243,-0.048155,0.048155
606,606,0.047758,0.047758


In [83]:
n_features = 20

selected_features = coef_df_sorted.feature[:n_features].values
selected_features

array([ 17, 607, 194, 748, 508, 417, 481, 223, 243, 606, 753, 187, 734,
        49,  71, 521, 367, 155, 267, 423])

# Tokenize Jobs

In [84]:
text = jobs_df.job_title_clean.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

In [85]:
n_feature_embeddings = word_embeddings[:, selected_features]

jobs_df['bert_token'] = list(n_feature_embeddings)

# Perform PCA

In [86]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1'] = principal_components[:, 0]
jobs_df['pc2'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1', ascending=True).head(20)
jobs_df[jobs_df['job_title_clean']=='nurse']

Unnamed: 0,job_title_clean,n_tokens,bert_token,pc1,pc2
1126,nurse,1,"[0.20660377, -0.041388858, 0.05775149, -0.1323...",-0.217597,-0.167943


In [87]:
import plotly.express as px

fig = px.scatter(jobs_df, x='pc1', y='pc2', hover_name='job_title_clean',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()

In [88]:
top_20_desc_pc1 = jobs_df.nlargest(20, 'pc1').job_title_clean.values
top_20_asc_pc1 = jobs_df.nsmallest(20, 'pc1').job_title_clean.values
top_20_desc_pc2 = jobs_df.nlargest(20, 'pc2').job_title_clean.values
top_20_asc_pc2 = jobs_df.nsmallest(20, 'pc2').job_title_clean.values

table = pd.DataFrame({
    'Top 20 Descending PC1': top_20_desc_pc1,
    'Top 20 Ascending PC1': top_20_asc_pc1,
    'Top 20 Descending PC2': top_20_desc_pc2,
    'Top 20 Ascending PC2': top_20_asc_pc2,
})

table

Unnamed: 0,Top 20 Descending PC1,Top 20 Ascending PC1,Top 20 Descending PC2,Top 20 Ascending PC2
0,informatica,plumber,decontaminator,farmhand
1,immunohematologist,quarryman,tagman,biologist
2,precipitator,hoseman,sodder,medical
3,bioinformatician,scrapper,moshgiach,researcher
4,bioinformaticist,babbitter,pourer,health
5,immunology,steelworker,chha,scientist
6,anaesthesiologist,bricklayer,carburizer,accountant
7,sourcer,seamer,crtt,marketing
8,host,tankerman,keypuncher,florist
9,neurosurgeon,laceworker,lusterer,reporter
