In [26]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')

from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [27]:
gendered_words = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'gendered_words.csv'), sep="|")
jobs_df = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'jobs.csv'), sep="|")
print(jobs_df.head())
print(gendered_words.head())

  job_title_clean  n_tokens
0        pressman         1
1          dealer         1
2        animator         1
3          artist         1
4        designer         1
  gender      word  gender_binary
0   male     actor              0
1   male     uncle              0
2   male  stewards              0
3   male   steward              0
4   male  stepsons              0


# Tokenize Gendered Words

In [28]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




In [29]:
gendered_words.head()

Unnamed: 0,gender,word,gender_binary,bert_token
0,male,actor,0,"[0.12366218, -0.082570516, -0.5964365, -0.1320..."
1,male,uncle,0,"[-0.2948148, -0.008699457, 0.24116987, 0.03704..."
2,male,stewards,0,"[0.102615416, -0.08765912, -0.4362369, 0.11619..."
3,male,steward,0,"[-0.16279216, 0.09167243, -0.04716784, 0.15830..."
4,male,stepsons,0,"[-0.298284, -0.037682097, 0.069945924, -0.0896..."


# Linear SVM 

In [30]:
import sklearn

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(gendered_words.bert_token.tolist())
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)




Liblinear failed to converge, increase the number of iterations.



In [31]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [32]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    coef_df_sorted = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return coef_df_sorted

coef_df_sorted = get_top_features(clf)
coef_df_sorted.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
607,607,-0.064116,0.064116
17,17,0.063558,0.063558
748,748,-0.06018,0.06018
508,508,-0.058335,0.058335
194,194,-0.057867,0.057867
417,417,-0.05389,0.05389
606,606,0.052119,0.052119
481,481,0.051818,0.051818
223,223,-0.051593,0.051593
243,243,-0.050561,0.050561


In [43]:
n_features = 5

selected_features = coef_df_sorted.feature[:n_features].values
selected_features

array([607,  17, 748, 508, 194])

# Tokenize Jobs

In [44]:
text = jobs_df.job_title_clean.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

In [45]:
n_feature_embeddings = word_embeddings[:, selected_features]

jobs_df['bert_token'] = list(n_feature_embeddings)

# Perform PCA

In [46]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
jobs_df['pc1'] = principal_components[:, 0]
jobs_df['pc2'] = principal_components[:, 1]

jobs_df.sort_values(by='pc1', ascending=True).head(20)
jobs_df[jobs_df['job_title_clean']=='nurse']

Unnamed: 0,job_title_clean,n_tokens,bert_token,pc1,pc2
1126,nurse,1,"[-0.041388858, 0.20660377, -0.13233012, -0.133...",0.010076,-0.197336


In [47]:
import plotly.express as px

fig = px.scatter(jobs_df, x='pc1', y='pc2', hover_name='job_title_clean',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()