In [2]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')

from pipelines.utils.paths import EXTERNAL_DATA_DIR

In [3]:
#read the txt files gendered_words and job_titles as dataframes
gendered_words = pd.read_json(os.path.join(EXTERNAL_DATA_DIR, 'gendered_words.json'))
job_titles = pd.read_csv(os.path.join(EXTERNAL_DATA_DIR, 'job_titles.txt'), sep='\t', header=None)

job_titles.columns = ['job_title']

In [4]:
gendered_words = gendered_words[['word', 'wordnet_senseno', 'gender']]
gendered_words = gendered_words[gendered_words.gender.isin(['m', 'f'])].reset_index(drop=True)

mapping = {'m':0, 'f':1}
gendered_words['gender_binary'] = gendered_words.gender.map(mapping)

gendered_words['word'] = gendered_words['word'].str.replace('_', ' ', regex=False)
gendered_words['n_tokens'] = gendered_words['word'].apply(lambda x: len(x.split()))
gendered_words = gendered_words[gendered_words.n_tokens==1]
gendered_words.head()

Unnamed: 0,word,wordnet_senseno,gender,gender_binary,n_tokens
0,abbess,abbess.n.01,f,1,1
1,abbot,abbot.n.01,m,0,1
3,actress,actress.n.01,f,1,1
4,adonis,adonis.n.01,m,0,1
5,adulteress,adulteress.n.01,f,1,1


In [5]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




In [6]:
gendered_words.head()

Unnamed: 0,word,wordnet_senseno,gender,gender_binary,n_tokens,bert_token
0,abbess,abbess.n.01,f,1,1,"[-0.38354972, -0.17998055, 0.27794936, 0.08237..."
1,abbot,abbot.n.01,m,0,1,"[-0.38868162, -0.09723526, 0.022723716, 0.1652..."
3,actress,actress.n.01,f,1,1,"[0.044713546, -0.2707646, -0.26443234, -0.0969..."
4,adonis,adonis.n.01,m,0,1,"[-0.43801284, -0.117105216, -0.043750063, -0.0..."
5,adulteress,adulteress.n.01,f,1,1,"[-0.029933628, -0.020349426, -0.4971868, -0.11..."


In [7]:
import sklearn

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(gendered_words.bert_token.tolist())
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)



In [8]:
svc = clf.named_steps['linearsvc']

# Get the coefficients and intercept
coefficients = svc.coef_
intercept = svc.intercept_

# Flatten the coefficients (for binary classification)
flat_coefficients = coefficients.flatten()

# Create a DataFrame for coefficients
coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
coef_df.index.name = 'feature'
coef_df.reset_index(inplace=True)
coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
coef_df_sorted = coef_df.sort_values(by='absolute_coefficient', ascending=False)
coef_df_sorted.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
106,106,-0.145026,0.145026
314,314,-0.137707,0.137707
603,603,0.127591,0.127591
756,756,0.126499,0.126499
572,572,0.126127,0.126127
550,550,-0.125207,0.125207
417,417,-0.118525,0.118525
208,208,-0.114189,0.114189
170,170,-0.113101,0.113101
37,37,0.11139,0.11139


In [9]:
n_features = 70

selected_features = coef_df_sorted.feature[:n_features].values
selected_features

array([106, 314, 603, 756, 572, 550, 417, 208, 170,  37, 651, 705, 597,
       396, 470, 546, 704, 143, 650, 691,  73, 371, 111, 410, 359, 669,
       120, 161,  48,  10, 264, 451,  94,  13, 172, 596, 104, 743,  39,
       698, 690, 477, 295, 365, 490, 198,  65, 442,  67, 558, 253,  76,
       702, 762, 353, 335, 564, 585, 188, 430, 729, 715, 499,  64, 129,
       737, 337, 749, 103, 554])

In [10]:
import re

job_titles['job_title_clean'] = job_titles['job_title'].apply(lambda x: re.sub(r'\b\d+\w*\b', '', x))
job_titles

job_df = pd.DataFrame()
job_df['job_title_clean'] = job_titles['job_title_clean'].str.split('/').explode().reset_index(drop=True)

job_df['n_tokens'] = job_df.job_title_clean.apply(lambda x: len(x.split()))
job_df = job_df[job_df.n_tokens==1].reset_index(drop=True)
job_df.drop_duplicates(subset="job_title_clean", inplace=True)

job_df

Unnamed: 0,job_title_clean,n_tokens
0,pressman,1
1,dealer,1
3,animator,1
4,artist,1
5,designer,1
...,...,...
2254,youtuber,1
2255,zanjero,1
2256,zigzagger,1
2257,zoogler,1


: 

In [11]:
text = job_titles.job_title_clean.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

In [None]:
n_feature_embeddings = word_embeddings[:, selected_features]

job_titles['bert_token'] = list(n_feature_embeddings)

In [None]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
job_titles['pc1'] = principal_components[:, 0]
job_titles['pc2'] = principal_components[:, 1]

job_titles

Unnamed: 0,job_title,job_title_clean,n_tokens,bert_token,pc1,pc2
0,1st pressman,pressman,1,"[-0.02525172, -0.29282388, 0.17871192, -0.0461...",0.114934,0.032091
1,21 dealer,dealer,1,"[-0.06911022, -0.08804909, 0.35526076, -0.1440...",-0.759271,-0.337779
2,2nd pressman,pressman,1,"[-0.02525172, -0.29282388, 0.17871192, -0.0461...",0.114935,0.032092
3,3d animator,animator,1,"[0.02146821, 0.07563615, 0.5659466, -0.1606371...",-0.328696,-0.251081
4,3d artist,artist,1,"[-0.31041816, 0.14098145, 0.5045248, -0.062597...",-0.116660,0.088268
...,...,...,...,...,...,...
1856,youtuber,youtuber,1,"[-0.13817228, -0.29632846, 0.017007606, -0.079...",-0.182109,0.034823
1857,zanjero,zanjero,1,"[-0.20524062, 0.11024918, -0.06103464, -0.0392...",-0.417132,0.432328
1858,zigzagger,zigzagger,1,"[-0.058984745, -0.1581163, 0.17500138, 0.00277...",-0.503511,0.263077
1859,zoogler,zoogler,1,"[0.13334583, -0.032586005, 0.21488433, -0.2609...",-0.674707,0.245722


In [None]:
import plotly.express as px

fig = px.scatter(job_titles, x='pc1', y='pc2', hover_name='job_title',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()