In [214]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys
sys.path.append('/home/nauel/bert_gender_bias')

from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [215]:
gendered_words = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'gendered_words.csv'), sep="|")
jobs_df = pd.read_csv(os.path.join(INTERIM_DATA_DIR, 'jobs.csv'), sep="|")
print(jobs_df.head())
print(gendered_words.head())

  job_title_clean  n_tokens
0        pressman         1
1          dealer         1
2        animator         1
3          artist         1
4        designer         1
  gender     word  gender_binary
0   male    actor              0
1   male      guy              0
2   male     guys              0
3   male       he              0
4   male  herself              0


# Tokenize Gendered Words

In [216]:
# apply bert tokenizer to the words and create another col for it
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

text = gendered_words.word.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

gendered_words['bert_token'] = list(word_embeddings)




In [217]:
gendered_words.head()

Unnamed: 0,gender,word,gender_binary,bert_token
0,male,actor,0,"[0.12366218, -0.082570516, -0.5964365, -0.1320..."
1,male,guy,0,"[-0.019587735, -0.1438058, 0.10323956, 0.09930..."
2,male,guys,0,"[0.17785038, -0.12145782, 0.1496808, 0.2983950..."
3,male,he,0,"[0.05831127, -0.09141863, 0.21463205, 0.112496..."
4,male,herself,0,"[0.26475444, -0.09025365, 0.15711533, 0.060596..."


# Linear SVM 

In [218]:
import sklearn

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array(gendered_words.bert_token.tolist())
y = np.array(gendered_words.gender_binary.tolist())
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)




Liblinear failed to converge, increase the number of iterations.



In [219]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

In [210]:
def get_top_features(clf):
    svc = clf.named_steps['linearsvc']

    coefficients = svc.coef_
    intercept = svc.intercept_

    flat_coefficients = coefficients.flatten()

    coef_df = pd.DataFrame(flat_coefficients, columns=['coefficient'])
    coef_df.index.name = 'feature'
    coef_df.reset_index(inplace=True)
    coef_df['absolute_coefficient'] = np.abs(coef_df['coefficient'])
    coef_df_sorted = coef_df.sort_values(by='absolute_coefficient', ascending=False)
    
    return coef_df_sorted

coef_df_sorted = get_top_features(clf)
coef_df_sorted.head(20)

Unnamed: 0,feature,coefficient,absolute_coefficient
571,571,0.143205,0.143205
708,708,-0.121285,0.121285
154,154,-0.116984,0.116984
194,194,-0.112798,0.112798
645,645,0.111434,0.111434
600,600,0.110956,0.110956
481,481,0.10893,0.10893
367,367,-0.107679,0.107679
5,5,-0.100707,0.100707
695,695,0.096902,0.096902


In [205]:
n_features = 50

selected_features = coef_df_sorted.feature[:n_features].values
selected_features

array([571, 708, 154, 194, 645, 600, 481, 367,   5, 695, 687,   9,  96,
       426, 741, 501, 244, 538, 525, 397, 126, 531, 606, 722, 248, 514,
       103,  39,  11, 661, 553, 106, 292, 340, 403, 478, 401, 614, 755,
       393, 537,  22,  61, 539, 492, 177, 391,  25, 658, 376])

# Tokenize Jobs

In [206]:
text = job_df.job_title_clean.values.tolist()
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    output = model(**encoded_input)
    
embeddings = output.last_hidden_state
word_embeddings = embeddings.mean(dim=1).numpy()

In [207]:
n_feature_embeddings = word_embeddings[:, selected_features]

job_df['bert_token'] = list(n_feature_embeddings)

# Perform PCA

In [208]:
from sklearn.decomposition import PCA

# Perform PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
principal_components = pca.fit_transform(n_feature_embeddings)

# Add principal components to the DataFrame
job_df['pc1'] = principal_components[:, 0]
job_df['pc2'] = principal_components[:, 1]

job_df.sort_values(by='pc1', ascending=True).head(20)
job_df[job_df['job_title_clean']=='nurse']

Unnamed: 0,job_title_clean,n_tokens,bert_token,pc1,pc2
1190,nurse,1,"[0.19648932, 0.23947293, -0.24427141, 0.057751...",-0.098263,-0.230263


In [209]:
import plotly.express as px

fig = px.scatter(job_df, x='pc1', y='pc2', hover_name='job_title_clean',
                 title='Interactive Map of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'})

# Update layout to improve the appearance
fig.update_layout(title='Interactive Job Titles Map',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  hovermode='closest')

# Show the plot
fig.show()