In [15]:
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch

import sys

# Put the project checkout on the import path (presumably so that the
# `pipelines` package imported below resolves -- confirm against the repo layout).
# The location can be overridden through the BERT_GENDER_BIAS_ROOT environment
# variable; the original absolute path is kept as the default.
PROJECT_ROOT = os.environ.get('BERT_GENDER_BIAS_ROOT', '/home/nauel/bert_gender_bias')
if PROJECT_ROOT not in sys.path:  # re-running the cell must not append duplicates
    sys.path.append(PROJECT_ROOT)

import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")


from pipelines.utils.paths import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [16]:
# Read the pickled word table from the interim data directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by this project.
gendered_words_path = os.path.join(INTERIM_DATA_DIR, 'gender_binary_words_TOKEN.pkl')
gendered_words = pd.read_pickle(gendered_words_path)

In [17]:
# Discard every row that has at least one missing value. The frame is modified
# in place, so re-running the load cell is required to get the raw rows back.
gendered_words.dropna(axis=0, how="any", inplace=True)

# Report the remaining (rows, columns) and preview the cleaned frame.
print(gendered_words.shape)
gendered_words.head(1000)

(431, 5)


Unnamed: 0,word,gender_binary,bert_token,word2vec_token,word_count
0,abbot,0,"[-0.39571184, -0.093838364, 0.06868138, 0.1723...","[0.40039062, 0.41015625, 0.36523438, 0.2207031...",1
1,abbots,0,"[-0.18790531, -0.077521764, -0.3876859, 0.1816...","[0.30664062, 0.33398438, 0.1953125, 0.50390625...",1
2,adulterer,0,"[-0.45352724, -0.38397712, -0.25277817, -0.266...","[0.42773438, -0.28515625, -0.0625, 0.020263672...",1
3,adulterers,0,"[-0.17249976, -0.15101261, -0.3717648, -0.0279...","[-0.14160156, -0.22558594, -0.042236328, 0.208...",1
4,airman,0,"[-0.20044291, -0.020562049, -0.276353, -0.0372...","[0.49023438, -0.15917969, -0.022460938, -0.024...",1
...,...,...,...,...,...
510,woman,1,"[-0.34769166, -0.20875905, 0.12518345, 0.07620...","[0.24316406, -0.07714844, -0.103027344, -0.107...",1
511,womankind,1,"[-0.24836135, -0.283562, -0.19828826, 0.071805...","[-0.096191406, -0.055908203, 0.014404297, 0.22...",1
512,womanly,1,"[-0.3582225, -0.31381592, -0.16148914, 0.21661...","[0.01965332, -0.059570312, 0.119628906, 0.1474...",1
513,womanpower,1,"[-0.042981803, -0.32953677, 0.07738724, 0.1452...","[-0.0041503906, -0.06225586, 0.044921875, 0.13...",1


In [19]:
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

# Fit an L1-penalised logistic regression that predicts `gender_binary` from the
# per-word embedding stored in `token_col`. The L1 penalty drives most
# coefficients to exactly zero, which the prints below quantify.
token_col = 'bert_token'
X = gendered_words[token_col].values.tolist()
y = np.array(gendered_words.gender_binary.tolist())

# The liblinear solver shuffles the data internally, so without a fixed seed the
# fitted coefficients can differ from run to run. Pin it for reproducibility.
# NOTE(review): C looks like a value picked by a hyper-parameter search -- confirm its provenance.
logit = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    C=0.1873817422860384,
    random_state=42,
)
logit.fit(X, y)

# The score is computed on the same rows the model was fitted on (in-sample accuracy).
print("Accuracy: ", logit.score(X, y))
print("Number of non-zero coefficients: ", np.sum(logit.coef_ != 0))
print("Number of zero coefficients: ", np.sum(logit.coef_ == 0))
print("Number of coefficients: ", len(logit.coef_[0]))

Accuracy:  0.9095127610208816
Number of non-zero coefficients:  35
Number of zero coefficients:  733
Number of coefficients:  768


In [20]:
# Rank the logistic-regression coefficients from most positive to most negative.
# Uses `logit`, the model fitted earlier in this notebook; the previously
# referenced `best_logit` is not defined anywhere in the notebook as shown, so a
# fresh-kernel run would fail with NameError.
coef_series = pd.Series(logit.coef_[0])

# Sort the values in descending order; the index keeps the original feature position.
sorted_coefs = coef_series.sort_values(ascending=False)

# Show the 50 largest coefficients.
sorted_coefs.head(50)


52     1.946643
394    1.548990
586    1.423710
622    0.873934
258    0.674888
521    0.651731
138    0.595964
342    0.581594
653    0.492859
286    0.416709
141    0.312278
33     0.282058
308    0.145957
756    0.074233
493    0.056143
663    0.048093
464    0.044794
17     0.011241
484    0.000000
509    0.000000
510    0.000000
511    0.000000
512    0.000000
485    0.000000
513    0.000000
514    0.000000
492    0.000000
507    0.000000
516    0.000000
483    0.000000
517    0.000000
518    0.000000
482    0.000000
481    0.000000
519    0.000000
515    0.000000
486    0.000000
506    0.000000
505    0.000000
491    0.000000
490    0.000000
494    0.000000
495    0.000000
496    0.000000
497    0.000000
498    0.000000
499    0.000000
520    0.000000
501    0.000000
488    0.000000
dtype: float64

In [21]:
# Build a table of the features the L1 model actually uses (non-zero coefficient),
# ordered by coefficient magnitude.
# Uses `logit`, the model fitted earlier in this notebook; the previously
# referenced `best_logit` is not defined anywhere in the notebook as shown, so a
# fresh-kernel run would fail with NameError.
coefficients = logit.coef_[0]
coefficients_abs = np.abs(coefficients)

logit_coef = pd.DataFrame({
    'feature': range(len(coefficients)),          # position of the feature in the embedding
    'coefficient': coefficients,                  # signed weight
    'absolute_coefficient': coefficients_abs,     # magnitude, used for ranking
})
logit_coef = logit_coef.sort_values(by='absolute_coefficient', ascending=False)

# Keep only the features with a non-zero weight.
logit_coef = logit_coef[logit_coef.absolute_coefficient > 0]
logit_coef.head(200)


Unnamed: 0,feature,coefficient,absolute_coefficient
52,52,1.946643,1.946643
394,394,1.54899,1.54899
104,104,-1.544832,1.544832
74,74,-1.424545,1.424545
586,586,1.42371,1.42371
1,1,-1.095554,1.095554
374,374,-0.986125,0.986125
549,549,-0.941456,0.941456
622,622,0.873934,0.873934
765,765,-0.871715,0.871715


In [22]:
# Feature positions listed in logit_coef, in that table's row order.
selected_features = logit_coef.feature.values


def _pick_selected(vector):
    """Return the entries of `vector` at the positions in `selected_features`, as a list."""
    return [vector[position] for position in selected_features]


# Reduce each word's BERT vector to the selected positions only.
gendered_words['selected_tokens'] = gendered_words.bert_token.apply(_pick_selected)
gendered_words.head()

Unnamed: 0,word,gender_binary,bert_token,word2vec_token,word_count,selected_tokens
0,abbot,0,"[-0.39571184, -0.093838364, 0.06868138, 0.1723...","[0.40039062, 0.41015625, 0.36523438, 0.2207031...",1,"[-0.26285443, 0.10638625, -0.3551142, 0.350361..."
1,abbots,0,"[-0.18790531, -0.077521764, -0.3876859, 0.1816...","[0.30664062, 0.33398438, 0.1953125, 0.50390625...",1,"[-0.14765893, 0.17127527, -0.105251506, 0.6874..."
2,adulterer,0,"[-0.45352724, -0.38397712, -0.25277817, -0.266...","[0.42773438, -0.28515625, -0.0625, 0.020263672...",1,"[-0.06270412, 0.06840992, -0.27433926, 0.41119..."
3,adulterers,0,"[-0.17249976, -0.15101261, -0.3717648, -0.0279...","[-0.14160156, -0.22558594, -0.042236328, 0.208...",1,"[0.2360422, 0.0927681, -0.4616234, 0.20485066,..."
4,airman,0,"[-0.20044291, -0.020562049, -0.276353, -0.0372...","[0.49023438, -0.15917969, -0.022460938, -0.024...",1,"[0.06806019, 0.07705496, -0.3983753, 0.1153851..."


In [23]:
from sklearn.decomposition import PCA

# Project the reduced vectors in `selected_tokens` onto their first two
# principal components.
pca = PCA(n_components=2)

X = gendered_words['selected_tokens'].tolist()
principal_components = pca.fit_transform(X)
print("Explained variance ratio: ", pca.explained_variance_ratio_)

pc_names = ('pc1', 'pc2')

# Store the raw component scores first (one column per component) ...
for position, pc_name in enumerate(pc_names):
    gendered_words[pc_name] = principal_components[:, position]

# ... then add min-max scaled copies, so each `<pc>_norm` column spans [0, 1].
for pc_name in pc_names:
    scores = gendered_words[pc_name]
    low = scores.min()
    high = scores.max()
    gendered_words[f'{pc_name}_norm'] = (scores - low) / (high - low)

gendered_words.head()

Explained variance ratio:  [0.22990332 0.09048896]


Unnamed: 0,word,gender_binary,bert_token,word2vec_token,word_count,selected_tokens,pc1,pc2,pc1_norm,pc2_norm
0,abbot,0,"[-0.39571184, -0.093838364, 0.06868138, 0.1723...","[0.40039062, 0.41015625, 0.36523438, 0.2207031...",1,"[-0.26285443, 0.10638625, -0.3551142, 0.350361...",-0.564742,0.253292,0.091527,0.581638
1,abbots,0,"[-0.18790531, -0.077521764, -0.3876859, 0.1816...","[0.30664062, 0.33398438, 0.1953125, 0.50390625...",1,"[-0.14765893, 0.17127527, -0.105251506, 0.6874...",0.495572,0.128064,0.415104,0.528384
2,adulterer,0,"[-0.45352724, -0.38397712, -0.25277817, -0.266...","[0.42773438, -0.28515625, -0.0625, 0.020263672...",1,"[-0.06270412, 0.06840992, -0.27433926, 0.41119...",0.655863,0.354184,0.46402,0.624543
3,adulterers,0,"[-0.17249976, -0.15101261, -0.3717648, -0.0279...","[-0.14160156, -0.22558594, -0.042236328, 0.208...",1,"[0.2360422, 0.0927681, -0.4616234, 0.20485066,...",0.691047,0.238122,0.474757,0.575187
4,airman,0,"[-0.20044291, -0.020562049, -0.276353, -0.0372...","[0.49023438, -0.15917969, -0.022460938, -0.024...",1,"[0.06806019, 0.07705496, -0.3983753, 0.1153851...",0.519981,0.254454,0.422553,0.582132


In [24]:
import plotly.express as px

# Colour assigned to each gender_binary value.
color_map = {0: '#87CEEB', 1: '#FFC0CB'}

# Arguments for the scatter of the two principal components, one marker per word.
# Both the continuous scale and the discrete map are derived from `color_map`.
scatter_kwargs = dict(
    x='pc1',
    y='pc2',
    color='gender_binary',
    hover_name='word',
    title='Interactive Map of Job Titles',
    labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'},
    color_continuous_scale=list(color_map.values()),
    color_discrete_map=color_map,
    opacity=0.8,
)
fig = px.scatter(gendered_words, **scatter_kwargs)

# Titles, white background and closest-point hover. The title set here replaces
# the one passed to px.scatter above.
fig.update_layout(
    title='Interactive Job Titles Map',
    xaxis_title='Principal Component 1',
    yaxis_title='Principal Component 2',
    plot_bgcolor='white',
    paper_bgcolor='white',
    hovermode='closest',
)

# Same light-gray grid and zero line on both axes.
axis_style = dict(gridcolor='lightgray', zerolinecolor='lightgray')
fig.update_layout(xaxis=dict(axis_style), yaxis=dict(axis_style))

# Render the figure.
fig.show()


In [25]:
from sklearn.svm import SVC
import plotly.express as px

# Requires `gendered_words` to already hold the 'pc1', 'pc2', 'gender_binary'
# and 'word' columns.

# Prepare the data: the two principal components are the only features.
X = gendered_words[['pc1', 'pc2']]
y = gendered_words['gender_binary']

# Train a linear SVM on the 2-D projection.
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X, y)

# Predict the class label of every word (the same rows the model was fitted on).
gendered_words['svm_cluster'] = svm.predict(X)

# Colour per predicted class: blue for 0, pink for 1.
svm_color_map = {0: '#87CEEB', 1: '#FFC0CB'}

# 'svm_cluster' is a numeric column, so Plotly Express colours it with a
# continuous scale and does not apply `color_discrete_map`. Passing the two
# colours as the continuous scale (as the earlier scatter cell does) makes
# 0 render blue and 1 render pink.
fig = px.scatter(gendered_words,
                 x='pc1',
                 y='pc2',
                 color='svm_cluster',
                 hover_name='word',
                 title='Linear SVM Classification of Job Titles',
                 labels={'pc1': 'Principal Component 1', 'pc2': 'Principal Component 2'},
                 color_continuous_scale=list(svm_color_map.values()),
                 color_discrete_map=svm_color_map,
                 opacity=0.8)

# Update layout to improve the appearance.
fig.update_layout(title='Linear SVM Classification of Job Titles',
                  xaxis_title='Principal Component 1',
                  yaxis_title='Principal Component 2',
                  plot_bgcolor='white',
                  paper_bgcolor='white',
                  hovermode='closest')

fig.update_layout(
    xaxis=dict(
        gridcolor='lightgray',
        zerolinecolor='lightgray'
    ),
    yaxis=dict(
        gridcolor='lightgray',
        zerolinecolor='lightgray'
    )
)

# Show the plot
fig.show()

# In-sample accuracy: the model is scored on the rows it was trained on.
print("Accuracy: ", svm.score(X, y))

Accuracy:  0.7981438515081206


In [26]:
# Side-by-side table of the words with the most extreme scores on each of the
# two principal components.
n = 20
pcs = ['pc1', 'pc2']

# Select each set of extreme rows once and reuse it for both the words and the scores.
largest_pc1 = gendered_words.nlargest(n, pcs[0])
smallest_pc1 = gendered_words.nsmallest(n, pcs[0])
largest_pc2 = gendered_words.nlargest(n, pcs[1])
smallest_pc2 = gendered_words.nsmallest(n, pcs[1])

top_n_desc_pc1 = largest_pc1.word.values
top_n_asc_pc1 = smallest_pc1.word.values
top_n_desc_pc2 = largest_pc2.word.values
top_n_asc_pc2 = smallest_pc2.word.values

top_n_desc_pc1_values = largest_pc1[pcs[0]].values
top_n_asc_pc1_values = smallest_pc1[pcs[0]].values
top_n_desc_pc2_values = largest_pc2[pcs[1]].values
top_n_asc_pc2_values = smallest_pc2[pcs[1]].values

# Column labels follow `n` and `pcs`, so they stay correct when either is changed
# (for n = 20 and pcs = ['pc1', 'pc2'] they read 'Top 20 Descending PC1', 'PC1_desc', ...).
first_label = pcs[0].upper()
second_label = pcs[1].upper()

table = pd.DataFrame({
    f'Top {n} Descending {first_label}': top_n_desc_pc1,
    f'{first_label}_desc': top_n_desc_pc1_values,
    f'Top {n} Ascending {first_label}': top_n_asc_pc1,
    f'{first_label}_asc': top_n_asc_pc1_values,
    f'Top {n} Descending {second_label}': top_n_desc_pc2,
    f'{second_label}_desc': top_n_desc_pc2_values,
    f'Top {n} Ascending {second_label}': top_n_asc_pc2,
    f'{second_label}_asc': top_n_asc_pc2_values
})

table

Unnamed: 0,Top 20 Descending PC1,PC1_desc,Top 20 Ascending PC1,PC1_asc,Top 20 Descending PC2,PC2_desc,Top 20 Ascending PC2,PC2_asc
0,seamstresses,2.412194,manpower,-0.864664,businessman,1.237083,duchesses,-1.114449
1,paternity,2.083455,butches,-0.855746,sportsman,0.968373,aunts,-0.990034
2,seamstress,1.923979,manly,-0.84896,handyman,0.961776,grandmothers,-0.921414
3,stepdaughter,1.818139,brethren,-0.845347,councilman,0.961043,matriarchs,-0.907386
4,ma'am,1.800617,brotherhood,-0.843496,serviceman,0.88602,moms,-0.886278
5,postmistresses,1.758097,his,-0.817785,policeman,0.880419,sisterhoods,-0.873518
6,headmistresses,1.75592,lords,-0.811032,cameraman,0.875725,grandmas,-0.860966
7,countrywoman,1.728281,heir,-0.806829,mailman,0.872584,duchess,-0.854437
8,congresswomen,1.700795,wizard,-0.79676,fireman,0.872275,matriarchy,-0.821157
9,spokeswomen,1.689544,dukes,-0.796504,strongman,0.855247,godmothers,-0.793633
