# Semantic analysis of sexism in French Rap using word embeddings

Natural Language Processing for the Social Sciences

Matteo Larrode, MSc Social Data Science

## Setup

In [28]:
# Enable autoreload so that edits to imported project .py modules are picked up
# live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Data is from the 2024 ACL-SRW paper "A Computational Analysis and Exploration of Linguistic Borrowings in French Rap Lyrics" by Lucas Zurbuchen and Rob Voigt.
# Paper: https://aclanthology.org/2024.acl-srw.27.pdf
# Source: https://github.com/ljz112/CLResearch/tree/main/dataEntries

# Download the data from the GitHub repository & create csv files:
# python data_setup.py
# Or to skip the download:
# python data_setup.py --skip-download

In [1]:
# Load the data
import pandas as pd

# Read the preprocessed song corpus from the pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# produced by a trusted pipeline.
corpus_path = 'data/processed_french_rap_songs.pkl'
songs_df = pd.read_pickle(corpus_path)

# Overview of the columns, dtypes and non-null counts.
songs_df.info()

# Peek at the first song: the container type, then its first 100 elements.
first_lyrics = songs_df['cleaned_lyrics'].iloc[0]
print(type(first_lyrics))
print(first_lyrics[:100])

<class 'pandas.core.frame.DataFrame'>
Index: 8665 entries, 0 to 11655
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              8665 non-null   int64 
 1   name            8665 non-null   object
 2   artists         8665 non-null   object
 3   releaseDate     8665 non-null   object
 4   popularity      8665 non-null   int64 
 5   lyrics          8665 non-null   object
 6   cleaned_lyrics  8665 non-null   object
 7   is_french       8665 non-null   bool  
 8   year            8665 non-null   int64 
 9   decade          8665 non-null   object
dtypes: bool(1), int64(3), object(6)
memory usage: 685.4+ KB
<class 'list'>
['moi', 'je', 'sais', 'ce', 'que', 'veux', 'tu', 'sais', 'est', 'quoi', 'mon', 'but', 'etre', 'un', 'putain', 'de', 'bourgeois', 'gagner', 'du', 'ble', 'rien', 'glander', 'je', 'ne', 'veux', 'surtout', 'pas', 'retourner', 'ou', 'etais', 'je', 'ne', 'veux', 'plus', 'cirer', 'les', 'pompes', 'u

## Word Embedding Training

In [2]:
# Code inspired by Schmahl, K. G., Viering, T., Makrodimitris, S., Jahfari, A. N., Tax, D., & Loog, M. (2020). Is Wikipedia succeeding in reducing gender bias? Assessing changes in gender bias in Wikipedia using word embeddings. NLPCSS. https://doi.org/10.18653/V1/2020.NLPCSS-1.11
# https://gitlab.com/kschmahl/wikipedia-gender-bias-over-time/
from gensim.models import Word2Vec

# Configure the model without a corpus so that vocabulary building and training
# can run as separate steps: context window of 10, words occurring fewer than
# 5 times are dropped, 4 worker threads. All other hyperparameters keep the
# library defaults.
# NOTE(review): with more than one worker, gensim training is presumably not
# bit-for-bit reproducible between runs — confirm whether that matters here.
w2v_model = Word2Vec(workers=4, min_count=5, window=10)

# Scan the tokenised lyrics to build the vocabulary; training happens later.
w2v_model.build_vocab(corpus_iterable=songs_df['cleaned_lyrics'])

In [3]:
# Report how many distinct words the model kept in its vocabulary.
vocab_size = len(w2v_model.wv.key_to_index)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 26144


In [5]:
# Train the embeddings on the tokenised lyrics, using the example count and
# epoch count stored on the model itself.
# NOTE(review): re-running this cell appears to continue training the same
# model rather than restart it — rebuild the model first if a clean run is needed.
w2v_model.train(
    corpus_iterable=songs_df['cleaned_lyrics'],
    epochs=w2v_model.epochs,
    total_examples=w2v_model.corpus_count,
)

(15497923, 20912735)

In [6]:
# Save the model
from pathlib import Path

# Create the output directory first: on a fresh checkout `models/` may not
# exist, and saving into a missing directory fails.
model_path = Path('models/word2vec_french_rap_1.model')
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the trained model so later sessions can load it instead of retraining.
w2v_model.save(str(model_path))

## Word Embedding Validation

In [16]:
# Sanity check of the embeddings: list the 10 entries most similar to "menage".
w2v_model.wv.most_similar(positive=["menage"], topn=10)

[('veneneux', 0.5968782901763916),
 ('adapter', 0.5812106728553772),
 ('julien', 0.5681294202804565),
 ('catamaran', 0.5611140131950378),
 ('licencier', 0.5585771203041077),
 ('nettoyage', 0.5571639537811279),
 ('brad', 0.5561196208000183),
 ('jaguar', 0.555712878704071),
 ('poup', 0.554553210735321),
 ('brinks', 0.5511651039123535)]

## Gender Stereotypes Analysis