# French rap word embeddings training

## Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
import sys
from pathlib import Path

# Make the project root importable so that project modules (e.g. the `utils`
# package imported below) can be resolved.
# Path() is the current working directory, so this takes the parent of the
# directory the notebook is executed from — presumably the notebook lives in a
# sub-folder one level below the project root; adjust if run from elsewhere.
project_root = Path().absolute().parent
sys.path.append(str(project_root))

from utils.training_helpers import train_word2vec

Code inspired by Schmahl, K. G., Viering, T., Makrodimitris, S., Jahfari, A. N., Tax, D., & Loog, M. (2020). Is Wikipedia succeeding in reducing gender bias? Assessing changes in gender bias in Wikipedia using word embeddings. NLPCSS. https://doi.org/10.18653/V1/2020.NLPCSS-1.11

https://gitlab.com/kschmahl/wikipedia-gender-bias-over-time/

In [None]:
# Load the preprocessed lyrics corpus from disk.
# NOTE: unpickling executes arbitrary code, so only load pickle files that
# were produced by this project or come from another trusted source.
corpus_path = "../data/processed_lyrics.pkl"
with open(corpus_path, "rb") as f:
    corpus = pickle.load(f)

In [None]:
# Directory handed to train_word2vec as the output location for every run
model_dir = '../models'

# Training parameters that stay the same for every model
fixed_params = {'epochs': 5, 'workers': 4, 'seed': 35}

# Window size of 5 consistently performed best in validation metrics, so only
# the algorithm (sg=1 skip-gram, sg=0 CBOW) and the embedding dimensionality
# vary between configurations.
model_configs = [
    {'vector_size': dim, 'window': 5, 'min_count': 5, 'sg': sg_flag}
    for sg_flag in (1, 0)
    for dim in (100, 200)
]

# Train one model per configuration
for i, config in enumerate(model_configs, start=1):
    # Pull the per-model hyperparameters out of the configuration entry
    vector_size, min_count, window, sg = (
        config[key] for key in ('vector_size', 'min_count', 'window', 'sg')
    )
    # 'lemmatized' is optional in a configuration and defaults to False
    lemmatized = config.get('lemmatized', False)

    # Human-readable algorithm name, used only for the progress message
    if sg == 1:
        algo = 'skipgram'
    else:
        algo = 'cbow'

    print(f"Training model {i}/{len(model_configs)} | {algo}, dim={vector_size}, min_count={min_count}, window = {window}, lemmatized={lemmatized}")

    # Combine the per-model parameters with the shared ones; on a key clash
    # the shared (fixed) value wins.
    train_params = dict(
        vector_size=vector_size,
        min_count=min_count,
        window=window,
        sg=sg,
        lemmatized=lemmatized,
    )
    train_params.update(fixed_params)

    # Train the model (it gets saved automatically to a subdirectory)
    model = train_word2vec(corpus=corpus, output_dir=model_dir, **train_params)