# Data collection and preprocessing

## Setup

In [3]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [4]:
import pickle
import sys
from pathlib import Path

# Add the project root to the Python path so the local `utils` package can be
# imported. The root is taken to be the parent of the current working
# directory (assumes the kernel is started in the notebook's own folder —
# TODO confirm if the notebook is ever run from elsewhere).
project_root = Path().absolute().parent
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from utils.preprocessing_helpers import download_lyrics, preprocess_corpus
from utils.model_download_helpers import download_word2vec_model

## Downloading and preprocessing the rap lyrics

In [None]:
# Data is from the 2024 ACL-SRW paper "A Computational Analysis and Exploration of Linguistic Borrowings in French Rap Lyrics" by Lucas Zurbuchen and Rob Voigt.
# Paper: https://aclanthology.org/2024.acl-srw.27.pdf
# Source: https://github.com/ljz112/CLResearch/tree/main/dataEntries
# Download the data (project helper; presumably also caches the raw lyrics on disk — verify in utils.preprocessing_helpers)
lyrics_array = download_lyrics()

# Preprocess the data and save the result as a pickle.
# Non-lemmatized version
# NOTE(review): save_path here is "data/processed_lyrics.pkl", whereas the loading cell
# below reads "../data/processed_lyrics.pkl". Confirm that preprocess_corpus resolves
# save_path relative to the project root; otherwise the two cells point at different files.
preprocess_corpus(texts=lyrics_array,
                  lemmatize=False,
                  save_path="data/processed_lyrics.pkl")

# # Lemmatized version
# preprocess_corpus(texts=lyrics_array,
#                   lemmatize=True,
#                   save_path="data/processed_lyrics_lemmatized.pkl")

In [6]:
# Read the pickled artefacts back from ../data.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.

# Raw lyrics data
lyrics_df = pickle.loads(Path("../data/french_rap_lyrics_raw.pkl").read_bytes())

# Preprocessed (non-lemmatized) corpus
corpus = pickle.loads(Path("../data/processed_lyrics.pkl").read_bytes())

# # Lemmatized corpus
# lemmatized_corpus = pickle.loads(Path("data/processed_lyrics_lemmatized.pkl").read_bytes())

In [7]:
# Sanity check: report how many entries the corpus holds, then show the first one
# (per the recorded output, a list of lowercase word tokens for one song).
print(f"Number of songs: {len(corpus)}")
print(corpus[0])

Number of songs: 8208
['moi', 'je', 'sais', 'ce', 'que', 'veux', 'tu', 'sais', 'est', 'quoi', 'mon', 'but', 'être', 'un', 'putain', 'de', 'bourgeois', 'gagner', 'du', 'blé', 'rien', 'glander', 'je', 'ne', 'veux', 'surtout', 'pas', 'retourner', 'où', 'étais', 'je', 'ne', 'veux', 'plus', 'cirer', 'les', 'pompes', 'un', 'enculé', 'pour', 'avoir', 'de', 'quoi', 'vivre', 'ai', 'sourire', 'tant', 'que', 'manque', 'pas', 'billets', 'banque', 'de', 'plaisirs', 'charnels', 'blindé', 'comme', 'un', 'tank', 'pris', 'sous', 'le', 'charme', 'gonflé', 'comme', 'une', 'grosse', 'paire', 'de', 'mamelles', 'moi', 'fric', 'ça', 'fait', 'bander', 'comme', 'le', 'boule', 'julia', 'channel', 'demandez', 'mes', 'partenaires', 'qu', 'est', 'ce', 'qui', 'fait', 'tourner', 'la', 'planète', 'le', 'sexe', 'les', 'biftons', 'le', 'pouvoir', 'et', 'les', 'business', 'pas', 'nets', 'vise', 'aux', 'manettes', 'rien', 'que', 'des', 'proxénètes', 'avise', 'les', 'maquisards', 'au', 'qg', 'eux', 'ils', 'connaissent', '

## Downloading off-the-shelf French word embeddings

### Word2Vec (2015)

Source: https://fauconnier.github.io/#data

1. frWac2Vec
    - Trained on the FrWac corpus (1.6 billion words)
    - Available in CBOW and skip-gram variants, with dimensions (200, 500, 700, 1000) and cutoffs (0, 10, 50, 100, 200)

2. frWiki2Vec
    - Trained on the FrWiki dump (600 million words)
    - Available in CBOW and skip-gram variants, with dimensions (500, 700, 1000) and cutoffs (10, 100, 200)

### Word2Vec (2022)

The most recent French Word2Vec embeddings were created by Abdine et al. (2022). 

1. Two models were trained on a 33GB shuffled portion of the French corpus used to train FlauBERT

2. The other two models were trained on the 33GB deduplicated French corpus collected from the web

However, the embeddings are only available upon request at http://nlp.polytechnique.fr/#french. I requested them on 03/04 but have not received a response. The authors use the models from Fauconnier (2015) as the baseline in their experiments.

### fastText

Source: https://fasttext.cc/docs/en/crawl-vectors.html

"We distribute pre-trained word vectors for 157 languages, trained on Common Crawl and Wikipedia using fastText. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives. We also distribute three new word analogy datasets, for French, Hindi and Polish."

In [None]:
# Download the Fauconnier (2015) pre-trained Word2Vec models.
# Each key names a model (corpus, non-lemmatized, dimension, architecture, cutoff)
# and maps to the URL of its binary file. Commented-out entries are skipped.
Fauconnier_url_dict = {
    "frWac_non_lem_200_cbow_cut100_url": "https://embeddings.net/embeddings/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin",
    "frWac_non_lem_200_skipgram_cut100_url": "https://embeddings.net/embeddings/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin",
    # "frWac_non_lem_500_skipgram_cut200_url": "https://embeddings.net/embeddings/frWac_non_lem_no_postag_no_phrase_500_skip_cut200.bin",
    "frWiki_non_lem_1000_cbow_cut100_url": "https://embeddings.net/embeddings/frWiki_no_lem_no_postag_no_phrase_1000_cbow_cut100.bin",
    "frWiki_non_lem_1000_skipgram_cut100_url": "https://embeddings.net/embeddings/frWiki_no_lem_no_postag_no_phrase_1000_skip_cut100.bin",
}

# Fetch the models one at a time, announcing each so progress is visible.
# (Where the files are stored is decided by download_word2vec_model — see utils.model_download_helpers.)
for model_name, model_url in Fauconnier_url_dict.items():
    print(f"Downloading {model_name}...")
    download_word2vec_model(model_url)