In [13]:
import pandas as pd
import spacy

from tqdm.notebook import tqdm
tqdm.pandas()

# Set to True to run the optional token-frequency diagnostic cells (those guarded by `if compute_freq:`)
compute_freq = False


Ce code réalise une lemmatization à partir d'un modèle de machine learning issu de spacy.
installation du modèle : 
> python -m spacy download fr_core_news_sm

Ensuite on supprime certains mots fréquents, remis en forme et trouvés grâce à :
> from nltk.probability import FreqDist

Les nombres sont aussi filtrés.

J'ai retenu cette approche car un cleaning manuel était trop long (code en n², où n est le nombre total de mots du corpus). Le défaut de cette approche est qu'elle est potentiellement trop bourrine et que certains mots spécifiques qui nous intéressent ont disparu.

Il faut valider par une approche lexico et LDA les résultats => cf le code analyse_lexico.qmd

Parmi les effets négatifs : spaCy lemmatise « trans » en « tran » => meh. On a donc fusionné les tokens 'tran' et 'trans' dans le code R, mais on a aussi corrigé ce problème en créant une exception dans la lemmatisation.

# Opening data

In [14]:
# Location of the merged corpus (intermediate data directory)
fichier = "../data/intermediate/base_merged.csv"

# The file is read as semicolon-separated with double-quoted fields
df = pd.read_csv(
    fichier,
    sep=';',
    quotechar='"',
)

# Show a few random rows as a sanity check
df.sample(5)

Unnamed: 0,ID ASSO,ID ARTICLE,contenu,Type de document,Auteur,Date,Titre,URL
138,21,210008,Traduction principalement DeepL d’un article p...,analyse,résistance lesbienne,2021-10-26,« Certaines femmes transgenres nous poussent à...,
47,17,170019,"7 janvier 2015, 11h30, la rédaction de Charli...",analyse,,,,
26,12,120027,Quelle est la différence entre homosexualité e...,opinion,Marguerite Stern,09/10/2022,Quelle est la différence entre homosexualité e...,https://www.femelliste.com/articles-femellisme...
184,3,30004,Le pronom « iel » ou le sexe des anges\nLa thé...,Article d'opinion,,,,https://www.generation-zemmour.fr/le-pronom-ie...
60,17,170032,La sociologie étudie et explique la dimension...,analyse,,,,


# Cleaning and tokenisation

## Lemmatization with Spacy

We also do some first filtering with Spacy

In [15]:
# Load the French language model
nlp = spacy.load("fr_core_news_sm")

# Words to exclude, one entry per line of the file.
# Stored as a set: the notebook only ever tests membership (`... not in words_to_filter`)
# once per token, so a set gives constant-time lookups instead of scanning a list each time.
with open("../data/intermediate/words_to_filter.txt", "r", encoding="utf-8") as file:
    words_to_filter = set(file.read().splitlines())

# Surface forms of the tokens whose spaCy lemma is "tran";
# filled as a side effect of tokenize_filter_and_lemmatize and inspected afterwards.
word_meaning = []

# Function to tokenize, filter, and lemmatize text, excluding numbers
def tokenize_filter_and_lemmatize(text):
    """Lemmatize `text` with spaCy, dropping filtered words and digit tokens.

    The token "trans" is special-cased because the model lemmatizes it to
    "tran": any casing of it ("trans", "Trans", "TRANS") is emitted as the
    canonical "trans" and bypasses the word filter.

    Parameters
    ----------
    text : str
        Raw document text, passed as-is to the module-level `nlp` pipeline.

    Returns
    -------
    str
        The kept lemmas joined by single spaces.

    Side effects
    ------------
    Appends to the module-level `word_meaning` list the surface form of every
    token whose lemma is "tran", so those forms can be inspected afterwards.
    """
    doc = nlp(text)
    lemmatized_tokens = []

    for token in doc:
        # Keep track of what spaCy turned into "tran" (diagnostic only)
        if token.lemma_ == "tran":
            word_meaning.append(token.text)

        # Case-insensitive match: comparing token.text to "trans" missed
        # "Trans" and "TRANS", which were still lemmatized to "tran".
        if token.lower_ == "trans":
            lemmatized_tokens.append("trans")
        elif token.lemma_ not in words_to_filter and not token.is_digit:
            lemmatized_tokens.append(token.lemma_)

    # Join the kept lemmas back into a single string
    return " ".join(lemmatized_tokens)

# Apply the function to the 'contenu' column (progress bar provided by tqdm's progress_apply)
df['lemmatized_contenu'] = df['contenu'].progress_apply(tokenize_filter_and_lemmatize)

# Display a random sample of 5 rows to inspect the new 'lemmatized_contenu' column
df.sample(5)

  0%|          | 0/488 [00:00<?, ?it/s]

Unnamed: 0,ID ASSO,ID ARTICLE,contenu,Type de document,Auteur,Date,Titre,URL,lemmatized_contenu
439,13,130069,"Chat, cheval, dinosaures: des élèves s’identif...",blog,TRADFEM,21/06/2023,"Chat, cheval, dinosaures: des élèves s’identif...",https://tradfem.wordpress.com/2023/06/21/chat-...,chat cheval dinosaure élève s’ identifier anim...
230,23,230005,\r\nENTRETIEN. À l’heure où l’Espagne vient d’...,Entretien,,,,https://www.observatoirepetitesirene.org/post/...,\r\n entretien l’ heure l’ espagne d’ adopter ...
318,22,220003,« La médecine face à la transidentité de genre...,Communiqué,,2022-02-28 00:00:00,,,médecine face transidentité genre enfant adole...
396,13,130026,Le viol punitif : Violence envers les lesbienn...,blog,TRADFEM,12/09/2023,Le viol punitif : Violence envers les lesbiennes,https://tradfem.wordpress.com/2023/09/12/le-vi...,viol punitif violence envers lesbiennestradfem...
144,21,210014,Traduction principalement DeepL de cet article...,analyse,résistance lesbienne,2021-12-20,Changement de définition,,traduction principalement deepl article écrire...


In [16]:
import numpy as np
# Distinct surface forms whose spaCy lemma was "tran" (collected in word_meaning during lemmatization)
np.unique(word_meaning)

array(['TRANS', 'Trans', 'tran', 'trans'], dtype='<U5')

## Checking Words frequency according to Spacy tokenization

In [17]:
from collections import Counter
import spacy

if compute_freq:
    # Optional diagnostic: frequency of the tokens spaCy finds in the lemmatized text.
    # Relies on `nlp` and `words_to_filter` already defined by the lemmatization cell;
    # the model is not loaded a second time.

    # Frequency of each token across the whole corpus
    token_freq = Counter()

    def update_token_frequency(text):
        """Add the tokens of one document to `token_freq`.

        Tokens whose lemma is in `words_to_filter`, and digit tokens, are skipped.
        `token_freq` is mutated in place (never rebound), so no `global` is needed.
        """
        token_freq.update(
            token.text
            for token in nlp(text)
            if token.lemma_ not in words_to_filter and not token.is_digit
        )

    # Called for its side effect on token_freq; the returned Series is discarded
    df['lemmatized_contenu'].progress_apply(update_token_frequency)

    # Find the top 10 most frequent tokens
    top_10_tokens = token_freq.most_common(10)

    # Print the top 10 most frequent tokens
    print("Top 10 most frequent tokens:")
    for token, freq in top_10_tokens:
        print(f"{token}: {freq}")


## Second filtering with nltk

The spaCy tokenization is better than the one I used in R (we run the LDA in R). The R tokenizer is equivalent to NLTK's: for instance, aujourd'hui is split into aujourd + ' + hui. So I need to run a second filter based on the NLTK tokenization.

In [18]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

# Ensure NLTK resources are downloaded (needed for tokenization)
# NOTE(review): newer NLTK releases reportedly load the tokenizer from 'punkt_tab' rather than 'punkt' — confirm against the installed version
nltk.download('punkt')

# Second-pass filter: re-tokenize with NLTK, lowercase, drop filtered words and digits
def nltk_tokenize_and_filter(text):
    """Return `text` re-tokenized by NLTK (French), lowercased and filtered.

    A token is dropped when its lowercase form is in the module-level
    `words_to_filter` or when the token consists only of digits.
    The surviving lowercase tokens are joined with single spaces.
    """
    kept = []
    for raw_token in word_tokenize(text, language='french'):
        lowered = raw_token.lower()
        if raw_token.isdigit() or lowered in words_to_filter:
            continue
        kept.append(lowered)
    return ' '.join(kept)

# Second filtering pass over the spaCy-lemmatized text.
# The 'lemmatized_contenu' column is overwritten in place, so re-running this cell filters already-filtered text again.
df['lemmatized_contenu'] = df['lemmatized_contenu'].progress_apply(nltk_tokenize_and_filter)

# Note: no separate 'filtered_contenu' column is created; the result replaces 'lemmatized_contenu'.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/488 [00:00<?, ?it/s]

## Checking words frequency based on nltk tokenization

In [19]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd

if compute_freq:
    # Optional diagnostic: frequency of the NLTK tokens left in 'lemmatized_contenu'.

    # Ensure you've downloaded the NLTK tokenizer models
    nltk.download('punkt')

    # Frequency of each lowercase token across the whole corpus
    token_freq = Counter()

    # Extra words to ignore in this diagnostic only (empty: count everything).
    # Deliberately NOT named `words_to_filter`: rebinding that notebook-wide name here
    # would wipe the filter list used by the cleaning cells on any later re-run.
    freq_words_to_exclude = set()

    def update_token_frequency_nltk(text):
        """Add the lowercase NLTK tokens of one document to `token_freq`.

        Digit-only tokens and tokens listed in `freq_words_to_exclude` are skipped.
        `token_freq` is mutated in place (never rebound), so no `global` is needed.
        """
        for token in word_tokenize(text, language='french'):
            lowered = token.lower()
            if lowered not in freq_words_to_exclude and not token.isdigit():
                token_freq[lowered] += 1

    # Called for its side effect on token_freq; the returned Series is discarded
    df['lemmatized_contenu'].apply(update_token_frequency_nltk)

    # Find the top 10 most frequent tokens
    top_10_tokens = token_freq.most_common(10)

    # Print the top 10 most frequent tokens
    print("Top 10 most frequent tokens using NLTK tokenization:")
    for token, freq in top_10_tokens:
        print(f"{token}: {freq}")


# Export

In [20]:
# Destination of the lemmatized corpus (intermediate data directory)
fichier = "../data/intermediate/base_lemmatized.csv"

# Same CSV dialect as the input file (semicolon-separated, double-quoted); the index is not written
df.to_csv(
    fichier,
    sep=';',
    quotechar='"',
    index=False,
)