In [1]:
import pandas as pd
import spacy

from tqdm.notebook import tqdm
tqdm.pandas()

compute_freq = False


Ce code réalise une lemmatization à partir d'un modèle de machine learning issu de spacy.
installation du modèle : 
> python -m spacy download fr_core_news_sm

Ensuite on supprime certains mots fréquents, remis en forme et trouvés grâce à :
> from nltk.probability import FreqDist

Les nombres sont aussi filtrés.

J'ai réalisé cette approche car un  cleaning manuel était trop long (code en n² où n est le nombre de mots total du corpus). Le défaut de cette aproche est que potentiellement est elle trop bourrine et certains mots spécifiques qui nous intéressent ont disparu.

Il faut valider par une approche lexico et LDA les résultats => cf le code analyse_lexico.qmd

# Opening data

In [2]:
fichier = "../data/intermediate/base_merged.csv"

df = pd.read_csv(fichier, sep=';', quotechar='"')

df.sample(5)

Unnamed: 0,ID ASSO,ID ARTICLE,contenu,Type de document,Auteur,Date,Titre,URL
456,13,130086,Kathleen Stock se dit « modérée » alors que de...,blog,TRADFEM,31/05/2023,Kathleen Stock se dit « modérée » alors que de...,https://tradfem.wordpress.com/2023/05/31/kathl...
66,17,170038,Qu’il semble loin le jour où les femmes n’auro...,analyse,,,,
157,21,210027,Traduction de cet article écrit par Feminism I...,analyse,résistance lesbienne,2022-08-14,Quels choix personnels peuvent être critiqués?,
20,12,120021,"Le féminisme critique du genre, c’est quoi ?Le...",opinion,Dora Moutot & Marguerite Stern,18/10/2022,"Le féminisme critique du genre, c’est quoi ?",https://www.femelliste.com/articles-femellisme...
306,20,200078,Effets secondaires de la testostérone chez les...,Article d'opinion,,,,https://cryforrecognition.be/fr/effets-seconda...


# Cleaning and tokenisation

## Lemmatization with Spacy

We also do some first filtering with Spacy

In [3]:
# Load the French language model
nlp = spacy.load("fr_core_news_sm")

# Define your list of words to exclude
with open("../data/intermediate/words_to_filter.txt", "r", encoding="utf-8") as file:
    words_to_filter = file.read().splitlines()

tran_meaning = []

# Function to tokenize, filter, and lemmatize text, excluding numbers
def tokenize_filter_and_lemmatize(text):
    # Process the text using spaCy
    doc = nlp(text)
    # Extract lemmatized forms of the words, excluding specific tokens and numbers
    for token in doc:
        if token.lemma_ == "tran":
            tran_meaning.append(token.text)
    lemmatized = " ".join([token.lemma_ for token in doc if token.lemma_ not in words_to_filter and not token.is_digit])
    return lemmatized

# Apply the function to the 'contenu' column
df['lemmatized_contenu'] = df['contenu'].progress_apply(tokenize_filter_and_lemmatize)

# Print the dataframe to see the result
df.sample(5)

  0%|          | 0/488 [00:00<?, ?it/s]

Unnamed: 0,ID ASSO,ID ARTICLE,contenu,Type de document,Auteur,Date,Titre,URL,lemmatized_contenu
458,13,130088,La gauche a trahi les femmes en Espagne – elle...,blog,TRADFEM,31/05/2023,La gauche a trahi les femmes en Espagne – elle...,https://tradfem.wordpress.com/2023/05/31/la-ga...,gauche trahir femme espagne maintenant face co...
34,17,170006,Le 6 avril 2016 est un jour de victoire pour ...,analyse,,,,,avril jour victoire féministe allié progress...
59,17,170031,La mobilisation internationale pour retrouver ...,analyse,,,,,mobilisation international retrouver lycéen ni...
56,17,170028,Alors bien sûr il y a eu l’avis de la Commissi...,analyse,,,,,sûr l’ avis commission droit l’ homme avis n’ ...
53,17,170025,"11 août 2014 actualisation\n Dans ce texte, j...",analyse,,,,,août actualisation \n texte n’ prétention d...


In [4]:
tran_meaning

['trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'Trans',
 'Trans',
 'trans',
 'trans',
 'Trans',
 'trans',
 'Trans',
 'trans',
 'trans',
 'trans',
 'Trans',
 'trans',
 'trans',
 'trans',
 'Trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',
 'trans',


## Checking Words frequency according to Spacy tokenization

In [5]:
from collections import Counter
import spacy

if compute_freq:
    # Load the French language model
    nlp = spacy.load("fr_core_news_sm")

    # Your list of words to filter and the tokenize_filter_and_lemmatize function should be defined here

    # Initialize a Counter object to hold the frequency of each token
    token_freq = Counter()

    # Function to update token frequency for a single document
    def update_token_frequency(text):
        global token_freq  # Reference the global Counter object
        # Process the text with spaCy
        doc = nlp(text)
        # Update the Counter with tokens from this document, excluding filtered words and numbers
        token_freq.update([token.text for token in doc if token.lemma_ not in words_to_filter and not token.is_digit])

    # Apply the function to each row in the 'lemmatized_contenu' column to update the global token frequency
    df['lemmatized_contenu'].progress_apply(update_token_frequency)

    # Find the top 10 most frequent tokens
    top_10_tokens = token_freq.most_common(10)

    # Print the top 10 most frequent tokens
    print("Top 10 most frequent tokens:")
    for token, freq in top_10_tokens:
        print(f"{token}: {freq}")


## Second filtering with nltk

the tokenization of spacy is better than the one I used in R (we do the LDA in R). The one in R is equivalent to the one of nltk. And for instance aujourd'hui appears as aujourd + ' + hui. So i need to run a second filter based on nltk tokenization.

In [6]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

# Ensure NLTK resources are downloaded (needed for tokenization)
nltk.download('punkt')

# Function to tokenize with NLTK and filter based on your criteria
def nltk_tokenize_and_filter(text):
    # Tokenize the text with NLTK, ensuring the text is treated as French
    tokens = word_tokenize(text, language='french')
    # Filter tokens: convert to lowercase, exclude if in words_to_filter or is a digit
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in words_to_filter and not token.isdigit()]
    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# Assuming 'df' is your DataFrame and 'lemmatized_contenu' contains the text to process
# Apply the function to each row in the 'lemmatized_contenu' column
df['lemmatized_contenu'] = df['lemmatized_contenu'].progress_apply(nltk_tokenize_and_filter)

# This results in a new column 'filtered_contenu' in your dataframe 'df' with the text processed as per your requirements


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/488 [00:00<?, ?it/s]

## Checking words frequency based on nltk tokenization

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd

if compute_freq:
    # Ensure you've downloaded the NLTK tokenizer models
    nltk.download('punkt')

    # Your DataFrame 'df' should already be loaded with the 'lemmatized_contenu' column ready

    # Initialize a Counter object for token frequencies
    token_freq = Counter()

    # Define your list of words to exclude, adjusted for your context
    words_to_filter = set([
        # Your list of words to filter
    ])

    # Function to tokenize and update token frequency for a single document using NLTK
    def update_token_frequency_nltk(text):
        global token_freq  # Reference the global Counter object
        # Tokenize the text using NLTK
        tokens = word_tokenize(text, language='french')
        # Update the Counter with tokens from this document, excluding filtered words and numbers
        token_freq.update([token.lower() for token in tokens if token.lower() not in words_to_filter and not token.isdigit()])

    # Apply the function to each row in the 'lemmatized_contenu' column to update the global token frequency
    df['lemmatized_contenu'].apply(update_token_frequency_nltk)

    # Find the top 10 most frequent tokens
    top_10_tokens = token_freq.most_common(10)

    # Print the top 10 most frequent tokens
    print("Top 10 most frequent tokens using NLTK tokenization:")
    for token, freq in top_10_tokens:
        print(f"{token}: {freq}")


# Export

In [8]:
fichier = "../data/intermediate/base_lemmatized.csv"
df.to_csv(fichier, sep=';', index=False, quotechar='"')