## PART 0 : IMPORTATION OF LIBRARIES

In [None]:
# Importing the necessary libraries
import pandas as pd
import re
import string
import spacy
import nltk

# Download the NLTK data packages used by this notebook
nltk.download('punkt')  # Punkt tokenizer model (used by NLTK's word_tokenize)
nltk.download('stopwords')  # Stopword corpus; stopwords.words('french') is read in Part 2

# Importing specific modules from NLTK
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk.stem import SnowballStemmer

# Importing additional modules
from sklearn.model_selection import train_test_split
from collections import Counter

# Download the small French spaCy pipeline (shell command) and load it into `nlp`.
# NOTE(review): `nlp` is loaded again at the start of Part 2 with the parser and
# NER components disabled, which replaces this instance.
!python -m spacy download fr_core_news_sm # to be executed only once
nlp = spacy.load('fr_core_news_sm')

## PART 1 : IMPORTATION OF DATA AND CLEANING

In [97]:
# Locations of the two raw tweet exports (semicolon-separated CSV files)
bdd_left_link, bdd_right_link = "Tweet2.csv", "Tweet1.csv"

# Load both files; malformed lines are skipped instead of raising an error
tweet_df_left = pd.read_csv(bdd_left_link, sep=';', on_bad_lines='skip')
tweet_df_right = pd.read_csv(bdd_right_link, sep=';', on_bad_lines='skip')

In [98]:
# Keep only the tweet text and expose it as a one-column frame named 'content'
tweet_df_left = tweet_df_left['data__text'].to_frame(name='content')
tweet_df_right = tweet_df_right['data__text'].to_frame(name='content')

In [99]:
# Remove repeated tweets first, then the rows whose text is missing
tweet_df_left = tweet_df_left.drop_duplicates(subset='content').dropna(subset='content')
tweet_df_right = tweet_df_right.drop_duplicates(subset='content').dropna(subset='content')

### Let's look at the data

In [None]:
# Render the cleaned left DataFrame (the notebook shows the cell's last expression)
tweet_df_left

In [None]:
# Render the cleaned right DataFrame (the notebook shows the cell's last expression)
tweet_df_right

Since we can rely on the default index of a DataFrame, we only need to keep the content of each tweet.

We can also notice that there is a big difference in the number of tweets between the left and right datasets. We should therefore find a way to keep the model unbiased.

# PART 2 : Cleaning the data (url, punctuation, lemmatisation, etc.)

In [None]:
# Reload the French spaCy pipeline with the parser and NER components disabled.
# The processing function below only reads token.text, token.is_alpha and token.lemma_.
nlp = spacy.load('fr_core_news_sm', disable=["parser", "ner"])

# === Precompile regular expressions ===
# Matches 'http://', 'https://' or 'www.' followed by a run of non-whitespace characters
url_pattern = re.compile(r'(https?://|www\.)\S+')  # Pattern to detect URLs

# === Tokenizer to handle words, hashtags, mentions, and punctuation ===
# NOTE(review): `tokenizer` is not referenced by the functions defined in this cell
# (they tokenize with spaCy), and the name is re-bound to a Keras Tokenizer in the RNN section.
token_pattern = r'\w+|#[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|@[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|[^\w\s]+'
tokenizer = RegexpTokenizer(token_pattern)

# === Load French stopwords into a set for fast lookup ===
french_stopwords = set(stopwords.words('french'))

# === Initialize the French stemmer ===
# Applied to spaCy lemmas in process_text_batch_with_stemming
stemmer = SnowballStemmer("french")


def remove_urls(text):
    """Strip every URL from ``text``.

    A URL is whatever the module-level ``url_pattern`` matches: ``http://``,
    ``https://`` or ``www.`` followed by a run of non-whitespace characters.

    Args:
        text (str): Raw text, for example the content of a tweet.

    Returns:
        str: ``text`` with each matched URL replaced by the empty string.
    """
    cleaned = re.sub(url_pattern, '', text)
    return cleaned


def process_text_batch_with_stemming(texts):
    """Tokenize, lemmatize and stem a batch of texts, keeping mentions and hashtags.

    Every text is run through the spaCy pipeline ``nlp``. For each token:

    * a token starting with '@' is kept verbatim in all three outputs;
    * a token starting with '#' is not emitted itself: it only marks the token
      that immediately follows it as a hashtag word;
    * an alphabetic token is lower-cased and dropped if it is a French stopword;
      otherwise its lower-cased text, its spaCy lemma and the Snowball stem of
      that lemma are emitted, each prefixed with '#' when the word directly
      follows a '#' token;
    * any other token (digits, punctuation, ...) is discarded.

    The hashtag marker is consumed by the very next token, whatever that token
    is. It used to stay armed until the next non-stopword alphabetic token, so
    "# le monde" or "# 12 ville" tagged an unrelated later word as a hashtag.

    NOTE(review): this relies on spaCy emitting '#' as a token of its own,
    separate from the hashtag word -- confirm for the tokenizer in use.

    Args:
        texts (list of str): Texts to process.

    Returns:
        list of tuples: One ``(tokens, lemmas, stems)`` tuple per input text,
        in input order; the three lists always have the same length.
    """
    results = []

    # nlp.pipe processes the texts in batches instead of one call per text
    for doc in nlp.pipe(texts, batch_size=50):
        tokens, lemmas, stems = [], [], []
        follows_hash = False  # True only for the token right after a '#'

        for token in doc:
            token_text = token.text

            if token_text.startswith('#'):
                follows_hash = True
                continue

            # Consume the marker now so that it can never leak past this token
            prefix = '#' if follows_hash else ''
            follows_hash = False

            # Mentions are preserved as they are (no lower-casing, no stemming)
            if token_text.startswith('@'):
                tokens.append(token_text)
                lemmas.append(token_text)
                stems.append(token_text)
                continue

            # Only alphabetic, non-stopword tokens are kept
            if not token.is_alpha:
                continue
            word = token_text.lower()
            if word in french_stopwords:
                continue

            lemma = token.lemma_
            tokens.append(prefix + word)
            lemmas.append(prefix + lemma)
            stems.append(prefix + stemmer.stem(lemma))

        results.append((tokens, lemmas, stems))

    return results

In [None]:
def tokenize_tweets(df):
    """Clean and tokenize every tweet of a DataFrame, in place.

    Steps:
    1. Remove URLs from the 'content' column (result stored in 'content_clean').
    2. Process the cleaned texts in one batch.
    3. Store the resulting tokens, lemmas and stems in new columns.

    An empty DataFrame is supported: the new columns are then created empty.

    Args:
        df (pd.DataFrame): DataFrame containing a 'content' column with tweets.
            It is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, with the extra columns
        'content_clean', 'tokens', 'lemmas' and 'stem'.
    """
    # Step 1: Remove URLs
    df['content_clean'] = df['content'].apply(remove_urls)

    # Step 2: Process text in batches
    processed = process_text_batch_with_stemming(df['content_clean'].tolist())

    # Step 3: Split the (tokens, lemmas, stems) tuples into three columns.
    # zip(*processed) yields nothing for an empty batch, so unpacking it into
    # three names would raise ValueError; fall back to three empty sequences.
    tokens, lemmas, stems = zip(*processed) if processed else ((), (), ())

    for column, values in (('tokens', tokens), ('lemmas', lemmas), ('stem', stems)):
        # An object Series keeps one Python list per row, even when there are no rows
        df[column] = pd.Series(list(values), index=df.index, dtype=object)

    return df

In [None]:
# Tokenize and clean the tweets of the left and right datasets.
# tokenize_tweets adds its columns to the DataFrame it receives, so both frames are updated in place.
# NOTE(review): the second call is the cell's last expression, so its return value (the right DataFrame) is rendered as output.

tokenize_tweets(tweet_df_left)
tokenize_tweets(tweet_df_right)


In [None]:
#print(tweet_df_left['tokens'])

In [None]:
#print(tweet_df_left['lemmas'])

In [None]:
#print(tweet_df_left['stem'])

The following cells allow you to visualize how the dataset was reduced through tokenization, lemmatization, and stemming.

In [None]:
# def count_different_tokens (column,unique_character_set, dict_of_occurences_counter):
#     """ count_different_tokens counts the different tokens of a dataset with the set
#         and gives their occurences with the counter.

#         Args :
#             column (df column) : column containing the tokens
#             unique_character_set (set) : empty set
#             dict_of_occurences_counter (counter) : counter dictionnary to count the occurences of tokens
#     """
#     for row in column :
#       unique_character_set.update(row)
#       dict_of_occurences_counter.update(row)

#     return unique_character_set, dict_of_occurences_counter

# left_df_tokens_set, left_df_lemmas_set, left_df_stem_set = set(), set(), set()
# right_df_tokens_set, right_df_lemmas_set, right_df_stem_set = set(), set(), set()


# left_df_tokens_counter, left_df_lemmas_counter, left_df_stem_counter = Counter(), Counter(), Counter ()
# right_df_tokens_counter, right_df_lemmas_counter, right_df_stem_counter = Counter(), Counter(), Counter ()

# count_different_tokens(tweet_df_left['tokens'], left_df_tokens_set, left_df_tokens_counter)
# count_different_tokens(tweet_df_right['tokens'], right_df_tokens_set, right_df_tokens_counter)

# count_different_tokens(tweet_df_left['lemmas'], left_df_lemmas_set, left_df_lemmas_counter)
# count_different_tokens(tweet_df_right['lemmas'], right_df_lemmas_set, right_df_lemmas_counter)

# count_different_tokens(tweet_df_left['stem'], left_df_stem_set, left_df_stem_counter)
# count_different_tokens(tweet_df_right['stem'], right_df_stem_set, right_df_stem_counter)


In [109]:
# print(f"Nombre de caractères uniques dans tokens left: {len(left_df_tokens_set)}")
# print(f"Dictionnaire des occurrences des tokens left : {left_df_tokens_counter}")
# print(f"Nombre de caractères uniques dans lemmas left: {len(left_df_lemmas_set)}")
# print(f"Dictionnaire des occurrences des lemmes left: {left_df_lemmas_counter}")
# print(f"Nombre de caractères uniques dans stems left: {len(left_df_stem_set)}")
# print(f"Dictionnaire des occurrences des stems left: {left_df_stem_counter}")

# Part 3 : Creating the  n-grams and the combined df

To create n-grams, list the desired values of n in `n_list`.
Be careful: the later cells hard-code the `2_grams` to `5_grams` column names, so the model code does not adapt automatically when `n_list` changes.

In [110]:
def n_grams(list_sentence, n):
    """Return the n-grams of ``list_sentence`` as a list of tuples.

    Args:
        list_sentence (list): Sequence of words (here: the stems of one tweet).
        n (int): Size of each n-gram.

    Returns:
        list of tuple: The n-grams produced by ``nltk.ngrams``, in order.
    """
    grams = nltk.ngrams(list_sentence, n)
    return [gram for gram in grams]

def generate_ngrams(df, n_list):
    """Add one ``"<n>_grams"`` column to ``df`` for every n in ``n_list``.

    Each new column holds, per row, the n-grams built from that row's 'stem'
    list. ``df`` is modified in place and nothing is returned.
    """
    stems = df['stem']
    for size in n_list:
        column_name = f"{size}_grams"
        # `size=size` freezes the current loop value inside the lambda
        df[column_name] = stems.map(lambda words, size=size: n_grams(words, size))

# Define n values for n-gram generation.
# NOTE: later cells refer to the resulting columns by name ("2_grams" ... "5_grams"),
# so changing this list requires updating those cells as well.
n_list = [2, 3, 4, 5]

# Apply n-gram generation to both datasets (adds the "<n>_grams" columns in place)
generate_ngrams(tweet_df_left, n_list)
generate_ngrams(tweet_df_right, n_list)


In [111]:
# Label each tweet with its origin: 1 for left tweets, 0 for right tweets.
# The label is written onto the source frames, so it is also available when
# they are split into train/test sets in Part 4.
tweet_df_left['isLeft'] = 1
tweet_df_right['isLeft'] = 0

# Stack both frames into a single DataFrame with a fresh 0..n-1 index.
# pd.concat builds the result itself, so no empty DataFrame has to be created first.
df_combined = pd.concat([tweet_df_left, tweet_df_right], ignore_index=True)

df_combined

# Part 4 : Preparing for training

In [113]:
# Split left and right DataFrames into training and test sets.
# NOTE(review): the right set sends a larger share to the test split (0.55 vs 0.2),
# presumably to bring the right training set closer in size to the left one -- confirm.
tweet_df_left_train, tweet_df_left_test = train_test_split(tweet_df_left, test_size=0.2, random_state=42)
tweet_df_right_train, tweet_df_right_test = train_test_split(tweet_df_right, test_size=0.55, random_state=42)

# Select specific columns for training and test sets
columns_to_keep = ["2_grams", "3_grams", "4_grams", "5_grams", "isLeft", "stem"]

# Take explicit copies: later cells add columns to these frames, and assigning
# to a bare column selection can trigger pandas' SettingWithCopyWarning.
tweet_df_left_train = tweet_df_left_train[columns_to_keep].copy()
tweet_df_left_test = tweet_df_left_test[columns_to_keep].copy()
tweet_df_right_train = tweet_df_right_train[columns_to_keep].copy()
tweet_df_right_test = tweet_df_right_test[columns_to_keep].copy()

# Keep only relevant columns in the combined dataset (without labels)
df_combined = df_combined[["2_grams", "3_grams", "4_grams", "5_grams", "stem"]]

In [114]:
# Render the right DataFrame with all the columns added so far
tweet_df_right

Unnamed: 0,content,content_clean,tokens,lemmas,stem,2_grams,3_grams,4_grams,5_grams,isLeft
0,"Dans cette vidéo, je dénonçais l'arnaque des b...","Dans cette vidéo, je dénonçais l'arnaque des b...","[cette, vidéo, dénonçais, arnaque, bons, senti...","[ce, vidéo, dénoncer, arnaque, bon, sentiment,...","[ce, vidéo, dénonc, arnaqu, bon, sent, mati, i...","[(ce, vidéo), (vidéo, dénonc), (dénonc, arnaqu...","[(ce, vidéo, dénonc), (vidéo, dénonc, arnaqu),...","[(ce, vidéo, dénonc, arnaqu), (vidéo, dénonc, ...","[(ce, vidéo, dénonc, arnaqu, bon), (vidéo, dén...",0
1,Je démonte les idées reçues sur l'immigration ...,Je démonte les idées reçues sur l'immigration,"[démonte, idées, reçues, immigration]","[démonter, idée, recevoir, immigration]","[démont, idé, recevoir, immigr]","[(démont, idé), (idé, recevoir), (recevoir, im...","[(démont, idé, recevoir), (idé, recevoir, immi...","[(démont, idé, recevoir, immigr)]",[],0
2,data__text,data__text,[],[],[],[],[],[],[],0


# Models

In [115]:
def tuple_to_string(list_of_tuples):
    """Flatten a list of n-grams into one string.

    The words of each n-gram are glued together with '_' and the resulting
    pieces are separated by single spaces, e.g.
    ``[('a', 'b'), ('c', 'd')]`` becomes ``'a_b c_d'``.
    """
    pieces = []
    for gram in list_of_tuples:
        pieces.append('_'.join(gram))
    return ' '.join(pieces)

# Apply function to convert n-grams to string format for each dataset.
# For every row, the four n-gram lists are each turned into a string by tuple_to_string
# and the four strings are joined with spaces into the new "total_n_grams" column.
# The column is added to the four split frames themselves (through the loop variable `df`).
for df in [tweet_df_left_train, tweet_df_left_test, tweet_df_right_train, tweet_df_right_test]:
    df["total_n_grams"] = df[["2_grams", "3_grams", "4_grams", "5_grams"]].apply(
        lambda row: ' '.join(row.apply(tuple_to_string)), axis=1
    )

# Concatenate left and right datasets to form train and test sets (index renumbered from 0)
df_train = pd.concat([tweet_df_left_train, tweet_df_right_train], ignore_index=True)
df_test = pd.concat([tweet_df_left_test, tweet_df_right_test], ignore_index=True)


In [None]:
# Preview the first three rows of the test set
df_test.head(3)

## Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [117]:
# Bag-of-words features over the "total_n_grams" strings.
# ngram_range=(1, 1): the vectorizer only counts single tokens of that string.
# NOTE(review): this presumably keeps each '_'-joined n-gram as one feature -- confirm
# against CountVectorizer's default token_pattern.
vectorizer = CountVectorizer(ngram_range=(1, 1))
# The vocabulary is learned on the training set only, then reused for the test set
X_train = vectorizer.fit_transform(df_train["total_n_grams"])
y_train = df_train["isLeft"]
X_test = vectorizer.transform(df_test["total_n_grams"])
y_test = df_test["isLeft"]

In [None]:
# Model initialisation (default hyper-parameters)
model = LogisticRegression()

# Model training on the vectorized training set
model.fit(X_train, y_train)

In [None]:
# Predict the labels of the test set and print the share of correct predictions
ypred = model.predict(X_test)
print(accuracy_score(y_test,ypred))

## RNN

In [120]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
# Scan the 'stem' column of the training set to size the RNN inputs:
#   unique_words -> set of distinct stems
#   max_length   -> number of stems in the longest tweet (0 if there are no rows)
unique_words = set().union(*df_train['stem'])
max_length = max(map(len, df_train['stem']), default=0)

# Report both figures
print(f"Number of unique words: {len(unique_words)}")
print(f"Maximum row length: {max_length}")


In [128]:
# Define vocabulary size, maximum sequence length, and embedding dimension.
# The Keras Tokenizer reserves index 0 (used for padding) and, when oov_token is set,
# index 1 for out-of-vocabulary words; it only keeps word indices below num_words.
# Hence the +2: with len(unique_words) alone, the two rarest training stems would
# silently be mapped to <OOV>. The same value sizes the Embedding layer, so every
# index the tokenizer can emit stays inside the embedding table.
max_vocab_size = len(unique_words) + 2
max_sequence_length = max_length
embedding_dim = 100

In [130]:
# Initialize and fit the tokenizer on the training data.
# NOTE: this re-binds the name `tokenizer`, which referred to the NLTK RegexpTokenizer in Part 2.
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")  # Handle out-of-vocabulary words
tokenizer.fit_on_texts(df_train['stem'])  # each element of 'stem' is already a list of stems

# Convert text to sequences (the test set reuses the vocabulary fitted on the training set)
X_train_seq = tokenizer.texts_to_sequences(df_train['stem'])
X_test_seq  = tokenizer.texts_to_sequences(df_test['stem'])

# Pad sequences to ensure uniform input size; padding and truncation both happen at the end
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post', truncating='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post', truncating='post')


In [None]:
# === Build the RNN Model ===
# NOTE: this re-binds `model`, which held the logistic regression fitted above.
model = Sequential()

# Embedding layer to convert word indices into dense vectors
model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# Bidirectional LSTM to capture dependencies in both directions
model.add(Bidirectional(LSTM(64)))

# Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# === Compile the Model ===
optimizer = Adam(learning_rate=0.001)

# Reduce learning rate when validation loss stops improving:
# halve it after 3 epochs without improvement, never going below 1e-6
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Display model architecture
model.summary()

# Early stopping to prevent unnecessary training when validation loss stops improving
# (currently disabled; only lr_scheduler is passed to model.fit)
#early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)

In [None]:
# Training of the model: 12 epochs, batches of 32, with 20% of the training data
# held out for validation (its loss drives lr_scheduler).
# `y_train` is the label Series defined in the logistic regression section.
history = model.fit(X_train_pad, y_train, batch_size=32, epochs=12, validation_split=0.2, callbacks=[lr_scheduler])

In [None]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test accuracy:", accuracy)

## LLM's solution

### GPT

#### ChatGpt LinearRegression

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

###############################################################################
# 1) Example transformation function merging 2_grams and 3_grams
###############################################################################
def convert_ngrams_to_string(row):
    """
    Turn the 2-grams and 3-grams of one row into a single string that
    CountVectorizer can consume.

    ``row["2_grams"]`` and ``row["3_grams"]`` are lists of tuples, e.g.
    ``[('avec', 'yoyo'), ('je', 'mange'), ...]``. Every tuple becomes one
    '_'-joined token ('avec_yoyo'); the 2-gram tokens come first, then the
    3-gram tokens, all separated by single spaces.
    """
    joined = [
        "_".join(gram)
        for column in ("2_grams", "3_grams")
        for gram in row[column]
    ]
    return " ".join(joined)


###############################################################################
# 2) Preparation of the training and test data
###############################################################################
# The following DataFrames are assumed to be defined already:
#   tweet_df_left_train, tweet_df_left_test,
#   tweet_df_right_train, tweet_df_right_test
#
# each of them containing at least the columns: ["2_grams", "3_grams", "isLeft"].

# Concatenate the left and right frames into one training and one test set
df_train = pd.concat([tweet_df_left_train, tweet_df_right_train], ignore_index=True)
df_test  = pd.concat([tweet_df_left_test, tweet_df_right_test], ignore_index=True)

# Apply the conversion function to create a new text column
df_train["text_ngrams"] = df_train.apply(convert_ngrams_to_string, axis=1)
df_test["text_ngrams"]  = df_test.apply(convert_ngrams_to_string, axis=1)

# X and y for training
X_train_text = df_train["text_ngrams"]  # text data that can be vectorized
y_train = df_train["isLeft"]            # binary target (0 or 1)

# X and y for testing
X_test_text = df_test["text_ngrams"]
y_test = df_test["isLeft"]


###############################################################################
# 3) Vectorization (CountVectorizer) + model training (LinearRegression)
###############################################################################
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec  = vectorizer.transform(X_test_text)

# Instantiate and train the linear regression
model = LinearRegression()
model.fit(X_train_vec, y_train)

###############################################################################
# 4) Predictions and evaluation
###############################################################################
y_pred = model.predict(X_test_vec)

# For the linear regression, compute MSE and R²
mse = mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)

print(f"Mean Squared Error : {mse:.4f}")
print(f"R^2 Score          : {r2:.4f}")

# Convert the continuous prediction into a 0/1 class with a threshold at 0.5
y_pred_class = [1 if val >= 0.5 else 0 for val in y_pred]

# Share of correct classifications (accuracy), as a simple score.
# The list is compared element by element with the y_test Series.
accuracy = sum(y_pred_class == y_test) / len(y_test)
print(f"Accuracy (avec seuil 0.5) : {accuracy:.4f}")

In [None]:
# Count the distinct 2-grams and 3-grams that occur in the test set
count_set = {
    n_gram
    for column in ('2_grams', '3_grams')
    for row in df_test[column]
    for n_gram in row
}

print(len(count_set))

#### ChatGpt LogisticRegression

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

###############################################################################
# 1) Example transformation function: lists of tuples -> string
###############################################################################
def convert_ngrams_to_string(row):
    """
    Build the text representation of one row for CountVectorizer.

    The tuples found in ``row["2_grams"]`` and then in ``row["3_grams"]``
    (e.g. ``('avec', 'yoyo')``) are each joined with '_' and the resulting
    tokens are concatenated with single spaces.
    """
    pieces = []
    for grams in (row["2_grams"], row["3_grams"]):
        pieces.extend(map("_".join, grams))
    return " ".join(pieces)

###############################################################################
# 2) The following DataFrames are assumed to be loaded already:
#    tweet_df_left_train, tweet_df_left_test, tweet_df_right_train, tweet_df_right_test
#    each of them containing at least the columns ["2_grams", "3_grams", "isLeft"].
###############################################################################
# They are combined into one global train set and one global test set

# Merge the DataFrames for training and for testing
df_train = pd.concat([tweet_df_left_train, tweet_df_right_train], ignore_index=True)
df_test  = pd.concat([tweet_df_left_test, tweet_df_right_test], ignore_index=True)

# Apply the conversion function to create a text column
df_train["text_ngrams"] = df_train.apply(convert_ngrams_to_string, axis=1)
df_test["text_ngrams"]  = df_test.apply(convert_ngrams_to_string, axis=1)

# X / y split for training
X_train = df_train["text_ngrams"]
y_train = df_train["isLeft"]  # binary: 0 or 1

# X / y split for testing
X_test = df_test["text_ngrams"]
y_test = df_test["isLeft"]

###############################################################################
# 3) Vectorization + model training (LogisticRegression)
###############################################################################
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec  = vectorizer.transform(X_test)

# Instantiate the logistic regression
model = LogisticRegression(max_iter=1000)  # max_iter can be increased if needed
model.fit(X_train_vec, y_train)

###############################################################################
# 4) Prediction and evaluation
###############################################################################
y_pred = model.predict(X_test_vec)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {accuracy:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Matrice de confusion :")
print(cm)

# Classification report (precision, recall, f1-score)
print("Rapport de classification :")
print(classification_report(y_test, y_pred))


### Deepseek

#### Deepseek regression

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression

# Convert the n-gram tuples of a column into strings
def process_ngram_column(df, column_name):
    """Return ``df[column_name]`` with every n-gram tuple joined by '_'.

    Args:
        df (pd.DataFrame): Frame holding the n-gram column.
        column_name (str): Name of a column whose cells are lists of tuples
            of strings.

    Returns:
        pd.Series: Same index as ``df``; each cell is a list of '_'-joined
        strings. Cells that are not lists (e.g. missing values) become ``[]``.
    """
    def join_grams(cell):
        if not isinstance(cell, list):
            return []
        # The original comprehension joined an undefined name (`logram`)
        # instead of the loop variable, raising NameError on any non-empty list.
        return ['_'.join(gram) for gram in cell]

    return df[column_name].apply(join_grams)

# Apply the preprocessing to all the DataFrames.
# NOTE: the helper columns are added to the shared train/test frames themselves.
for df in [tweet_df_left_train, tweet_df_left_test, tweet_df_right_train, tweet_df_right_test]:
    df["2_grams_str"] = process_ngram_column(df, "2_grams")
    df["3_grams_str"] = process_ngram_column(df, "3_grams")
    df["all_ngrams"] = df["2_grams_str"] + df["3_grams_str"]  # per-row list concatenation

# Combine the left and right frames (the original row labels are kept)
train_data = pd.concat([tweet_df_left_train, tweet_df_right_train])
test_data = pd.concat([tweet_df_left_test, tweet_df_right_test])

# Build a single text per tweet
train_data["text"] = train_data["all_ngrams"].apply(lambda x: " ".join(x))
test_data["text"] = test_data["all_ngrams"].apply(lambda x: " ".join(x))

# Vectorize with CountVectorizer
vectorizer = CountVectorizer(max_features=500000)  # Cap the number of features to avoid a dimensional blow-up
X_train = vectorizer.fit_transform(train_data["text"])
X_test = vectorizer.transform(test_data["text"])
y_train = train_data["isLeft"]
y_test = test_data["isLeft"]

# Train the linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction and evaluation
predictions = model.predict(X_test)
# NOTE(review): despite the label, this prints the coefficient of every feature, not only the largest ones
print("Coefficients importants :", dict(zip(vectorizer.get_feature_names_out(), model.coef_.round(2))))

from sklearn.metrics import mean_squared_error, r2_score

print(f"Mean Squared Error: {mean_squared_error(y_test, predictions):.3f}")
print(f"R² Score: {r2_score(y_test, predictions):.3f}")

# Threshold the continuous predictions at 0.5 to obtain classes, then compute the accuracy
y_pred_class = [1 if val >= 0.5 else 0 for val in predictions]
accuracy = sum(y_pred_class == y_test) / len(y_test)
print(f"Accuracy (avec seuil 0.5) : {accuracy:.4f}")

#### DeepSeek classification

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# ----------------------------------------------------------
# 1. N-gram preprocessing function
# ----------------------------------------------------------

def process_ngrams(df):
    """Return a copy of ``df`` with string n-grams and a combined text column.

    In the copy, the '2_grams' and '3_grams' cells (lists of tuples) are
    replaced by lists of '_'-joined strings; cells that are not lists become
    ``[]``. Two columns are added: 'all_ngrams' (2-grams followed by 3-grams)
    and 'text' (the items of 'all_ngrams' joined with spaces). The input
    frame itself is left untouched.
    """
    def _join_grams(grams):
        if isinstance(grams, list):
            return ['_'.join(gram) for gram in grams]
        return []

    # Work on a copy so that the caller's frame is not modified
    result = df.copy()
    result['2_grams'] = result['2_grams'].map(_join_grams)
    result['3_grams'] = result['3_grams'].map(_join_grams)

    # Per-row list concatenation, then one space-separated string per row
    result['all_ngrams'] = result['2_grams'] + result['3_grams']
    result['text'] = result['all_ngrams'].map(' '.join)

    return result

# ----------------------------------------------------------
# 2. Apply the preprocessing to all the datasets
# ----------------------------------------------------------

# Process the training data
train_left = process_ngrams(tweet_df_left_train)
train_right = process_ngrams(tweet_df_right_train)
train_data = pd.concat([train_left, train_right], axis=0)

# Process the test data
test_left = process_ngrams(tweet_df_left_test)
test_right = process_ngrams(tweet_df_right_test)
test_data = pd.concat([test_left, test_right], axis=0)

# ----------------------------------------------------------
# 3. Feature vectorization
# ----------------------------------------------------------

vectorizer = CountVectorizer(
    max_features=1000000,  # Caps the number of features
    ngram_range=(1, 1)  # Counts single tokens of the 'text' column only
)

X_train = vectorizer.fit_transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])
y_train = train_data['isLeft']
y_test = test_data['isLeft']

# ----------------------------------------------------------
# 4. Training of the classification model
# ----------------------------------------------------------

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',  # Compensates for class imbalance
    random_state=42
)

model.fit(X_train, y_train)

# ----------------------------------------------------------
# 5. Model evaluation
# ----------------------------------------------------------

# Predictions
y_pred = model.predict(X_test)

# Metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.3f}")
print("\nMatrice de confusion:")
print(confusion_matrix(y_test, y_pred))

# ----------------------------------------------------------
# 6. Analysis of the most important features (optional)
# ----------------------------------------------------------

# Retrieve the coefficients
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

# Build an analysis DataFrame
coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefs,
    'abs_coef': abs(coefs)
})

# Top 10 features for each class: the label 1 is 'isLeft', so the largest
# positive coefficients push towards "left" and the most negative towards "right"
print("\nTop 10 features pro-Left:")
print(coef_df.sort_values('coefficient', ascending=False).head(10)[['feature', 'coefficient']])

print("\nTop 10 features pro-Right:")
print(coef_df.sort_values('coefficient', ascending=True).head(10)[['feature', 'coefficient']])

# Resources

In [None]:
# NOTE(review): running this cell re-binds `nlp` (full pipeline) and `tokenizer`
# (NLTK RegexpTokenizer), replacing the objects of the same names used earlier in the notebook.
!python -m spacy download fr_core_news_sm # Download the 'fr_core_news_sm' model
nlp = spacy.load('fr_core_news_sm')

# Precompile the URL regular expression
url_pattern = re.compile(r'(https?://|www\.)\S+')

# Define a tokenizer for words, hashtags, mentions (@) and punctuation
pattern = r'\w+|#[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|@[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|[^\w\s]+'
tokenizer = RegexpTokenizer(pattern)

# Load the French stopwords into a set
french_stopwords = set(stopwords.words('french'))

def remove_urls(text):
    """Return ``text`` with every match of ``url_pattern`` deleted."""
    cleaned, _replacements = url_pattern.subn('', text)
    return cleaned

def lemmatize(word_lowered):
    """Pair a lower-cased word with its lemma, skipping stopwords.

    Args:
        word_lowered (str): A lower-cased token.

    Returns:
        tuple or None: ``None`` when the word is a French stopword; otherwise
        ``(word, lemma)``. Hashtags and mentions (words starting with '#' or
        '@') are returned unchanged as their own lemma; any other word is
        lemmatized with the spaCy pipeline ``nlp``.
    """
    if word_lowered in french_stopwords:
        return None
    if word_lowered.startswith(('#', '@')):
        return (word_lowered, word_lowered)
    lemma = nlp(word_lowered)[0].lemma_
    return (word_lowered, lemma)

def remove_and_lemmatize(text):
    """Tokenize ``text``, drop punctuation and stopwords, and lemmatize the rest.

    Only purely alphabetic tokens and tokens starting with '#' or '@' are
    kept. Each kept token is lower-cased and handed to ``lemmatize``, which
    discards stopwords and leaves hashtags and mentions unchanged.

    Returns:
        tuple: ``(filtered, lemmatized)``, two lists of equal length holding
        the kept words and their lemmas.
    """
    filtered, lemmatized = [], []
    for word in tokenizer.tokenize(text):
        # Skip punctuation, numbers and any other non-word token
        if not (word.isalpha() or word.startswith(('#', '@'))):
            continue
        pair = lemmatize(word.lower())
        if pair is None:  # stopword
            continue
        kept_word, kept_lemma = pair
        filtered.append(kept_word)
        lemmatized.append(kept_lemma)
    return filtered, lemmatized

def tokenize_tweets(df):
    """Tokenize and clean every tweet of the DataFrame, in place.

    Adds three columns to ``df``: 'clean_content' (the ``(tokens, lemmas)``
    pair computed from the URL-free 'content'), 'tokens' and 'lemmas'.
    Nothing is returned.
    """
    # Remove URLs, then punctuation and stopwords
    without_urls = df['content'].apply(remove_urls)
    df['clean_content'] = without_urls.apply(remove_and_lemmatize)

    # Split each (tokens, lemmas) pair into two columns
    pairs = pd.DataFrame(df['clean_content'].tolist(), index=df.index)
    df[['tokens', 'lemmas']] = pairs

In [None]:
# Precompile URL regex pattern ('http://', 'https://' or 'www.' followed by non-whitespace characters)
url_pattern = re.compile(r'(https?://|www\.)\S+')

# Define tokenizer pattern for words, hashtags, mentions, and punctuation.
# NOTE(review): this re-binds the module-level name `tokenizer`.
pattern = r'\w+|#[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|@[\wàâäéèêëïîôöùûüçÀÂÄÉÈÊËÏÎÔÖÙÛÜÇ]+|[^\w\s]+'
tokenizer = RegexpTokenizer(pattern)

# Load French stopwords once and convert to a set for faster lookup
french_stopwords = set(stopwords.words('french'))

def remove_urls(text):
    """Delete URLs from a string.

    Args:
        text (str): The input text.

    Returns:
        str: ``text`` in which every match of ``url_pattern`` has been
        replaced by the empty string.
    """
    return url_pattern.sub(lambda _match: '', text)

def remove_punctuation(text):
    """Remove punctuation and stopwords, then tokenize and lemmatize.

    Tokens that are purely alphabetic or start with '#' or '@' are kept,
    lower-cased and filtered against the French stopwords. Hashtags and
    mentions are used unchanged as their own lemma; other words are
    lemmatized with the spaCy pipeline ``nlp``.

    Side effect: the lemmas of this call are appended to the module-level
    list ``mots_lem_g``. The list is created on first use, so calling the
    function no longer fails with NameError when it was never initialised.

    Args:
        text (str): The input text to be tokenized and cleaned.

    Returns:
        tuple: A tuple containing two lists of equal length:
            - A list of filtered tokens (words).
            - A list of lemmatized words.
    """
    filtered = []
    lemmatized = []
    for word in tokenizer.tokenize(text):
        # Keep words, hashtags and mentions only
        if not (word.isalpha() or word.startswith(('#', '@'))):
            continue
        word_lower = word.lower()
        if word_lower in french_stopwords:  # Remove stopwords
            continue
        filtered.append(word_lower)
        if word_lower.startswith(('#', '@')):
            lemmatized.append(word_lower)  # Keep hashtags and mentions unchanged
        else:
            lemmatized.append(nlp(word_lower)[0].lemma_)

    # Accumulate the lemmas in the global list, creating it if necessary
    globals().setdefault('mots_lem_g', []).extend(lemmatized)
    return filtered, lemmatized

def tokenize_tweets(tweets):
    """Tokenize and clean every tweet stored in a dictionary.

    Args:
        tweets (dict): Maps tweet indices to tweet data; each value is a
            dict-like object whose 'text' entry holds the tweet text (a
            missing entry is treated as an empty string).

    Returns:
        None: Each tweet is updated in place with two new entries,
        'text_tokenise' (tokens) and 'mots_lemmatises' (lemmas).
    """
    for tweet in tweets.values():
        raw_text = tweet.get('text', '')
        # URLs are removed first, then punctuation/stopwords, with lemmatization
        tweet['text_tokenise'], tweet['mots_lemmatises'] = remove_punctuation(remove_urls(raw_text))
