In [7]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Chargement des données
data = pd.read_csv("fake.csv")

# Ajout de la colonne "Label" (0 = news vérifiée, 1 = fake news)
# Si le dataset contient déjà une indication (ex: 'fake'/'real'), on le convertit
data["Label"] = data["subject"].apply(lambda x: 1 if x == "fake" else 0)

# Vérification du format des données
print(data.head())
print(data.columns)  # Vérification des colonnes après modification




                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  Label  
0  December 31, 2017      0  
1  December 31, 2017      0  
2  December 30, 2017      0  
3  December 29, 2017      0  
4  December 25, 2017      0  
Index(['title', 'text', 'subject', 'date', 'Label'], dtype='object')


In [8]:
# Affichage du nombre de lignes avant nettoyage
print(f"Nombre de lignes avant suppression des doublons : {data.shape[0]}")

# Suppression des doublons
data = data.drop_duplicates()


Nombre de lignes avant suppression des doublons : 23481


In [9]:
# Vérification après suppression des doublons
print(f"Nombre de lignes après suppression des doublons : {data.shape[0]}")

# Suppression des valeurs manquantes dans la colonne "text"
data = data.dropna(subset=['text'])

# Vérification après suppression des valeurs manquantes
print(f"Nombre de lignes après suppression des valeurs manquantes : {data.shape[0]}")

Nombre de lignes après suppression des doublons : 23478
Nombre de lignes après suppression des valeurs manquantes : 23478


In [10]:
# Nettoyage des textes : conversion en minuscules et suppression de la ponctuation
def clean_text(text):
    text = text.lower()  # Convertir en minuscules
    text = re.sub(r'[^\w\s]', '', text)  # Supprimer la ponctuation
    return text

# Application du nettoyage
data["clean_text"] = data["text"].apply(clean_text)

# Vérification après nettoyage
print(data[["text", "clean_text"]].head())

                                                text  \
0  Donald Trump just couldn t wish all Americans ...   
1  House Intelligence Committee Chairman Devin Nu...   
2  On Friday, it was revealed that former Milwauk...   
3  On Christmas day, Donald Trump announced that ...   
4  Pope Francis used his annual Christmas Day mes...   

                                          clean_text  
0  donald trump just couldn t wish all americans ...  
1  house intelligence committee chairman devin nu...  
2  on friday it was revealed that former milwauke...  
3  on christmas day donald trump announced that h...  
4  pope francis used his annual christmas day mes...  


In [11]:

# Vectorisation avec TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data["clean_text"])
y = data["Label"]  # Vérifie que "Label" existe bien dans le fichier CSV

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Préparation des données terminée avec succès !")

Préparation des données terminée avec succès !


In [12]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialisation et entraînement du modèle
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred = rf_model.predict(X_test)
print(f"Précision du modèle Random Forest : {accuracy_score(y_test, y_pred):.2f}")

# Sauvegarde du modèle
import joblib
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


Précision du modèle Random Forest : 1.00


['tfidf_vectorizer.pkl']

In [13]:
#Reseau de neurones

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Définition du modèle neuronal
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')  # Sortie binaire
])

# Compilation du modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entraînement
model.fit(X_train.toarray(), y_train, epochs=5, batch_size=32, validation_data=(X_test.toarray(), y_test))

# Sauvegarde du modèle
model.save("fake_news_nn_model.h5")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 31ms/step - accuracy: 0.9919 - loss: 0.0424 - val_accuracy: 1.0000 - val_loss: 1.2425e-05
Epoch 2/5
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - accuracy: 1.0000 - loss: 8.9789e-06 - val_accuracy: 1.0000 - val_loss: 2.4031e-06
Epoch 3/5
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 28ms/step - accuracy: 1.0000 - loss: 2.3349e-06 - val_accuracy: 1.0000 - val_loss: 8.9729e-07
Epoch 4/5
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 1.0000 - loss: 9.8915e-07 - val_accuracy: 1.0000 - val_loss: 4.3002e-07
Epoch 5/5
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 1.0000 - loss: 4.7335e-07 - val_accuracy: 1.0000 - val_loss: 2.3423e-07




In [14]:
from sklearn.metrics import classification_report

print("Évaluation Random Forest :")
print(classification_report(y_test, y_pred))

Évaluation Random Forest :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696

    accuracy                           1.00      4696
   macro avg       1.00      1.00      1.00      4696
weighted avg       1.00      1.00      1.00      4696

