In [None]:
# Projet : Détection de Fake News avec NLP 

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import string
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import swifter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, confusion_matrix, roc_curve, auc, 
                            RocCurveDisplay, ConfusionMatrixDisplay)
from tensorflow.keras.models import load_model
import shap
import lime
from lime import lime_tabular
import os
from gensim.models import Word2Vec


In [None]:
def load_data(fake_path, true_path):
    """Read the fake and true news CSV files and merge them into one shuffled frame.

    Args:
        fake_path: Path to the CSV holding the fake articles (labelled 0).
        true_path: Path to the CSV holding the true articles (labelled 1).

    Returns:
        A DataFrame with the true rows stacked above the fake rows, a
        ``label`` column added, fully shuffled with a fixed seed and
        re-indexed from 0.
    """
    # Same read order as before: fake file first, then true file.
    fake_articles = pd.read_csv(fake_path)
    true_articles = pd.read_csv(true_path)

    labelled_frames = [
        true_articles.assign(label=1),  # 1 = true news
        fake_articles.assign(label=0),  # 0 = fake news
    ]
    stacked = pd.concat(labelled_frames, axis=0)

    # frac=1 draws every row exactly once (a full shuffle); the fixed
    # random_state makes the order reproducible.
    shuffled = stacked.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

def analyze_data(df):
    """Print basic corpus statistics and return the most frequent words per class.

    Prints the class distribution and the mean article length (counted in
    whitespace-separated words) per label, then collects the top words of the
    fake (label 0) and true (label 1) articles through ``get_top_words``.

    Side effect: a ``text_length`` column is added to ``df``.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.

    Returns:
        Tuple ``(fake_words, true_words)``, each being whatever
        ``get_top_words`` returns for the corresponding subset.
    """
    class_distribution = df['label'].value_counts()
    print("Distribution des classes:")
    print(class_distribution)

    # str() keeps non-string cells (e.g. NaN) from breaking .split().
    df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))
    mean_length = df.groupby('label')['text_length'].mean()
    print("\nLongueur moyenne des textes (en mots):")
    print(mean_length)

    # Stop-word filtering happens inside get_top_words, so no stop-word set
    # needs to be built here.
    fake_words = get_top_words(df[df['label'] == 0]['text'])
    true_words = get_top_words(df[df['label'] == 1]['text'])

    return fake_words, true_words



def get_top_words(text_series, n=20):
    """Return the ``n`` most frequent alphabetic, non-stop-word tokens.

    Each document is split on whitespace; a token is counted (lower-cased)
    when it consists only of letters and is not an English NLTK stop word.

    Args:
        text_series: Iterable of documents (e.g. a pandas Series). Cells that
            are not strings (None, NaN, ...) are skipped instead of making
            the whole call fail.
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    stop_words = set(stopwords.words('english'))

    counts = Counter()
    # Count document by document: this avoids building one huge joined string
    # for the whole corpus and lets missing cells be skipped.
    for document in text_series:
        if not isinstance(document, str):
            continue
        for word in document.split():
            lowered = word.lower()
            if lowered not in stop_words and word.isalpha():
                counts[lowered] += 1

    return counts.most_common(n)



def donnee_manquante(df):
    """Report missing values per column, then drop duplicate articles in place.

    Two rows are duplicates when their ``text`` value repeats; the first
    occurrence is kept. ``df`` itself is modified and nothing is returned.

    Args:
        df: DataFrame with a ``text`` column.
    """
    missing_per_column = df.isnull().sum()
    print("Valeurs manquantes par colonne:")
    print(missing_per_column)

    rows_before = df.shape[0]
    print(f"Nombre d'articles avant suppression des doublons: {rows_before}")

    # In place on purpose: callers keep using the same frame afterwards.
    df.drop_duplicates(subset=['text'], keep='first', inplace=True)

    rows_after = df.shape[0]
    print(f"Nombre d'articles après suppression: {rows_after}")



def unified_text_processor(df, text_col='text'):
    """Clean, tokenize and featurize a text column, adding the results to ``df``.

    Each text is lower-cased, stripped of URLs, @mentions, #hashtags, ASCII
    punctuation and digits, tokenized with NLTK, filtered (stop words and
    tokens of two characters or fewer are dropped) and lemmatized. Missing
    cells (None/NaN) yield an empty token list instead of being stringified
    into a bogus "nan"/"none" token.

    Columns added to ``df`` (in place):
        tokens      -- list of cleaned, lemmatized tokens
        clean_text  -- the tokens joined with single spaces
        word_count  -- number of tokens
        char_count  -- length of ``clean_text``
        noun_count  -- number of tokens whose NLTK POS tag starts with 'NN'
        sentiment   -- TextBlob polarity of ``clean_text``

    Args:
        df: DataFrame holding the raw text.
        text_col: Name of the column to process.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    # NLP tools, created once for the whole frame.
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # The deletion table does not depend on the text, so build it only once
    # rather than for every row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean_and_tokenize(text):
        # str(NaN) is "nan" and str(None) is "None": both would survive every
        # filter below and pollute the vocabulary, so treat them as empty.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return []

        # Basic cleaning
        text = str(text).lower()
        text = re.sub(r'https?://\S+|www\.\S+|@\w+|#\w+', '', text)
        text = text.translate(punctuation_table)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        # Tokenization
        tokens = word_tokenize(text)

        # Filter first, then lemmatize what is left
        clean_tokens = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in stop_words and len(word) > 2
        ]

        return clean_tokens

    # Main pass
    df['tokens'] = df[text_col].swifter.apply(clean_and_tokenize)
    df['clean_text'] = df['tokens'].apply(' '.join)

    # Quantitative features
    df['word_count'] = df['tokens'].apply(len)
    df['char_count'] = df['clean_text'].apply(len)

    # Linguistic features (computed on the cleaned tokens)
    df['noun_count'] = df['tokens'].apply(
        lambda x: sum(1 for _, pos in nltk.pos_tag(x) if pos.startswith('NN'))
    )

    # Sentiment analysis
    df['sentiment'] = df['clean_text'].swifter.apply(
        lambda x: TextBlob(x).sentiment.polarity
    )

    return df

def get_final_df(fake_path="../dataSet/Fake.csv", true_path="../dataSet/True.csv"):
    """Run the whole data pipeline and return the processed DataFrame.

    Steps, in order: load and label the two CSV files, print exploratory
    statistics, report missing values and drop duplicate articles, then run
    the NLP processing that adds tokens and derived features.

    Args:
        fake_path: Path to the fake-news CSV. Defaults to the location the
            notebook has always used.
        true_path: Path to the true-news CSV. Same remark.

    Returns:
        The DataFrame returned by ``unified_text_processor``.
    """
    print("Chargement des données...")
    df = load_data(fake_path, true_path)
    print(df.head())

    # The exploratory statistics are computed before duplicates are removed.
    print("\nAnalyse des données...")
    fake_words, true_words = analyze_data(df)
    print("\nMots fréquents (Fake):", fake_words)
    print("\nMots fréquents (True):", true_words)

    # Works in place on df.
    donnee_manquante(df)

    # Full NLP processing
    print("\nTraitement NLP avancé...")
    final_df = unified_text_processor(df)
    print(final_df[['clean_text', 'word_count', 'noun_count', 'sentiment']].head())

    return final_df

In [None]:
# Keep the processed frame so it stays available to later cells, and print
# only a preview instead of dumping the entire DataFrame into the output.
final_df = get_final_df()
print(final_df.head())

In [None]:
def train_word2vec(combined_df, **w2v_overrides):
    """Train a Word2Vec model on the ``tokens`` column.

    Args:
        combined_df: DataFrame whose ``tokens`` column holds one list of
            tokens per document.
        **w2v_overrides: Optional keyword arguments forwarded to ``Word2Vec``.
            They replace or extend the defaults below, e.g.
            ``train_word2vec(df, workers=1, epochs=5)``.

    Returns:
        The ``Word2Vec`` instance built from the token lists.
    """
    sentences = combined_df['tokens'].tolist()

    # Defaults are the values the notebook has always trained with.
    # NOTE(review): hs=1 together with negative=10 asks gensim for both
    # hierarchical softmax and negative sampling — confirm that is intended.
    params = {
        'vector_size': 300,
        'window': 10,
        'min_count': 3,
        'negative': 10,
        'hs': 1,
        'sample': 1e-5,
        'workers': 8,
        'epochs': 20,
    }
    params.update(w2v_overrides)

    return Word2Vec(sentences, **params)

def create_embeddings(model, combined_df):
    """Add a mean-of-word-vectors embedding for every document.

    For each row, the vectors of the tokens known to ``model.wv`` are
    averaged. Documents without any known token get a zero vector.

    Side effect: a ``w2v_embedding`` column is added to ``combined_df``.

    Args:
        model: Trained model exposing ``wv`` (membership test, lookup by a
            list of words, ``vectors`` array) and ``vector_size``.
        combined_df: DataFrame whose ``tokens`` column holds one list of
            tokens per document.

    Returns:
        The same ``combined_df`` object, with the new column.
    """
    keyed_vectors = model.wv
    # Plain np.zeros would be float64; use the dtype of the trained vectors so
    # every entry of the column has the same dtype as the averaged ones.
    zero_vector = np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)

    def document_vector(tokens):
        known_words = [word for word in tokens if word in keyed_vectors]
        if not known_words:
            # Copy so that rows never share one mutable array.
            return zero_vector.copy()
        return np.mean(keyed_vectors[known_words], axis=0)

    combined_df['w2v_embedding'] = combined_df['tokens'].apply(document_vector)
    return combined_df



In [None]:
# 1. Build the cleaned, tokenized dataset (get_final_df is defined above in this notebook)
df = get_final_df()

# 2. Train the model
w2v_model = train_word2vec(df)

# 3. Create the embeddings
df_with_embeddings = create_embeddings(w2v_model, df)

# 4. Save — both output directories must exist before anything is written
#    into them, otherwise the save/to_pickle calls fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
os.makedirs("processed_data", exist_ok=True)
w2v_model.save("models/w2v_fake_news.model")
df_with_embeddings.to_pickle("processed_data/news_with_embeddings.pkl")

print("Modèle Word2Vec et embeddings créés avec succès!")