In [37]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from textblob import TextBlob
from joblib import Parallel, delayed
from imblearn.over_sampling import SMOTE
from collections import Counter
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib as plt

In [38]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/mathieu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mathieu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/mathieu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mathieu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mathieu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [39]:
fake_data = pd.read_csv('../data/Fake.csv')
true_data = pd.read_csv('../data/True.csv')

In [40]:
fake_data.info()
true_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


# Valeurs manquantes

In [41]:
missing_fake = fake_data.isnull().sum()
missing_true = true_data.isnull().sum()
missing_fake
missing_true

title      0
text       0
subject    0
date       0
dtype: int64

# Doublons

In [42]:
fake_data_cleaned = fake_data.drop_duplicates()
true_data_cleaned = true_data.drop_duplicates()

In [43]:
len(fake_data_cleaned)

23478

In [44]:
len(true_data_cleaned)

21211

# Conversion en datetime

In [45]:
fake_data_cleaned['date'] = pd.to_datetime(fake_data_cleaned['date'], errors='coerce')
true_data_cleaned['date'] = pd.to_datetime(true_data_cleaned['date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_data_cleaned['date'] = pd.to_datetime(fake_data_cleaned['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_data_cleaned['date'] = pd.to_datetime(true_data_cleaned['date'], errors='coerce')


# Ajout colonne label

In [46]:
fake_data_cleaned['label'] = 'Fake'
true_data_cleaned['label'] = 'True'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_data_cleaned['label'] = 'Fake'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_data_cleaned['label'] = 'True'


# Combinaison des deux jeux de données

In [47]:
combined_data = pd.concat([fake_data_cleaned, true_data_cleaned], ignore_index=True)

In [48]:
combined_data

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,Fake
...,...,...,...,...,...
44684,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,2017-08-22,True
44685,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,2017-08-22,True
44686,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,2017-08-22,True
44687,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,2017-08-22,True


In [49]:
combined_data.to_csv('../data/combined_cleaned_data.csv', index=False)

# Lemmatization

In [50]:
lemmatizer = WordNetLemmatizer()

In [51]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


In [52]:
def normalize_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les caractères spéciaux, ponctuations, et guillemets simples et doubles
    text = re.sub(r'["\'“”‘’]', " ", text)
    text = re.sub(rf"[{string.punctuation}\"']", " ", text)
    # Tokenizer le texte
    word_tokens = word_tokenize(text)
    # Ajouter "u" aux stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('u')
    # Supprimer les stop words et lemmatiser
    filtered_text = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in word_tokens if word not in stop_words
    ]
    return filtered_text  # Retourner une liste de mots lemmatisés

In [53]:
combined_data['title_normalized'] = combined_data['title'].apply(normalize_text)
combined_data['text_normalized'] = combined_data['text'].apply(normalize_text)

In [54]:
combined_data[['title_normalized', 'text_normalized']]

Unnamed: 0,title_normalized,text_normalized
0,"[donald, trump, sends, embarrass, new, year, e...","[donald, trump, wish, american, happy, new, ye..."
1,"[drunk, bragging, trump, staffer, start, russi...","[house, intelligence, committee, chairman, dev..."
2,"[sheriff, david, clarke, becomes, internet, jo...","[friday, reveal, former, milwaukee, sheriff, d..."
3,"[trump, obsess, even, obama, name, cod, websit...","[christmas, day, donald, trump, announce, woul..."
4,"[pope, francis, call, donald, trump, christmas...","[pope, francis, use, annual, christmas, day, m..."
...,...,...
44684,"[fully, commit, nato, back, new, approach, afg...","[brussels, reuters, nato, ally, tuesday, welco..."
44685,"[lexisnexis, withdrew, two, product, chinese, ...","[london, reuters, lexisnexis, provider, legal,..."
44686,"[minsk, cultural, hub, becomes, authority]","[minsk, reuters, shadow, disused, soviet, era,..."
44687,"[vatican, upbeat, possibility, pope, francis, ...","[moscow, reuters, vatican, secretary, state, c..."


# Fréquence

In [55]:
fdist_title = FreqDist()
fdist_text = FreqDist()

In [56]:
for words in combined_data['title_normalized']:
    fdist_title.update(words)

# Mettre à jour les fréquences avec les mots de chaque texte normalisé
for words in combined_data['text_normalized']:
    fdist_text.update(words)

In [57]:
print("Top 10 mots fréquents dans les titres :")
print(fdist_title.most_common(10))

Top 10 mots fréquents dans les titres :
[('trump', 14679), ('video', 8569), ('say', 4276), ('obama', 3179), ('hillary', 2278), ('house', 1994), ('watch', 1983), ('republican', 1834), ('clinton', 1812), ('new', 1779)]


In [58]:
print("Top 10 mots fréquents dans les textes :")
print(fdist_text.most_common(10))

Top 10 mots fréquents dans les textes :
[('say', 168226), ('trump', 133647), ('state', 62931), ('president', 56728), ('would', 54810), ('people', 41809), ('year', 41429), ('republican', 39592), ('make', 39054), ('one', 38985)]


# Colonnes pour les dates

In [59]:
combined_data['year'] = combined_data['date'].dt.year

# Extraction du mois
combined_data['month'] = combined_data['date'].dt.month

# Extraction du jour de la semaine (lundi = 0, dimanche = 6)
combined_data['day_of_week'] = combined_data['date'].dt.dayofweek

# Extraction du jour du mois
combined_data['day_of_month'] = combined_data['date'].dt.day

# Extraction de l'heure (si disponible)
if combined_data['date'].dt.hour.isna().sum() == 0:  # Vérifier si l'heure est présente
    combined_data['hour'] = combined_data['date'].dt.hour

# Extraction de la semaine de l'année
combined_data['week_of_year'] = combined_data['date'].dt.isocalendar().week

# Afficher un aperçu des nouvelles colonnes
print(combined_data[['date', 'year', 'month', 'day_of_week', 'day_of_month', 'week_of_year']].head())

        date    year  month  day_of_week  day_of_month  week_of_year
0 2017-12-31  2017.0   12.0          6.0          31.0            52
1 2017-12-31  2017.0   12.0          6.0          31.0            52
2 2017-12-30  2017.0   12.0          5.0          30.0            52
3 2017-12-29  2017.0   12.0          4.0          29.0            52
4 2017-12-25  2017.0   12.0          0.0          25.0            52


# Ne fonctionne pas (trop long)

In [None]:
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

In [None]:
num_cores = 4
combined_data['title_corrected'] = Parallel(n_jobs=num_cores)(delayed(correct_spelling)(text) for text in combined_data['title_normalized'])
combined_data['text_corrected'] = Parallel(n_jobs=num_cores)(delayed(correct_spelling)(text) for text in combined_data['text_normalized'])

In [None]:
print(combined_data[['title_corrected', 'text_corrected']].head())

# Export csv

In [64]:
combined_data.to_csv('../data/combined_data.csv', index=False)
