# Notebook for preprocessing article data

## 1) Import libraries and download the required NLTK data

In [8]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd

In [2]:
# Fetch the NLTK resources used by the preprocessing functions below.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up in newer NLTK
# releases (3.9+); 'punkt' is kept so that older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')  # stop-word lists read by stopwords.words(...)
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nathanwandji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nathanwandji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nathanwandji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 2) Processing functions

### a) Function to normalize the text

In [3]:
# Translation table that deletes every character in string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def normalize_text(text):
    """Lower-case ``text`` and remove ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float
        ``NaN`` that pandas uses for empty CSV cells) is treated as missing.

    Returns
    -------
    str
        The lower-cased text without the characters in ``string.punctuation``,
        or ``''`` when ``text`` is not a string.

    Notes
    -----
    Only the ASCII characters in ``string.punctuation`` are removed; other
    punctuation such as the en dash in ``"1976–1981"`` is left untouched.
    """
    # Missing values are not strings and have no .lower(); map them to empty
    # text so that a single empty cell does not abort the whole pipeline.
    if not isinstance(text, str):
        return ''
    return text.lower().translate(_PUNCTUATION_TABLE)

### b) Function to tokenize the text

In [4]:
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

### c) Function to remove stop words

In [5]:
# Lazily-built cache of the default English stop-word set, so the NLTK corpus
# is read once instead of on every call.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(tokens, stop_words=None):
    """Drop stop words from a list of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Comparison is exact, so tokens are expected to be
        in the same case as the stop words (NLTK's English list is lower-case).
    stop_words : iterable of str, optional
        Custom stop words to remove. When omitted, NLTK's English stop-word
        list is used.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # Build the default set on first use only.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership keeps the filter linear in len(tokens).
        stop_words = frozenset(stop_words)
    return [token for token in tokens if token not in stop_words]

### d) Function to lemmatize the tokens

In [6]:
def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

### e) Overall processing function

In [7]:
def process_text_in_dataframe(df, column):
    """Run the text-preprocessing pipeline on one column of a DataFrame.

    The pipeline is: normalize -> tokenize -> remove stop words -> lemmatize
    -> join the tokens back into a single space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to process. It is not modified; the work is
        done on a copy so the caller's raw data stays intact and re-running
        the cell always starts from the same input.
    column : str
        Name of the text column to process.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` holds the normalized text and a
        new column ``f'{column}_processed'`` holds the fully processed text.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    processed_column = f'{column}_processed'

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    # Normalization (lower-casing and punctuation removal)
    result[column] = result[column].apply(normalize_text)

    # Tokenization, stop-word removal, lemmatization, then re-joining the
    # lemmatized tokens into a string, done in one pass per row.
    result[processed_column] = result[column].apply(
        lambda text: ' '.join(
            lemmatize_tokens(remove_stop_words(tokenize_text(text)))
        )
    )

    return result

In [12]:
# Load the articles data from the CSV file.
# NOTE(review): the file name is spelled 'wilkipedia' — confirm it matches the
# file on disk.
df_wiki_article = pd.read_csv(filepath_or_buffer='wilkipedia_articles.csv')

In [13]:
# Preview the first five rows of the loaded articles.
df_wiki_article.head(n=5)

Unnamed: 0,title,url,extract,publication_date,references
0,Animal,https://en.wikipedia.org/wiki/Animal,"Animals are multicellular, eukaryotic organism...",2024-01-29,['https://en.oxforddictionaries.com/definition...
1,The Animals,https://en.wikipedia.org/wiki/The_Animals,The Animals (also billed as Eric Burdon and th...,2024-02-03,['https://www.rollingstone.com/music/artists/t...
2,"Animals, Animals, Animals","https://en.wikipedia.org/wiki/Animals,_Animals...","Animals, Animals, Animals is a 1976–1981 educa...",2023-02-15,"['https://www.imdb.com/title/tt0231010/', 'htt..."
3,Fastest animals,https://en.wikipedia.org/wiki/Fastest_animals,This is a list of the fastest animals in the w...,2024-01-15,['https://www.sciencedaily.com/releases/2012/0...
4,Animal rights,https://en.wikipedia.org/wiki/Animal_rights,Animal rights is the philosophy according to w...,2024-01-20,['http://www.city-journal.org/html/10_3_urbani...


In [14]:
# Run the preprocessing pipeline on the "extract" column.
df_wiki_article_preprocessed = process_text_in_dataframe(
    df=df_wiki_article,
    column='extract',
)

In [15]:
# Preview the first five rows, including the new "extract_processed" column.
df_wiki_article_preprocessed.head(n=5)

Unnamed: 0,title,url,extract,publication_date,references,extract_processed
0,Animal,https://en.wikipedia.org/wiki/Animal,animals are multicellular eukaryotic organisms...,2024-01-29,['https://en.oxforddictionaries.com/definition...,animal multicellular eukaryotic organism biolo...
1,The Animals,https://en.wikipedia.org/wiki/The_Animals,the animals also billed as eric burdon and the...,2024-02-03,['https://www.rollingstone.com/music/artists/t...,animal also billed eric burdon animal english ...
2,"Animals, Animals, Animals","https://en.wikipedia.org/wiki/Animals,_Animals...",animals animals animals is a 1976–1981 educati...,2023-02-15,"['https://www.imdb.com/title/tt0231010/', 'htt...",animal animal animal 1976–1981 educational tel...
3,Fastest animals,https://en.wikipedia.org/wiki/Fastest_animals,this is a list of the fastest animals in the w...,2024-01-15,['https://www.sciencedaily.com/releases/2012/0...,list fastest animal world type animal
4,Animal rights,https://en.wikipedia.org/wiki/Animal_rights,animal rights is the philosophy according to w...,2024-01-20,['http://www.city-journal.org/html/10_3_urbani...,animal right philosophy according many sentien...


## 3) Feature extraction and indexing