In [1]:
# Importation des bibliothèques
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import Word
import nltk

# Téléchargement des ressources NLTK
nltk.download('stopwords')
nltk.download('punkt')

# Chargement des données
df = pd.read_csv('data/reviews.csv')
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/menphis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/menphis/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,product_id,user_id,rating,review_text
0,101,1,5,Great product! Really satisfied with the quali...
1,102,2,4,"Good product, but the delivery was late."
2,103,3,2,Not as expected. The product stopped working w...
3,104,4,3,Average quality. It does the job but nothing s...
4,105,5,1,Very disappointed. The product broke after jus...


In [2]:
# Aperçu des données
print("Aperçu des 5 premiers avis :")
df[['review_text']].head()

Aperçu des 5 premiers avis :


Unnamed: 0,review_text
0,Great product! Really satisfied with the quali...
1,"Good product, but the delivery was late."
2,Not as expected. The product stopped working w...
3,Average quality. It does the job but nothing s...
4,Very disappointed. The product broke after jus...


In [3]:
# Fonction de nettoyage de texte
def clean_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les caractères spéciaux et les chiffres
    text = re.sub(r'\d+', '', text)  # Enlever les chiffres
    text = re.sub(r'[^\w\s]', '', text)  # Enlever la ponctuation
    return text

# Application du nettoyage sur la colonne review_text
df['cleaned_text'] = df['review_text'].apply(clean_text)
df[['review_text', 'cleaned_text']].head()

Unnamed: 0,review_text,cleaned_text
0,Great product! Really satisfied with the quali...,great product really satisfied with the qualit...
1,"Good product, but the delivery was late.",good product but the delivery was late
2,Not as expected. The product stopped working w...,not as expected the product stopped working wi...
3,Average quality. It does the job but nothing s...,average quality it does the job but nothing sp...
4,Very disappointed. The product broke after jus...,very disappointed the product broke after just...


In [4]:
# Liste des mots vides en anglais
stop_words = set(stopwords.words('english'))

# Fonction de suppression des mots vides
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_words = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_words)

# Application de la suppression des mots vides
df['text_no_stopwords'] = df['cleaned_text'].apply(remove_stopwords)
df[['cleaned_text', 'text_no_stopwords']].head()

Unnamed: 0,cleaned_text,text_no_stopwords
0,great product really satisfied with the qualit...,great product really satisfied quality perform...
1,good product but the delivery was late,good product delivery late
2,not as expected the product stopped working wi...,expected product stopped working within week
3,average quality it does the job but nothing sp...,average quality job nothing special
4,very disappointed the product broke after just...,disappointed product broke one use


In [5]:
# Télécharger les ressources WordNet pour la lemmatisation
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Fonction de lemmatisation
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized_words)

# Application de la lemmatisation
df['lemmatized_text'] = df['text_no_stopwords'].apply(lemmatize_text)
df[['text_no_stopwords', 'lemmatized_text']].head()

[nltk_data] Downloading package wordnet to /Users/menphis/nltk_data...


Unnamed: 0,text_no_stopwords,lemmatized_text
0,great product really satisfied quality perform...,great product really satisfied quality perform...
1,good product delivery late,good product delivery late
2,expected product stopped working within week,expected product stopped working within week
3,average quality job nothing special,average quality job nothing special
4,disappointed product broke one use,disappointed product broke one use


In [6]:
# Bag of Words (BOW)
bow_vectorizer = CountVectorizer(max_features=1000)  # Limitez les caractéristiques pour simplifier
bow_features = bow_vectorizer.fit_transform(df['lemmatized_text'])
bow_df = pd.DataFrame(bow_features.toarray(), columns=bow_vectorizer.get_feature_names_out())
print("Aperçu des caractéristiques BOW :")
bow_df.head()

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(df['lemmatized_text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("Aperçu des caractéristiques TF-IDF :")
tfidf_df.head()

Aperçu des caractéristiques BOW :
Aperçu des caractéristiques TF-IDF :


Unnamed: 0,absolutely,anyone,average,better,bit,broke,buy,could,decent,definitely,...,stopped,terrible,use,way,week,well,within,working,worth,would
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.429784,0.0,0.0,0.0,0.429784,0.0,0.429784,0.429784,0.0,0.0
3,0.0,0.0,0.475988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.475988,0.0,0.0,0.0,0.0,...,0.0,0.0,0.475988,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Explications des Étapes de Prétraitement

1. **Nettoyage de texte** : Conversion en minuscules, suppression des caractères spéciaux et de la ponctuation pour normaliser les données textuelles.
2. **Suppression des mots vides** : Les mots vides sont des mots courants (par exemple "le", "et", "ou") qui n'ajoutent pas de valeur contextuelle.
3. **Lemmatisation** : Elle permet de réduire les mots à leur forme de base (ex. "running" devient "run"), unifiant ainsi les variations.
4. **Vectorisation BOW et TF-IDF** :
   - **Bag of Words (BOW)** : Représente chaque mot par sa fréquence d’apparition dans les textes.
   - **TF-IDF** : Prend en compte l’importance des mots en fonction de leur fréquence dans l’ensemble du corpus, ce qui permet de mieux peser les termes.

Les étapes ci-dessus aident à transformer les textes en données numériques exploitables par des modèles de machine learning.