In [1]:
!pip install pandas google-play-scraper deep-translator emoji nltk spacy
!python -m nltk.downloader stopwords
!python -m spacy download en_core_web_sm


Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-play-scraper, emoji, deep-translator
Successfully installed deep-translator-1.11.4 emoji-2.14.1 google-play-scraper-1.2.7
[nltk_d

In [3]:
import logging
import re

import emoji
import nltk
import pandas as pd
import spacy
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

# Setup: fetch the NLTK stopword corpus, then load the language resources
# used by the cleaning function.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}  # English stopwords as a set for fast membership tests
nlp = spacy.load("en_core_web_sm")          # spaCy pipeline used for lemmatisation

# Application names to strip from the reviews
app_names = [
    'pubg',
    'cod',
    'mc5',
    'call of duty',
    'modern combat',
]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_text(text):
    """Translate a review to English and reduce it to a string of lemmas.

    Steps, in order: translate to English, strip emojis, lowercase, collapse
    whitespace, drop URLs, drop application names, drop digits, drop
    punctuation, drop NLTK stopwords, then lemmatise with spaCy (also
    discarding tokens spaCy flags as stopwords).

    Relies on the notebook-level ``app_names``, ``stop_words`` and ``nlp``.

    Args:
        text: Raw review text.

    Returns:
        The space-joined lemmas. An empty string is returned when ``text`` is
        not a non-blank string or when the translation call fails (the
        failure is logged as a warning instead of being silently discarded).
    """
    # Non-string input (e.g. the NaN pandas uses for an empty CSV cell) and
    # blank strings have nothing to clean.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Only the network-bound translation is best-effort; everything after it
    # is deterministic string processing and should surface real bugs.
    try:
        text = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Translation failed (%s: %s); review skipped", type(exc).__name__, exc
        )
        return ""
    if not isinstance(text, str):
        return ""

    # Replace emojis with a space so neighbouring words are not glued together.
    text = emoji.replace_emoji(text, replace=' ')

    # Lowercase first so the URL and app-name patterns are case-insensitive.
    text = text.lower()

    # Collapse newlines/tabs/repeated spaces so multi-word app names match.
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove web links ("https..." is already covered by the "http" branch).
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove application names as whole words. This must run BEFORE digits
    # are stripped (otherwise 'mc5' has already become 'mc'), and must use
    # word boundaries so that 'cod' does not eat into words such as 'code'.
    if app_names:
        escaped = sorted(
            (re.escape(name.lower()) for name in app_names),
            key=len,
            reverse=True,  # longest first so longer names win over shorter ones
        )
        text = re.sub(r"\b(?:" + "|".join(escaped) + r")\b", " ", text)

    # Remove digits.
    text = re.sub(r"\d+", "", text)

    # Replace punctuation with a space ("good,bad" -> "good bad"; "don't" ->
    # "don t", both of which are then caught by the stopword filter).
    text = re.sub(r'[^\w\s]', ' ', text)

    # Drop NLTK stopwords; split() also absorbs any leftover whitespace runs.
    words = [word for word in text.split() if word not in stop_words]

    # Lemmatise, additionally skipping tokens spaCy considers stopwords.
    doc = nlp(" ".join(words))
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    return " ".join(lemmas)


In [5]:
# Register `progress_apply` on pandas objects so the cleaning shows a progress bar.
tqdm.pandas()

# Load the raw reviews and add a cleaned version of each one.
df = pd.read_csv("/content/avis_google_play.csv")
df['clean_avis'] = df['avis'].progress_apply(clean_text)

# Keep only reviews with at least 3 words after cleaning, then renumber the rows.
df = df.loc[df['clean_avis'].str.split().str.len().ge(3)].reset_index(drop=True)

# Persist the cleaned dataset.
df.to_csv("avis_google_play_clean.csv", index=False, encoding='utf-8')
print("✅ Fichier nettoyé sauvegardé sous : avis_google_play_clean.csv")


  0%|          | 0/3000 [00:00<?, ?it/s]

✅ Fichier nettoyé sauvegardé sous : avis_google_play_clean.csv
