In [9]:
import pandas as pd
import re
import unicodedata
try:
    from langdetect import detect
except Exception:
    detect = None  # le code fonctionnera sans, mais 'lang' sera 'unk'

In [3]:
URL_RE = re.compile(r"https?://\S+|www\.\S+")
HTML_RE = re.compile(r"<[^>]+>")
MULTI_SPACE_RE = re.compile(r"\s+")
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b")
PHONE_RE = re.compile(r"\b(?:\+?\d[\s\-\(\)]*){7,}\b")
PUNCT_RUNS_RE = re.compile(r"([!?.,]){2,}")
EMOJI_RE = re.compile(
    "["                     # bloc emojis courants
    "\U0001F600-\U0001F64F" # emoticones
    "\U0001F300-\U0001F5FF" # symboles & pictos
    "\U0001F680-\U0001F6FF" # transport & cartes
    "\U0001F1E0-\U0001F1FF" # flags
    "]+", flags=re.UNICODE
)


In [4]:
def strip_accents(text: str) -> str:
    """Supprime les accents (utile pour certaines normalisations ou matching)."""
    text = unicodedata.normalize("NFD", text)
    text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
    return unicodedata.normalize("NFC", text)

In [5]:
def basic_clean(text: str, to_lower: bool = False, keep_accents: bool = True) -> str:
    """Nettoyage l√©ger conservant le sens (safe pour mod√®les transformers)."""
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Retirer HTML & URLs & emails & num√©ros de tel
    text = HTML_RE.sub(" ", text)
    text = URL_RE.sub(" ", text)
    text = EMAIL_RE.sub(" ", text)
    # ‚ö†Ô∏è Optionnel : retirer N¬∞ de t√©l√©phone (peut √™tre pertinent de garder si utile)
    text = PHONE_RE.sub(" ", text)

    # Unifier apostrophes/quotes exotiques
    text = text.replace("‚Äô", "'").replace("‚Äú", '"').replace("‚Äù", '"')

    # R√©duire les r√©p√©titions de ponctuation "!!!", "???" ‚Üí "!"
    text = PUNCT_RUNS_RE.sub(lambda m: m.group(1), text)

    # Normaliser espaces
    text = MULTI_SPACE_RE.sub(" ", text).strip()

    # Accents : garder par d√©faut (meilleur pour FR/AR/ES). Sinon enlever.
    if not keep_accents:
        text = strip_accents(text)

    # minuscules (souvent inutile pour mod√®les uncased, mais utile si r√®gles)
    if to_lower:
        text = text.lower()

    return text

In [6]:
def has_emoji(text: str) -> int:
    return int(bool(EMOJI_RE.search(text)))

def safe_lang_detect(text: str) -> str:
    if detect is None or not text:
        return "unk"
    try:
        return detect(text)
    except Exception:
        return "unk"

def truncate_tokens(text: str, max_words: int = 320) -> str:
    """Troncature simple par mots (rapide et efficace pour scoring)."""
    toks = text.split()
    if len(toks) <= max_words:
        return text
    return " ".join(toks[:max_words])

In [7]:
def preprocess_reviews(df: pd.DataFrame,
                       title_col: str = "review title",
                       body_col: str = "review",
                       keep_accents: bool = True,
                       to_lower: bool = False,
                       detect_language: bool = True,
                       max_words: int = 1575) -> pd.DataFrame:
    """
    - Fusionne titre + avis en 'full_review_raw'
    - Nettoie ‚Üí 'full_review_clean'
    - (option) d√©tecte la langue ‚Üí 'lang'
    - Tronque √† max_words
    - Ajoute features: n_chars, n_words, has_emoji, has_exclaim, has_question
    """
    df = df.copy()

    # 1) Fusion titre + avis
    df[title_col] = df[title_col].fillna("").astype(str)
    df[body_col]  = df[body_col].fillna("").astype(str)
    df["full_review_raw"] = (df[title_col].str.strip() + ". " + df[body_col].str.strip()).str.strip()
    df["full_review_raw"] = df["full_review_raw"].str.replace(r"^\.\s*", "", regex=True)  # si titre vide

    # 2) Nettoyage
    df["full_review_clean"] = df["full_review_raw"].apply(
        lambda x: basic_clean(x, to_lower=to_lower, keep_accents=keep_accents)
    )

    # 3) D√©tection de langue (optionnel)
    if detect_language:
        df["lang"] = df["full_review_clean"].apply(safe_lang_detect)
    else:
        df["lang"] = "unk"

    # 4) Tronquer pour scoring (acc√©l√®re sans trop perdre)
    df["full_review_trunc"] = df["full_review_clean"].apply(lambda x: truncate_tokens(x, max_words=max_words))

    # 5) Features utiles pour l‚Äôanalyse
    df["n_chars"] = df["full_review_trunc"].str.len()
    df["n_words"] = df["full_review_trunc"].str.split().apply(len)
    df["has_emoji"] = df["full_review_raw"].apply(has_emoji)
    df["has_exclaim"] = df["full_review_raw"].str.contains("!").astype(int)
    df["has_question"] = df["full_review_raw"].str.contains(r"\?").astype(int)

    # 6) D√©dupliquer (m√™me h√¥tel + m√™me texte)
    key_cols = []
    for c in ["city", "category", "place name", "plateforme"]:
        if c in df.columns:
            key_cols.append(c)
    key_cols.append("full_review_clean")
    df = df.drop_duplicates(subset=key_cols, keep="first").reset_index(drop=True)

    return df

In [16]:
df=pd.read_csv("./data/Tripadvisor_reviews.csv")

In [8]:
df_pp = preprocess_reviews(df,
                           title_col="review title",
                           body_col="review",
                           keep_accents=True,   # garder les accents (recommand√© pour FR/ES)
                           to_lower=False,      # les mod√®les Transformers g√®rent la casse
                           detect_language=True,
                           max_words=1575)

In [26]:
data=df_pp[["city","category","place name","review title","review","plateforme","lang","full_review_trunc"]]
data.to_csv("data.csv",index=False)

In [14]:
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
HTML_RE  = re.compile(r"<[^>]+>")
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b")
PHONE_RE = re.compile(r"\b(?:\+?\d[\s\-\(\)]*){7,}\b")
MULTI_WS = re.compile(r"\s+")
PUNCT_RUNS = re.compile(r"([!?.,]){2,}")
EMOJI_RE = re.compile("["                               # blocs emojis
                      "\U0001F600-\U0001F64F"
                      "\U0001F300-\U0001F5FF"
                      "\U0001F680-\U0001F6FF"
                      "\U0001F1E0-\U0001F1FF"
                      "]+", flags=re.UNICODE)

In [15]:
def basic_clean(text: str) -> str:
    """Nettoyage safe pour mod√®les (conserve le sens)."""
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    text = HTML_RE.sub(" ", text)
    text = URL_RE.sub(" ", text)
    text = EMAIL_RE.sub(" ", text)
    text = PHONE_RE.sub(" ", text)
    text = text.replace("‚Äô","'").replace("‚Äú",'"').replace("‚Äù",'"').replace("¬´","\"").replace("¬ª","\"")
    text = EMOJI_RE.sub(" ", text)
    text = PUNCT_RUNS.sub(lambda m: m.group(1), text)
    text = MULTI_WS.sub(" ", text).strip()
    return text

In [16]:
def safe_lang_detect(text: str) -> str:
    try:
        return detect(text) if text else "unk"
    except Exception:
        return "unk"

In [17]:
def prepare_for_scoring(df: pd.DataFrame,
                        text_col: str = "review text",
                        title_col: str | None = None) -> pd.DataFrame:
    """
    Cr√©e:
      - full_review_clean : (title + text) nettoy√©
      - lang : code langue d√©tect√©
    """
    df = df.copy()
    # Fusion optionnelle titre + avis
    if title_col and title_col in df.columns:
        df["__raw__"] = (df[title_col].fillna("").astype(str).str.strip() + " "
                         + df[text_col].fillna("").astype(str).str.strip()).str.strip()
    else:
        df["__raw__"] = df[text_col].fillna("").astype(str).str.strip()

    df["full_review_clean"] = df["__raw__"].apply(basic_clean)
    df["lang"] = df["full_review_clean"].apply(safe_lang_detect)
    df = df.drop(columns=["__raw__"])
    return df

In [11]:
df1=pd.read_csv("./data/Google_reviews.csv")

In [19]:
df_pp1= prepare_for_scoring(df1, text_col="review text", title_col=None)

In [25]:
def longue(text):
    text=str(text)
    return len(text.split(' '))

In [38]:
def stander_rat(text):
    text=str(text)
    r=text.split('/')[0]
    r=float(r)
    return r

In [5]:
df=pd.read_csv("./data/Booking_reviews.csv")
df.head(3)

Unnamed: 0,city,place name,name,note,date,review title,positive_text,negative_text,plateforme
0,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Estelle,5.0/5,21/08/2025,Exceptionnel,Ce nouveau riad d√®s qu' on rentre on se sent...,Tout etait bien,Booking
1,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Catherine,5.0/5,21/08/2025,"On revient vite, c √©tait top üòÉ . Merciüôè",Un lieu authentique pour s'immerger dans l'am...,,Booking
2,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Estelle,5.0/5,21/08/2025,Exceptionnel,"Cet havre de paix , \n.Un super accueil , ...",Tout √©tait super quartier kasbah que j apprec...,Booking


In [6]:
df.isnull().sum()

city                0
place name          0
name                0
note                0
date              127
review title        1
positive_text     154
negative_text    1812
plateforme          0
dtype: int64

In [10]:
# pip install pandas langdetect pyarrow

# ===== Regex utilitaires =====
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
HTML_RE  = re.compile(r"<[^>]+>")
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b")
PHONE_RE = re.compile(r"\b(?:\+?\d[\s\-\(\)]*){7,}\b")
MULTI_WS = re.compile(r"\s+")
PUNCT_RUNS = re.compile(r"([!?.,]){2,}")
EMOJI_RE = re.compile("[" "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF"
                      "\U0001F680-\U0001F6FF" "\U0001F1E0-\U0001F1FF" "]+")
NOTE_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*/\s*5")

def build_raw_text(row,
                   title_col="review title",
                   pos_col="positive_text",
                   neg_col="negative_text"):
    parts = []
    t = str(row.get(title_col, "") or "").strip()
    p = str(row.get(pos_col, "") or "").strip()
    n = str(row.get(neg_col, "") or "").strip()
    if t: parts.append(t)
    if p: parts.append(f"Positive: {p}")
    if n: parts.append(f"Negative: {n}")
    return " | ".join(parts).replace("\\n", " ").strip()

def basic_clean(text: str) -> str:
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    text = text.replace("\n", " ")
    text = HTML_RE.sub(" ", text)
    text = URL_RE.sub(" ", text)
    text = EMAIL_RE.sub(" ", text)
    text = PHONE_RE.sub(" ", text)
    text = (text.replace("‚Äô","'")
                .replace("‚Äú",'"').replace("‚Äù",'"')
                .replace("¬´","\"").replace("¬ª","\""))
    text = EMOJI_RE.sub(" ", text)
    text = PUNCT_RUNS.sub(lambda m: m.group(1), text)
    text = MULTI_WS.sub(" ", text).strip()
    return text

def safe_lang_detect(text: str) -> str:
    try:
        return detect(text) if text else "unk"
    except Exception:
        return "unk"

def truncate_words(text: str, max_words: int = 320) -> str:
    toks = text.split()
    return text if len(toks) <= max_words else " ".join(toks[:max_words])

def parse_rating_1to5(x):
    if pd.isna(x): return pd.NA
    s = str(x).strip()
    m = NOTE_RE.search(s)
    if m: s = m.group(1)
    s = s.replace(",", ".")
    try:
        v = float(s)
    except:
        return pd.NA
    if 0.0 <= v <= 5.0: return v
    if 0.0 <= v <= 1.0: return v * 5.0
    return pd.NA

def preprocess_booking_like(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # 1) Construire le texte brut (titre + positif + n√©gatif)
    df["full_review_raw"] = df.apply(build_raw_text, axis=1)

    # 2) Nettoyage + troncature pour mod√®le
    df["full_review_clean"] = df["full_review_raw"].apply(basic_clean)
    df["full_review_trunc"] = df["full_review_clean"].apply(lambda x: truncate_words(x, 320))

    # 3) Langue
    df["lang"] = df["full_review_clean"].apply(safe_lang_detect)

    # 4) Note "x/5" -> float 1..5
    if "note" in df.columns:
        df["rating_1to5"] = df["note"].apply(parse_rating_1to5)
    else:
        df["rating_1to5"] = pd.NA

    # 5) Date -> YYYY-MM-DD

    # 6) D√©duplication (m√™me lieu + m√™me texte)
    key_cols = [c for c in ["city","place name","plateforme"] if c in df.columns]
    key_cols.append("full_review_clean")
    df = df.drop_duplicates(subset=key_cols, keep="first").reset_index(drop=True)

    # 7) Ordonner les colonnes utiles
    first = [c for c in ["city","place name","name","plateforme","date","date_std","note","rating_1to5"] if c in df.columns]
    last  = ["full_review_raw","full_review_clean","full_review_trunc","lang"]
    rest  = [c for c in df.columns if c not in set(first+last)]
    return df[first + rest + last]

# ---------------------
# Exemple d‚Äôutilisation
# ---------------------
# df = pd.read_csv("booking_reviews.csv")
# df_pp = preprocess_booking_like(df)
# df_pp.to_parquet("booking_preprocessed.parquet", index=False)
# df_pp[["full_review_trunc","lang","rating_1to5"]].head()


In [12]:
df_pp = preprocess_booking_like(df)


In [27]:
df_pp.head(3)

Unnamed: 0,city,place name,name,plateforme,date,note,rating_1to5,review title,positive_text,negative_text,full_review_raw,full_review_clean,full_review_trunc,lang
0,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Estelle,Booking,21/08/2025,5.0/5,5.0,Exceptionnel,Ce nouveau riad d√®s qu' on rentre on se sent...,Tout etait bien,Exceptionnel | Positive: Ce nouveau riad d√®s ...,Exceptionnel | Positive: Ce nouveau riad d√®s q...,Exceptionnel | Positive: Ce nouveau riad d√®s q...,fr
1,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Catherine,Booking,21/08/2025,5.0/5,5.0,"On revient vite, c √©tait top üòÉ . Merciüôè",Un lieu authentique pour s'immerger dans l'am...,,"On revient vite, c √©tait top üòÉ . Merciüôè | Posi...","On revient vite, c √©tait top . Merci | Positiv...","On revient vite, c √©tait top . Merci | Positiv...",fr
2,Marrakech,Riad L'EncensOriental\nUne nouvelle fen√™tre va...,Estelle,Booking,21/08/2025,5.0/5,5.0,Exceptionnel,"Cet havre de paix , \n.Un super accueil , ...",Tout √©tait super quartier kasbah que j apprec...,"Exceptionnel | Positive: Cet havre de paix , ...","Exceptionnel | Positive: Cet havre de paix , ....","Exceptionnel | Positive: Cet havre de paix , ....",fr


In [20]:
data=df_pp[["city","place name","date","note","full_review_trunc","lang","plateforme"]]

In [28]:
df_pp["test"]=df_pp["full_review_trunc"].apply(longue)

In [32]:
data.to_csv("Booking_reviews.csv",index=False)