## Importations

In [44]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os
import torch

## Dataset Preprocessing

In [45]:
# Load the datasets

df_review = pd.read_csv('data/reviews83325.csv')
df_place = pd.read_csv('data/Tripadvisor.csv')
df_subcategorie = pd.read_csv('data/AttractionSubCategorie.csv')
df_subtype = pd.read_csv('data/AttractionSubType.csv')
df_cuisine = pd.read_csv('data/cuisine.csv')
df_restaurant_type = pd.read_csv('data/restaurantType.csv')

# Preprocess the data - Merging Reviews and Places
if not os.path.exists('data/review_place.csv'):

    # Select only english reviews
    df_review = df_review[df_review['langue'] == 'en'].copy()

    # Select only the first 100 reviews for each place to limit the dataset size
    df_review_100 = df_review.groupby('idplace').head(100)

    cols_eval = [
        'id', 
        'nom', 
        'typeR', # Level 1 Evaluation
        'restaurantTypeCuisine', # Level 2 Evaluation (Restaurants)
        'activiteSubCategorie', # Level 2 Evaluation (Attractions)
        'restaurantType',
        'activiteType'
    ]

    df_place = pd.read_csv('data/Tripadvisor.csv', usecols=cols_eval)
    df_review_place = pd.merge(df_review_100, df_place, left_on='idplace', right_on='id')
    # df_review_place = df_review_place.drop(columns=['id'])

    # Cleaning the merged dataset before saving it
    df_review_place = df_review_place.dropna(subset=['review'])
    df_review_place.to_csv('data/review_place.csv', index=False)
else:
    df_review_place = pd.read_csv('data/review_place.csv')

  df_review = pd.read_csv('data/reviews83325.csv')


## Dataset split between database (user requests) and results

In [46]:
import numpy as np

# Get unique place IDs
unique_places = df_review_place['idplace'].unique()

# Shuffling places to ensure random distribution between train and test sets
np.random.seed(42)
np.random.shuffle(unique_places)

# Divide the places into 2 sets: 50% for training and 50% for testing
split_idx = len(unique_places) // 2
train_ids = unique_places[:split_idx]
test_ids = unique_places[split_idx:]

print(f"Total number of place : {len(unique_places)}")
print(f"Places in databse (Train) : {len(train_ids)}")
print(f"Places used as requests (Test) : {len(test_ids)}")

# DataFrame creation
df_train = df_review_place[df_review_place['idplace'].isin(train_ids)].copy()
df_test = df_review_place[df_review_place['idplace'].isin(test_ids)].copy()

# Aggregation of reviews by place
train_grouped = df_train.groupby('idplace').agg({
    'review': lambda x: " ".join(x.astype(str)),
    'typeR': 'first',
    'restaurantTypeCuisine': 'first',
    'activiteSubCategorie': 'first',
    'nom': 'first',
    'activiteType': 'first'
}).reset_index()

test_grouped = df_test.groupby('idplace').agg({
    'review': lambda x: " ".join(x.astype(str)),
    'typeR': 'first',
    'restaurantTypeCuisine': 'first',
    'activiteSubCategorie': 'first',
    'nom': 'first',
    'activiteType': 'first'
}).reset_index()

print("Aggregation completed.")

Total number of place : 1835
Places in databse (Train) : 917
Places used as requests (Test) : 918
Aggregation completed.


## Baseline Setup : BM25

In [47]:
# Tokenization and BM25 Implementation
corpus_tokenized = [doc.lower().split(" ") for doc in train_grouped['review'].tolist()]

# BM25 model initialization
bm25 = BM25Okapi(corpus_tokenized)

# Research function using BM25
def recherche_bm25(query_text, top_n=10):
    # Query tokenization
    tokenized_query = query_text.lower().split(" ")
    
    # Similarity scoring using BM25
    doc_scores = bm25.get_scores(tokenized_query)
    
    # Get the indices of the top N scores
    top_indices = np.argsort(doc_scores)[-top_n:][::-1]
    
    # Return the top N results with their scores
    results = []
    for idx in top_indices:
        results.append({
            'idplace': train_grouped.iloc[idx]['idplace'],
            'nom': train_grouped.iloc[idx]['nom'],
            'score': doc_scores[idx],
            'typeR': train_grouped.iloc[idx]['typeR'],
            'restaurantTypeCuisine': train_grouped.iloc[idx]['restaurantTypeCuisine'],
            'activiteType': train_grouped.iloc[idx]['activiteType']
        })
    return results

# Test example : we take the reviews of the first place in the TEST set
test_query = test_grouped.iloc[0]['review']
print(f"Search for places : {test_grouped.iloc[0]['nom']}\nQuest : {test_query[:100]}...")

resultats = recherche_bm25(test_query)
for r in resultats[:3]:
    print(f"-> Recommended : {r['nom']} (Score: {r['score']:.2f})")

Search for places : Place des Vosges
Quest : Personally I think it is the most beautiful square of Paris. Well maintained and the area around it ...
-> Recommended : Le Marais  (Score: 12304.89)
-> Recommended : Maison de Victor Hugo (Score: 12203.08)
-> Recommended : Carette (Score: 11824.91)


## SBERT Encoding + Cosine Similarity

In [48]:
# Model loading
# 'all-MiniLM-L6-v2' is fast and very effective for similarity tasks
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2') 

# Embedding creation for the search database (Train)
print("Database encoding (Train)...")
train_sentences = train_grouped['review'].tolist()
train_embeddings = model.encode(train_sentences, show_progress_bar=True, convert_to_tensor=True)

# SBERT recommendation function
def recommander_sbert(query_text, top_n=10):
    # Query encoding (the reviews of the test place)
    query_embedding = model.encode([query_text], convert_to_tensor=True)
    
    # Cosine similarity is computed on CPU for compatibility with sklearn
    cos_scores = cosine_similarity(query_embedding.cpu(), train_embeddings.cpu())[0]
    
    # Get the indices of the top N scores
    top_indices = np.argsort(cos_scores)[-top_n:][::-1]
    
    results = []
    for idx in top_indices:
        results.append({
            'idplace': train_grouped.iloc[idx]['idplace'],
            'nom': train_grouped.iloc[idx]['nom'],
            'score': cos_scores[idx],
            'typeR': train_grouped.iloc[idx]['typeR'], # Useful for evaluation
            'restaurantTypeCuisine': train_grouped.iloc[idx]['restaurantTypeCuisine'],
            'activiteType': train_grouped.iloc[idx]['activiteType']
        })
    return results

# Test rapide
test_query = test_grouped.iloc[0]['review']
print(f"\Query for : {test_grouped.iloc[0]['nom']}")
recommandations = recommander_sbert(test_query, top_n=3)

for i, res in enumerate(recommandations):
    print(f"{i+1}. {res['nom']} - Score: {res['score']:.4f}")

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 653.31it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Database encoding (Train)...


Batches: 100%|██████████| 29/29 [00:14<00:00,  1.97it/s]

\Query for : Place des Vosges
1. Cathédrale Notre-Dame de Paris - Score: 0.7167
2. Maison de Victor Hugo - Score: 0.7025
3. Hotel de Nice - Score: 0.6735





## Evaluation Metrics

In [49]:
def calculate_ranking_error(test_row, recommendations, level=1):
    """
    Compute the ranking error for a single test place based on the first relevant match in the recommendations.
    level 1: matching on typeR
    level 2: matching on cuisine ou activiteSubCategorie
    """
    for rank, rec in enumerate(recommendations):
        match = False
        
        if level == 1:
            # Simple matching on the type (H, R, A, AP)
            if rec['typeR'] == test_row['typeR']:
                match = True
        
        elif level == 2:
            print("Rec", rec)
            # Matching on subCategorie (Cuisine or Activity)
            query_cat = str(test_row['restaurantTypeCuisine']) + " " + str(test_row['activiteSubCategorie'])
            rec_cat = str(rec['restaurantTypeCuisine']) + " " + str(rec['activiteType'])
            
            # Cleaning and splitting into sets for comparison
            q_set = set(query_cat.lower().replace(',', ' ').split())
            r_set = set(rec_cat.lower().replace(',', ' ').split())
            
            if not q_set.isdisjoint(r_set) and 'nan' not in q_set:
                match = True
                
        if match:
            return rank  # Return the rank of the first relevant match
            
    return len(recommendations) # If no match found in the top N, we consider the error as N (worst case)

# --- BOUCLE D'ÉVALUATION GLOBALE ---

erreurs_bm25_l1 = []
erreurs_sbert_l1 = []

print("Evaluating on sample...")

# We are testing on a sample to go faster, or the whole set
sample_test = test_grouped.sample(min(100, len(test_grouped))) 

for _, row in sample_test.iterrows():
    # Get recommendations for the current test place
    recs_bm25 = recherche_bm25(row['review'], top_n=50)
    # For SBERT, we reuse the typeR/cuisine columns added in the previous step
    recs_sbert = recommander_sbert(row['review'], top_n=50)
    
    # Compute Level 2 errors
    erreurs_bm25_l1.append(calculate_ranking_error(row, recs_bm25, level=2))
    erreurs_sbert_l1.append(calculate_ranking_error(row, recs_sbert, level=2))

print(f"\n--- RESULT LEVEL 2 ERROR ---")
print(f"Mean Error BM25 : {np.mean(erreurs_bm25_l1):.2f}")
print(f"Mean Error SBERT : {np.mean(erreurs_sbert_l1):.2f}")

Evaluating on sample...
Rec {'idplace': np.int64(1724980), 'nom': 'Suan Thai', 'score': np.float64(699.2299283358575), 'typeR': 'R', 'restaurantTypeCuisine': '10659,10660,10665,10697', 'activiteType': None}
Rec {'idplace': np.int64(718063), 'nom': 'Isami Zushi', 'score': np.float32(0.6827014), 'typeR': 'R', 'restaurantTypeCuisine': '5473,10659,10653', 'activiteType': None}
Rec {'idplace': np.int64(11482945), 'nom': 'Accès prioritaire au sommet de la tour Eiffel et hôte', 'score': np.float64(438.26307904674644), 'typeR': 'AP', 'restaurantTypeCuisine': None, 'activiteType': 'attractionproduct'}
Rec {'idplace': np.int64(12467083), 'nom': 'Billet coupe-file\xa0: excursion à la tour Eiffel et accès au sommet par l’ascenseur ', 'score': np.float32(0.6855728), 'typeR': 'AP', 'restaurantTypeCuisine': None, 'activiteType': 'attractionproduct'}
Rec {'idplace': np.int64(11473941), 'nom': 'Tour Eiffel, Louvre, Notre Dame: Paris à vélo en petit groupe', 'score': np.float64(1315.1615847452838), 'typ