# Import des librairies

In [None]:
import pandas as pd
import requests
import json
import urllib.parse
from bs4 import BeautifulSoup
import time
import random

# Chargement du top 5 des villes

In [None]:
# Read the top-5 city table and keep the city names as a plain Python list
df_top_city = pd.read_csv('../city_data/df_top_5_city.csv')
cities = df_top_city.loc[:, 'city'].to_list()

# Récupérer le code HTML pour les 5 villes

In [41]:
# Browser-like headers sent with every HTTP request: a desktop Chrome
# user-agent string, with French listed first among the accepted languages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    ),
    "Accept": ",".join([
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "*/*;q=0.8",
    ]),
    "Accept-Language": ",".join([
        "fr-FR",
        "fr;q=0.9",
        "en-US;q=0.8",
        "en;q=0.7",
    ]),
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Cache-Control": "max-age=0",
}

In [None]:
# Download the Booking.com search-results page of every city and store the
# raw HTML on disk, so that the parsing step can work from local files.
for index, city in enumerate(cities):

    # Pause between two consecutive cities only: the random delay spaces the
    # requests out, and there is nothing to wait for after the last city.
    if index > 0:
        time.sleep(random.uniform(10, 15))

    # Build the URL: the city name (suffixed with ", France") is URL-encoded
    # into the "ss" query parameter.
    encoded_city = urllib.parse.quote_plus(city + ", France")
    url = f"https://www.booking.com/searchresults.html?ss={encoded_city}&order=bayesian_review_score&nflt=ht_id%3D204"

    # Fetch the page. requests applies no timeout by default, so an explicit
    # one is needed to keep an unresponsive server from blocking the loop
    # forever.
    response = requests.get(url, headers=headers, timeout=30)

    # Save the HTML under a name derived from the city (spaces replaced by
    # underscores, lower-cased).
    filename = f"booking_{city.replace(' ', '_').lower()}.html"
    with open("html_data/" + filename, 'w', encoding='utf-8') as file:
        file.write(response.text)

    print(f"HTML sauvegardé: {filename}")
    print(f"Status de la requête: {response.status_code}")

    # The body is written whatever the status; make it obvious when the saved
    # file comes from an error response (status >= 400) rather than a
    # successful one.
    if not response.ok:
        print(f"Attention: réponse HTTP {response.status_code} pour {city}, le fichier {filename} peut contenir une page d'erreur")

# Extraire les informations du code HTML

## Fonction

In [3]:
def extract_hotel_info(hotel):
    """
    Extract the main fields of a single hotel card.

    Args:
        hotel: Parsed HTML element of one hotel card. It must provide the
            BeautifulSoup search methods ``find`` and ``find_all``.

    Returns:
        dict: A dictionary with the keys ``name``, ``url``, ``description``
        and ``rating``. A value is left to ``None`` when the corresponding
        element is not found in the card.
    """

    info = {
        "name": None,
        "url": None,
        "description": None,
        "rating": None
    }

    # Hotel name
    name = hotel.find('div', {"data-testid": "title"})
    if name:
        info["name"] = name.get_text(strip=True)

    # Hotel URL: kept only when the link has a non-empty href
    link = hotel.find('a', {"data-testid": "title-link"})
    if link:
        href = link.get('href')
        if href:
            info['url'] = href

    # Hotel rating
    rating = hotel.find('div', {"data-testid": "review-score"})
    if rating:
        # Searching on the single class 'dff2e52086' matches a div whatever
        # other classes it carries (e.g. 'f63b14ab7a dff2e52086'), so one
        # lookup covers both class variants.
        rating_div = rating.find("div", class_='dff2e52086')
        if rating_div:
            info['rating'] = rating_div.get_text(strip=True)

    # Hotel description
    # Several divs share the class 'fff1944c52'; the description is taken to
    # be the first one whose text is longer than 50 characters and does not
    # contain the "Indiquer sur la carte" label.
    potential_desc = hotel.find_all('div', class_='fff1944c52')
    for desc in potential_desc:
        text = desc.get_text(strip=True)
        if len(text) > 50 and "Indiquer sur la carte" not in text:
            info['description'] = text
            break

    return info

## Extraction

In [None]:
# Accumulates one dictionary per extracted hotel, across all cities
all_hotels = []

# Go through every city and parse the HTML file saved for it
for city in cities:
    filename = f"booking_{city.replace(' ', '_').lower()}.html"
    with open("html_data/" + filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Each hotel is a div tagged data-testid="property-card"
    hotels = soup.find_all('div', {"data-testid": "property-card"})

    print(f"\n\n\nNombre d'hôtels trouvés pour {city}: {len(hotels)}")

    # Extract the information of the first 5 hotels only
    for i, hotel in enumerate(hotels[:5]):
        hotel_info = extract_hotel_info(hotel)
        hotel_info['city'] = city
        all_hotels.append(hotel_info)
        print(f"Hôtel {i+1}: {hotel_info['name']} - Note: {hotel_info['rating']}")

# Column order of the final table
colonnes = ["city", "name", "url", "description", "rating"]

# Build the dataframe with the columns fixed up front: when no hotel was
# extracted at all, a dataframe built from the empty list has no columns and
# selecting them afterwards would raise a KeyError.
df_hotels = pd.DataFrame(all_hotels, columns=colonnes)

# Save as CSV
df_hotels.to_csv('../hotel_data/booking_hotels.csv', index=False, encoding='utf-8')

# Combiner les datasets

In [None]:
# Attach the weather information of each city to its hotels.
# The city table is read with its first column as index and cut down to its
# first 5 rows (head() default); the original note describes these as the
# cities with the best weather, which assumes the CSV is already ranked that
# way - TODO confirm. The left join keeps every hotel row.
df_hotel_meteo = df_hotels.merge(
    pd.read_csv("../city_data/df_city.csv", index_col=0).head(),
    on='city',
    how='left',
)

# Save the combined dataframe
df_hotel_meteo.to_csv('../hotel_data/booking_hotels_meteo.csv', index=False, encoding='utf-8')