In [4]:
import os
import requests
import json
import csv
from datetime import datetime, timedelta
import time
from random import uniform, randint
from urllib.parse import urlparse, parse_qs
from pathlib import Path

# Query-parameter names in the order generate_url() writes them when it
# rebuilds a hotel-page URL; parameters that are not listed here are appended
# after these, in the order they were parsed.
QUERY_ORDER = [
    "aid",
    "label",
    "sid",
    "age",
    "checkin",
    "checkout",
    "dest_id",
    "dest_type",
    "dist",
    "group_children",
    "hapos",
    "hpos",
    "no_rooms",
    "req_adults",
    "req_age",
    "req_children",
    "room1",
    "sb_price_type",
    "soh",
    "sr_order",
    "srepoch",
    "srpvid",
    "type",
    "ucfs"
]

# Root folder under which save_destination() writes its JSON file.
# os.environ.get returns None when the DESTINATION_PATH variable is not set.
DESTINATION_PATH = os.environ.get('DESTINATION_PATH')

In [9]:
def init_url_madrid(method="get", timeout=30) -> dict:
    """Call the stations API and return its decoded JSON body.

    The endpoint and bearer token are read from the ``API_URL_MADRID`` and
    ``API_URL_MADRID_TOKEN`` environment variables. According to the original
    notes these point at the DEV API; edit the ``.env`` file to fetch from
    PROD instead.

    Args:
        method: HTTP verb to use, case-insensitive (default ``"get"``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The JSON payload of the response, as returned by ``response.json()``.

    Raises:
        ValueError: If the verb is not a supported HTTP method, or if one of
            the two environment variables is missing or empty.
        Exception: If the API answers with a status code other than 200.
    """
    verb = method.lower()
    # Only dispatch to real HTTP helpers; getattr() on an arbitrary string
    # could otherwise resolve to any attribute of the requests module.
    if verb not in {"get", "post", "put", "patch", "delete", "head", "options"}:
        raise ValueError(f"Unsupported HTTP method: {method!r}")

    api_url = os.getenv("API_URL_MADRID")
    api_token = os.getenv("API_URL_MADRID_TOKEN")
    # Fail early with a clear message instead of requesting the URL "None"
    # or sending the header "Bearer None".
    if not api_url:
        raise ValueError("Environment variable API_URL_MADRID is not set")
    if not api_token:
        raise ValueError("Environment variable API_URL_MADRID_TOKEN is not set")

    headers = {'Accept': 'application/json', 'Authorization': f"Bearer {api_token}", "Content-Type": "application/json"}

    response = getattr(requests, verb)(api_url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"API request failed with status code {response.status_code}: {response.text}")


In [10]:
def save_to_json(data: dict, filename: str) -> None:
    """Write ``data`` as pretty-printed UTF-8 JSON to ``stations/<filename>.json``.

    The target folder is created when it does not exist yet, so the function
    also works on a fresh checkout. An existing file is overwritten.

    Args:
        data: JSON-serialisable object to store.
        filename: File name without the ``.json`` extension, relative to the
            ``stations`` folder of the current working directory.
    """
    target = Path("stations") / f"{filename}.json"
    # Without this, open() raises FileNotFoundError on the first run.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented names readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=4)

In [5]:
def open_json(filename: str = "montreal_urls") -> dict:
    """Read ``stations/<filename>.json`` and return its parsed content.

    Args:
        filename: File name without the ``.json`` extension, looked up in the
            ``stations`` folder of the current working directory.

    Returns:
        The object decoded from the JSON file.
    """
    json_path = f"stations/{filename}.json"
    with open(json_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
# Fetch the station/URL structure from the API. init_url_madrid() reads the
# endpoint and token from API_URL_MADRID / API_URL_MADRID_TOKEN.
# NOTE(review): the helper is named for Madrid while the file below is named
# for Montreal; confirm which dataset the configured API actually returns.
data = init_url_madrid()

# print(data)

# Persist the payload as stations/montreal_urls.json, which is also the file
# open_json() loads by default.
save_to_json(data, "montreal_urls")



In [6]:
# Reload the saved structure from stations/montreal_urls.json (open_json's
# default file name), so the cells below do not need to call the API again.
madrid_url_structure = open_json()

In [7]:
def get_page_type(url:str) -> str:
    """Classify a URL as a single-hotel page or a listing page.

    Returns ``'hotel'`` when the URL contains the ``/hotel/`` path segment,
    and ``'list'`` for every other URL.
    """
    return 'hotel' if '/hotel/' in url else 'list'

In [8]:
# Disabled variant that parsed start_date / end_date from existing strings:
# start_date = datetime.strptime(start_date, "%d/%m/%Y")
# end_date = datetime.strptime(end_date, "%d/%m/%Y")
# Temporary hard-coded values, used for initialisation for the time being.
start_date = datetime.strptime("12/01/2026", "%d/%m/%Y") # first scraping date; per the original note this is always a MONDAY
end_date = datetime.strptime("31/05/2026", "%d/%m/%Y") # last scraping date. Original note (translated, meaning uncertain): the 27/10/2025 -> 11/11/2025 range went to ".11" and runs there; TODO confirm what ".11" refers to
freq = 1 # Nicolas said we only scrape the 1-day booking frequency

def normalize_url_params(url:str, start:str, end:str) -> str:
    """Return ``url`` with the query parameters the scraper requires.

    Parameters already present in the URL are kept untouched. The following
    are appended only when they are missing:

    * ``checkin`` = ``start``
    * ``checkout`` = ``end``
    * ``selected_currency`` = ``EUR``
    * ``lang`` = ``es`` (switched to Spanish on 12/05/2025 for Antequera)

    A URL that already carries ``selected_currency`` (for example the
    Montreal URLs, which use ``CAD``) keeps its own value. The result always
    contains a single ``?`` separator and no duplicated parameter.

    Args:
        url: Source URL, with or without a query string.
        start: Check-in date, already formatted for the URL.
        end: Check-out date, already formatted for the URL.

    Returns:
        The normalised URL.
    """
    print(url)

    # 24/10/2025: some stored URLs contain "??". Collapse the run into one
    # separator rather than deleting it, which would glue the first
    # parameter onto the path.
    while "??" in url:
        url = url.replace("??", "?")

    parts = urlparse(url)
    existing = parse_qs(parts.query)

    defaults = (
        ("checkin", start),
        ("checkout", end),
        ("selected_currency", "EUR"),
        ("lang", "es"),
    )
    additions = [f"{key}={value}" for key, value in defaults if key not in existing]

    # Keep the original query text verbatim (its encoding is preserved) and
    # append only the missing parameters.
    current_query = parts.query.strip("&")
    pieces = ([current_query] if current_query else []) + additions
    return parts._replace(query="&".join(pieces)).geturl()

In [9]:
def generate_url(stations_url: dict) -> list:
    """Build the dated URLs of every station between ``start_date`` and ``end_date``.

    One entry is produced per station and per check-in day. The check-in
    runs from ``start_date`` to ``end_date`` inclusive and the check-out is
    always ``freq`` days after the check-in. ``start_date``, ``end_date`` and
    ``freq`` are the module-level settings of this notebook.

    Args:
        stations_url: Mapping of station name to a dict holding at least the
            ``'url'`` and ``'id'`` keys.

    Returns:
        A list of ``{"name": ..., "id": ..., "url": ...}`` dicts, grouped by
        check-in date. The full record is kept because the id/url link is
        needed to match results afterwards.

    Raises:
        ValueError: If ``freq`` is not 1, 3 or 7. The function used to
            return ``None`` silently in that case, which only failed later
            when the result was saved.
    """
    if freq not in (1, 3, 7):
        raise ValueError(f"Unsupported booking frequency {freq!r}: expected 1, 3 or 7")

    # NOTE(review): one-second pause kept from the original; its purpose is
    # not visible in this notebook.
    time.sleep(1)

    stay_length = timedelta(days=freq)
    date_space = int((end_date - start_date).days) + 1
    correct_dest_url = []

    for day_offset in range(date_space):
        checkin = start_date + timedelta(days=day_offset)
        checkout = checkin + stay_length
        checkin_text = checkin.strftime("%Y-%m-%d")
        checkout_text = checkout.strftime("%Y-%m-%d")

        for name, details in stations_url.items():
            url = normalize_url_params(details['url'], checkin_text, checkout_text)
            if get_page_type(details['url']) == 'hotel':
                url = _reorder_hotel_query(url)
            correct_dest_url.append({
                "name": name,
                "id": details['id'],
                "url": url
            })

    return correct_dest_url


def _reorder_hotel_query(url: str) -> str:
    """Rewrite a hotel URL so its parameters follow ``QUERY_ORDER``, then any others."""
    # Local import: urlencode is not part of the notebook's import cell.
    from urllib.parse import urlencode

    base_url = url.split('?')[0]
    # As before, only the text after the LAST '?' is treated as the query.
    parsed = parse_qs(url.split('?')[-1])

    ordered_keys = [key for key in QUERY_ORDER if parsed.get(key)]
    ordered_keys += [key for key in parsed if key not in QUERY_ORDER]

    # parse_qs percent-decodes the values; urlencode re-encodes them so that
    # spaces or reserved characters cannot corrupt the rebuilt URL. Only the
    # first value of a repeated parameter is kept, as before.
    query = urlencode([(key, parsed[key][0]) for key in ordered_keys])
    return f"{base_url}?{query}" if query else base_url

In [10]:
# Trial save, to check what the generated file looks like.
def save_destination(data:list) -> None:
    """Save the generated destination URLs to a JSON file.

    The file is written to
    ``<DESTINATION_PATH>/<start_date as dd_mm_YYYY>/booking_dest<freq>_chamartin.json``.
    When that file already exists the user is asked on the console whether
    to overwrite it; answering ``no`` leaves the existing file untouched.
    In both cases the number of entries found in the file on disk is printed
    at the end.

    Args:
        data: List of destination entries to store.

    Raises:
        ValueError: If the ``DESTINATION_PATH`` setting is empty or unset.
            Previously the file was silently written under a folder
            literally named ``None``.
    """
    print(" ==> saving destination")
    if not DESTINATION_PATH:
        raise ValueError("DESTINATION_PATH environment variable is not set")
    folder_path = f"{DESTINATION_PATH}/{start_date.strftime('%d_%m_%Y')}"

    print(folder_path)
    dest_name = f"{folder_path}/booking_dest{freq}_chamartin.json"
    print(dest_name)
    os.makedirs(folder_path, exist_ok=True)

    overwrite = True
    if Path(dest_name).exists():
        # dest_name already ends with ".json"; the old messages appended the
        # extension a second time.
        question = f"  ==> Destination with name {dest_name} already exist, do you want to overwrite this ? yes or no"
        print(question)
        response = input("  ==> your answer :")
        while response not in ['yes', 'no']:
            print(' ==> response unknown, please give correct answer!')
            print(question)
            response = input("  ==> your answer :")
        overwrite = response == 'yes'

    if overwrite:
        # Same indented layout whether the file is new or overwritten.
        with open(dest_name, "w", encoding="utf-8") as openfile:
            json.dump(data, openfile, indent=4)
    else:
        print(f'  ==> Destination {dest_name} kept')

    # Read back through a context manager so the handle is always closed.
    with open(dest_name, encoding="utf-8") as openfile:
        number_of_dest = len(json.load(openfile))

    print(f" ==> well done, {number_of_dest} destinations saved!")

In [11]:
# Build one entry per station and per check-in date from the loaded structure.
url_vita_paramétrage = generate_url(madrid_url_structure)

# Write the generated entries to the destination JSON file.
save_destination(url_vita_paramétrage)


https://www.booking.com/hotel/ca/montreal-1228-sherbrooke.html?selected_currency=CAD
https://www.booking.com/hotel/ca/four-seasons-montreal.html?selected_currency=CAD
https://www.booking.com/hotel/ca/fairmont-the-queen-elizabeth.html?selected_currency=CAD
https://www.booking.com/hotel/ca/hotel-bonaventure-montreal.fr.html?selected_currency=CAD
https://www.booking.com/hotel/ca/le-centre-sheraton-montreal-quebec.fr.html?selected_currency=CAD
https://www.booking.com/hotel/ca/monville.html?selected_currency=CAD
https://www.booking.com/hotel/ca/birks-montreal.html?selected_currency=CAD
https://www.booking.com/hotel/ca/le-mount-stephen.html?selected_currency=CAD
https://www.booking.com/hotel/ca/hotel-vogue-montreal-downtown.fr.html?selected_currency=CAD
https://www.booking.com/hotel/ca/montreal.fr.html?selected_currency=CAD
https://www.booking.com/hotel/ca/le-westin-montreal.html?selected_currency=CAD
https://www.booking.com/hotel/ca/w-montreal.html?selected_currency=CAD
https://www.booking.