In [12]:
# Starting page for the crawl; later cells pass this value to main(url).
url = 'https://beefast.coopcycle.org/fr/restaurants'

In [13]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin

def fetch_and_parse_url(url, timeout=30):
    """Download a page and return its body as text.

    Despite the name, nothing is parsed here: the function performs the HTTP
    GET and hands back the decoded response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string, or ``None`` when the request failed
        (connection error, timeout, or a 4XX/5XX status). Failures are
        reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here as well.
        print(f"Request failed: {e}")
        return None

def find_links(html_content, base_url):
    """Return the absolute URLs linked from every "restaurant-item" element.

    Args:
        html_content: HTML source of the page to scan.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        A list of absolute URLs, in document order. Duplicates are kept.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Walk each element carrying the "restaurant-item" class, then each <a>
    # inside it that has an href attribute.
    return [
        urljoin(base_url, anchor['href'])
        for container in parsed.find_all(class_="restaurant-item")
        for anchor in container.find_all('a', href=True)
    ]

def extract_json_ld(html_content):
    """Collect every JSON-LD document embedded in an HTML page.

    Looks at each ``<script type="application/ld+json">`` tag and decodes its
    content as JSON.

    Args:
        html_content: HTML source of the page to scan.

    Returns:
        A list with one decoded JSON value per script tag that held valid
        JSON. Empty tags are skipped; tags with malformed JSON are reported
        on stdout and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    json_lds = []
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if raw is None:
            # ``.string`` is None when the tag does not hold exactly one text
            # node (e.g. an empty tag); json.loads(None) would raise a
            # TypeError that the handler below does not catch.
            raw = script.get_text()
        if not raw.strip():
            continue
        try:
            json_lds.append(json.loads(raw))
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed: {e}")
    return json_lds

def main(url):
    """Crawl the listing page at ``url`` and print the JSON-LD of each linked page.

    The listing page is downloaded, every link returned by ``find_links`` is
    visited, and each JSON-LD document found on a visited page is
    pretty-printed to stdout. Pages that fail to download are skipped.

    Args:
        url: Address of the listing page to start from.
    """
    listing_html = fetch_and_parse_url(url)
    if not listing_html:
        return
    for link in find_links(listing_html, url):
        # Ignore links that resolve back to the listing page itself and
        # links ending in '#' (which are just anchors).
        points_nowhere = link == url or link.endswith('#')
        if points_nowhere:
            continue
        print(f"Processing {link}")
        page_html = fetch_and_parse_url(link)
        if not page_html:
            continue
        for document in extract_json_ld(page_html):
            print(json.dumps(document, indent=2))

# Run the crawl against the address stored in `url` by an earlier cell.
if __name__ == "__main__":
    main(url)


Processing https://beefast.coopcycle.org/fr/restaurant/30-mealk
{
  "@context": "http://schema.org",
  "@id": "/api/restaurants/30",
  "@type": "http://schema.org/Restaurant",
  "name": "Mealk",
  "address": {
    "@id": "/api/addresses/2920",
    "@type": "http://schema.org/PostalAddress",
    "geo": {
      "@type": "GeoCoordinates",
      "latitude": 49.875537,
      "longitude": 2.277711
    },
    "streetAddress": "14 Avenue Paul Claudel, 80000 Amiens, France",
    "telephone": "+33322724071",
    "name": "Mealk"
  },
  "openingHoursSpecification": [
    {
      "@type": "OpeningHoursSpecification",
      "opens": "10:30",
      "closes": "14:45",
      "dayOfWeek": [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday"
      ]
    },
    {
      "@type": "OpeningHoursSpecification",
      "opens": "18:00",
      "closes": "19:00",
      "dayOfWeek": [
        "Monday",
        "Tuesday",
        "Wednesday",
        "T

In [14]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from rdflib import Graph

def fetch_and_parse_url(url, timeout=30):
    """Download a page and return its body as text.

    Despite the name, nothing is parsed here: the function performs the HTTP
    GET and hands back the decoded response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string, or ``None`` when the request failed
        (connection error, timeout, or a 4XX/5XX status). Failures are
        reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # HTTPError on 4XX/5XX
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here as well.
        print(f"Request failed: {e}")
        return None

def find_links(html_content, base_url):
    """Return the absolute URLs linked from every "restaurant-item" element.

    Args:
        html_content: HTML source of the page to scan.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        A list of absolute URLs, in document order. Duplicates are kept.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Walk each element carrying the "restaurant-item" class, then each <a>
    # inside it that has an href attribute.
    return [
        urljoin(base_url, anchor['href'])
        for container in parsed.find_all(class_="restaurant-item")
        for anchor in container.find_all('a', href=True)
    ]

def extract_json_ld(html_content):
    """Collect every JSON-LD document embedded in an HTML page.

    Looks at each ``<script type="application/ld+json">`` tag and decodes its
    content as JSON.

    Args:
        html_content: HTML source of the page to scan.

    Returns:
        A list with one decoded JSON value per script tag that held valid
        JSON. Empty tags are skipped; tags with malformed JSON are reported
        on stdout and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    json_lds = []
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if raw is None:
            # ``.string`` is None when the tag does not hold exactly one text
            # node (e.g. an empty tag); json.loads(None) would raise a
            # TypeError that the handler below does not catch.
            raw = script.get_text()
        if not raw.strip():
            continue
        try:
            json_lds.append(json.loads(raw))
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed: {e}")
    return json_lds

def save_json_ld_to_ttl(json_lds, file_name):
    """Merge JSON-LD documents into one graph and write it out as Turtle.

    Args:
        json_lds: Iterable of decoded JSON-LD values; each one is re-encoded
            to a JSON string and parsed into the same graph.
        file_name: Destination path handed to the graph serializer.
    """
    graph = Graph()
    for document in json_lds:
        payload = json.dumps(document)
        graph.parse(data=payload, format='json-ld')
    graph.serialize(destination=file_name, format='turtle')

def main(url):
    """Crawl the listing page at ``url`` and save the linked pages' JSON-LD.

    Every link returned by ``find_links`` is visited (except links back to
    the listing page and links ending in '#'), the JSON-LD documents found
    on those pages are accumulated, and the result is written to
    "restaurant.ttl". Nothing is written when no document was collected.

    Args:
        url: Address of the listing page to start from.
    """
    all_json_lds = []
    listing_html = fetch_and_parse_url(url)
    if listing_html:
        for link in find_links(listing_html, url):
            # Only follow links that lead somewhere other than this page.
            if link != url and not link.endswith('#'):
                page_html = fetch_and_parse_url(link)
                if page_html:
                    all_json_lds += extract_json_ld(page_html)
    if all_json_lds:
        save_json_ld_to_ttl(all_json_lds, "restaurant.ttl")

# Run the crawl against the address stored in `url` by an earlier cell.
if __name__ == "__main__":
    main(url)


In [20]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from rdflib import Graph

def fetch_and_parse_url(url, timeout=30):
    """Download a page and return its body as text, logging the URL first.

    Despite the name, nothing is parsed here: the function performs the HTTP
    GET and hands back the decoded response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string, or ``None`` when the request failed
        (connection error, timeout, or a 4XX/5XX status). Failures are
        reported on stdout rather than raised.
    """
    try:
        print(f"Processing URL: {url}")  # Print the URL being processed
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # HTTPError on 4XX/5XX
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here as well.
        print(f"Request failed: {e}")
        return None

def find_links_with_class(html_content, base_url, class_name):
    """Return the absolute URLs linked from elements carrying ``class_name``.

    An element that is itself an ``<a>`` contributes its own ``href``; any
    other element contributes the ``href`` of every ``<a>`` inside it.
    Anchors without an ``href`` attribute are ignored in both cases.

    Args:
        html_content: HTML source of the page to scan.
        base_url: URL used to resolve relative ``href`` values.
        class_name: CSS class that selects the elements to inspect.

    Returns:
        A list of absolute URLs, in document order. Duplicates are kept.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []
    for element in soup.find_all(class_=class_name):
        if element.name == 'a':
            # A classed <a> may lack an href; indexing it directly would
            # raise KeyError, so look it up defensively.
            href = element.get('href')
            if href is not None:
                links.append(urljoin(base_url, href))
        else:
            links.extend(
                urljoin(base_url, a['href'])
                for a in element.find_all('a', href=True)
            )
    return links

def extract_json_ld(html_content):
    """Collect every JSON-LD document embedded in an HTML page.

    Looks at each ``<script type="application/ld+json">`` tag and decodes its
    content as JSON.

    Args:
        html_content: HTML source of the page to scan.

    Returns:
        A list with one decoded JSON value per script tag that held valid
        JSON. Empty tags are skipped; tags with malformed JSON are reported
        on stdout and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    json_lds = []
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if raw is None:
            # ``.string`` is None when the tag does not hold exactly one text
            # node (e.g. an empty tag); json.loads(None) would raise a
            # TypeError that the handler below does not catch.
            raw = script.get_text()
        if not raw.strip():
            continue
        try:
            json_lds.append(json.loads(raw))
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed: {e}")
    return json_lds

def save_json_ld_to_ttl(json_lds, file_name):
    """Merge JSON-LD documents into one graph, write it as Turtle, and say so.

    Args:
        json_lds: Iterable of decoded JSON-LD values; each one is re-encoded
            to a JSON string and parsed into the same graph.
        file_name: Destination path handed to the graph serializer; also
            echoed in the confirmation message printed afterwards.
    """
    graph = Graph()
    for document in json_lds:
        payload = json.dumps(document)
        graph.parse(data=payload, format='json-ld')
    graph.serialize(destination=file_name, format='turtle')
    print(f"Saved TTL data to {file_name}")

def main(url):
    """Crawl listing -> restaurant pages -> menu-item pages and save their JSON-LD.

    The listing page is scanned for links under "restaurant-item" elements;
    each restaurant page is scanned for links under "menu-item" elements;
    the JSON-LD found on every menu-item page is accumulated and written to
    "menu.ttl". When nothing was collected, no file is written and a message
    says so instead of finishing silently.

    NOTE(review): the recorded run of this cell logged only the listing and
    restaurant URLs and no save message, which suggests no "menu-item" links
    were found on the restaurant pages - confirm against the page markup.

    Args:
        url: Address of the listing page to start from.
    """
    all_json_lds = []
    html_content = fetch_and_parse_url(url)
    if html_content:
        restaurant_links = find_links_with_class(html_content, url, "restaurant-item")
        for restaurant_link in restaurant_links:
            # Skip links that point back at the listing page or end in '#'
            # (anchors); they are not restaurant pages.
            if restaurant_link == url or restaurant_link.endswith('#'):
                continue
            restaurant_html_content = fetch_and_parse_url(restaurant_link)
            if not restaurant_html_content:
                continue
            menu_item_links = find_links_with_class(restaurant_html_content, restaurant_link, "menu-item")
            for menu_item_link in menu_item_links:
                menu_html_content = fetch_and_parse_url(menu_item_link)
                if menu_html_content:
                    all_json_lds.extend(extract_json_ld(menu_html_content))
    if all_json_lds:
        save_json_ld_to_ttl(all_json_lds, "menu.ttl")
    else:
        # Make the empty outcome visible rather than ending without output.
        print("No JSON-LD data collected; menu.ttl was not written")

# Run the crawl against the address stored in `url` by an earlier cell.
if __name__ == "__main__":
    main(url)


Processing URL: https://beefast.coopcycle.org/fr/restaurants
Processing URL: https://beefast.coopcycle.org/fr/restaurant/30-mealk
Processing URL: https://beefast.coopcycle.org/fr/restaurant/22-moana-poke
Processing URL: https://beefast.coopcycle.org/fr/restaurant/28-la-mamma-mia
Processing URL: https://beefast.coopcycle.org/fr/restaurant/23-la-manufacture
Processing URL: https://beefast.coopcycle.org/fr/restaurant/31-via-pizza
Processing URL: https://beefast.coopcycle.org/fr/restaurant/21-robin-room
Processing URL: https://beefast.coopcycle.org/fr/restaurant/34-chez-rosa
Processing URL: https://beefast.coopcycle.org/fr/restaurant/35-le-teuf
Processing URL: https://beefast.coopcycle.org/fr/restaurant/13-l-adresse
