In [1]:
url = 'https://beefast.coopcycle.org/fr/restaurants'

In [2]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin

def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin

def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def find_links(html_content, base_url):
    soup = BeautifulSoup(html_content, 'html.parser')
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)]

def extract_json_ld(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    scripts = soup.find_all('script', type='application/ld+json')
    json_lds = []
    for script in scripts:
        try:
            json_ld = json.loads(script.string)
            json_lds.append(json_ld)
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed: {e}")
    return json_lds

def main(url):
    html_content = fetch_and_parse_url(url)
    if html_content:
        links = find_links(html_content, url)
        for link in links:
            # Avoiding processing '#' links (which are just anchors)
            if link == url or link.endswith('#'):
                continue
            print(f"Processing {link}")
            link_html_content = fetch_and_parse_url(link)
            if link_html_content:
                json_lds = extract_json_ld(link_html_content)
                for json_ld in json_lds:
                    print(json.dumps(json_ld, indent=2))

if __name__ == "__main__":
    main(url)



Processing https://beefast.coopcycle.org/fr/
Processing https://beefast.coopcycle.org/an/restaurants
Processing https://beefast.coopcycle.org/ca/restaurants
Processing https://beefast.coopcycle.org/de/restaurants
Processing https://beefast.coopcycle.org/en/restaurants
Processing https://beefast.coopcycle.org/es/restaurants
Processing https://beefast.coopcycle.org/eu/restaurants
Processing https://beefast.coopcycle.org/it/restaurants
Processing https://beefast.coopcycle.org/pl/restaurants
Processing https://beefast.coopcycle.org/pt_BR/restaurants
Processing https://beefast.coopcycle.org/pt_PT/restaurants
Processing https://beefast.coopcycle.org/login
Processing https://beefast.coopcycle.org/fr/shops
Processing https://beefast.coopcycle.org/fr/restaurants/suggest
Processing https://beefast.coopcycle.org/fr/shops
Processing https://beefast.coopcycle.org/fr/restaurant/30-mealk
{
  "@context": "http://schema.org",
  "@id": "/api/restaurants/30",
  "@type": "http://schema.org/Restaurant",
  

## V2

In [18]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.namespace import RDF, RDFS, XSD

# Define namespaces
SCHEMA = Namespace("http://schema.org/")

def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def extract_restaurant_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    script = soup.find('script', type='application/ld+json')
    restaurant_details = {}
    if script:
        json_ld = json.loads(script.string)
        restaurant_details['name'] = json_ld.get('name')
        address = json_ld.get('address')
        if address:
            restaurant_details['address'] = address.get('streetAddress')
        opening_hours = json_ld.get('openingHoursSpecification')
        if opening_hours:
            opening_hours_text = ', '.join([f"{oh['dayOfWeek'][0]}: {oh['opens']}-{oh['closes']}" for oh in opening_hours])
            restaurant_details['opening_hours'] = opening_hours_text
        restaurant_details['image'] = json_ld.get('image')
        restaurant_details['description'] = json_ld.get('description')
        restaurant_details['delivery_url'] = json_ld.get('potentialAction', {}).get('target', {}).get('urlTemplate')
    print(restaurant_details)
    return restaurant_details

def convert_to_ttl(restaurant_details):
    g = Graph()
    restaurant_uri = URIRef("http://example.org/restaurant")
    for key, value in restaurant_details.items():
        if value:
            if key == 'image' or key == 'delivery_url':
                g.add((restaurant_uri, SCHEMA[key], URIRef(value)))
            else:
                g.add((restaurant_uri, SCHEMA[key], Literal(value)))
    return g.serialize(format='turtle')

def main():
    with open('coopcycle.json', 'r') as f:
        data = json.load(f)
        for item in data:
            coopcycle_url = item.get('coopcycle_url')
            if coopcycle_url:
                url = urljoin(coopcycle_url, '/fr/shops?type=restaurant')
                html_content = fetch_and_parse_url(url)
                if html_content:
                    soup = BeautifulSoup(html_content, 'html.parser')
                    restaurant_items = soup.find_all('div', class_='restaurant-item')
                    for restaurant_item in restaurant_items:
                        restaurant_link = restaurant_item.find('a')['href']
                        restaurant_url = urljoin(coopcycle_url, restaurant_link)
                        restaurant_html_content = fetch_and_parse_url(restaurant_url)
                        if restaurant_html_content:
                            restaurant_details = extract_restaurant_details(restaurant_html_content)
                            ttl_data = convert_to_ttl(restaurant_details)
                            with open('output.ttl', 'a',encoding='utf-8') as output_file:
                                output_file.write(ttl_data)

if __name__ == "__main__":
    main()


{'name': 'Du Pain Pour Demain (en précommande à J+1)', 'address': '31 Rue de Bruges, 21000 Dijon, France', 'opening_hours': 'Monday: 06:45-10:00', 'image': 'https://a2roo.coopcycle.org/media/cache/restaurant_thumbnail/60/c7/60c71551bdba3.jpg', 'description': "Boulangerie-pâtisserie aux confins des quartiers Toison d'Or et Charmette, Du Pain pour Demain vous propose une offre exclusive pour vos petits déjeuners au bureau ou à la maison. \r\n\r\nDes viennoiseries hautes en couleurs fabriquées avec passion et des matières premières exceptionnelles. \r\n\r\nBlés issus de la filière biologique et Label Rouge, dialogue direct avec les producteurs et originalité, c'est la recette de la Meilleure Boulangerie de France !", 'delivery_url': 'https://a2roo.coopcycle.org/fr/restaurant/34-du-pain-pour-demain-en-precommande-a-j-1'}
{'name': 'La Menuiserie', 'address': '28 Rue des Godrans, Dijon, France', 'opening_hours': 'Tuesday: 09:00-14:00', 'image': 'https://a2roo.coopcycle.org/media/cache/restau

KeyboardInterrupt: 

In [4]:
import requests
import rdflib
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.namespace import RDF, RDFS, XSD

# Define namespaces
SCHEMA = Namespace("http://schema.org/")

def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def extract_restaurant_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    script = soup.find('script', type='application/ld+json')
    restaurant_details = {}
    if script:
        json_ld = json.loads(script.string)
        restaurant_details['name'] = json_ld.get('name')
        address = json_ld.get('address')
        if address:
            restaurant_details['address'] = address.get('streetAddress')
            restaurant_details['latitude'] = address.get('geo', {}).get('latitude')
            restaurant_details['longitude'] = address.get('geo', {}).get('longitude')
        opening_hours = json_ld.get('openingHoursSpecification')
        if opening_hours:
            opening_hours_text = ', '.join([f"{oh['dayOfWeek'][0]}: {oh['opens']}-{oh['closes']}" for oh in opening_hours])
            restaurant_details['opening_hours'] = opening_hours_text
        restaurant_details['image'] = json_ld.get('image')
        restaurant_details['banner_image'] = json_ld.get('bannerImage')
        restaurant_details['description'] = json_ld.get('description')
        restaurant_details['delivery_url'] = json_ld.get('potentialAction', {}).get('target', {}).get('urlTemplate')
    return restaurant_details

def main():
    g = Graph()
    with open('coopcycle.json', 'r') as f:
        data = json.load(f)
        for i, item in enumerate(data):
            print(f"Processing item {i}")
            if i == 5:
                break
            coopcycle_url = item.get('coopcycle_url')
            if coopcycle_url:
                url = urljoin(coopcycle_url, '/fr/shops?type=restaurant')
                html_content = fetch_and_parse_url(url)
                if html_content:
                    soup = BeautifulSoup(html_content, 'html.parser')
                    restaurant_items = soup.find_all('div', class_='restaurant-item')
                    for restaurant_item in restaurant_items:
                        restaurant_link = restaurant_item.find('a')['href']
                        restaurant_url = urljoin(coopcycle_url, restaurant_link)
                        restaurant_html_content = fetch_and_parse_url(restaurant_url)
                        if restaurant_html_content:
                            restaurant_details = extract_restaurant_details(restaurant_html_content)
                            restaurant_uri = URIRef(restaurant_url)
                            g.add((restaurant_uri, RDF.type, SCHEMA.Restaurant))  # Declare restaurant as instance of schema:Restaurant
                            for key, value in restaurant_details.items():
                                if value:
                                    if key == 'image' or key == 'banner_image' or key == 'delivery_url':
                                        g.add((restaurant_uri, SCHEMA[key], URIRef(value)))
                                    elif key == 'latitude' or key == 'longitude':
                                        g.add((restaurant_uri, SCHEMA[key], Literal(value, datatype=XSD.float)))
                                    else:
                                        g.add((restaurant_uri, SCHEMA[key], Literal(value)))
        # add data properties
        data_properties = [
            "name",
            "address",
            "latitude",
            "longitude",
            "opening_hours",
            "description",
            "delivery_url",
            "image",
            "banner_image"
            ]

        for prop in data_properties:
            g.add((rdflib.URIRef("http://schema.org/" + prop), rdflib.RDF.type, rdflib.OWL.DatatypeProperty)) 
    with open('restaurants.ttl', 'w', encoding='utf-8') as output_file:
        output_file.write(g.serialize(format='turtle'))

if __name__ == "__main__":
    main()


Processing item 0
Processing item 1
Processing item 2
Processing item 3
Processing item 4
Processing item 5
