In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
#The code below from this tutorial runs but doesn't produce any output
#https://www.geeksforgeeks.org/downloading-pdfs-with-python-using-requests-and-beautifulsoup/

# Le Loi pdf URL 
loi_url = "https://www.legifrance.gouv.fr/download/pdf?id=Sxg3EgwOTTiCEoslFw974wlgj8aUOv1MZCf1HPdWY3s="
 
# Requests URL and get response object
loi_response = requests.get(loi_url)
 
#Parse text obtained
#html.parser: Built-in parser in Python for parsing HTML.
#Purpose: Converts HTML content into a Beautiful Soup object.
soup = BeautifulSoup(loi_response.text, 'html.parser')
 
# Find all hyperlinks present on webpage
#'a' refers to the HTML <a> tags -  used to define hyperlinks. 
#The find_all method of Beautiful Soup is used to search the parse tree 
#and return all the instances of a specified tag
links = soup.find_all('a')
 
i = 0
 
# From all links check for pdf link and
# if present download file
for link in links:
    #This line checks if the string '.pdf' is present in the href attribute of the link tag.
    #link.get('href', []) tries to get the value of the href attribute.
    #If href does not exist, it returns an empty list [] to prevent errors.
    if ('.pdf' in link.get('href', [])):
        #i = i + 1
        i += 1
        print("Downloading file: ", i)
 
        # Get response object for link
        #This line sends an HTTP GET request to the URL 
        #specified in the href attribute of the link tag.
        response = requests.get(link.get('href'))
 
        # Write content in pdf file
        #These lines create a new file named pdfN.pdf, 
        #where N is the current value of i.
        #open("pdf"+str(i)+".pdf", 'wb') opens a file in write-binary mode ('wb'), 
        #meaning the file is opened for writing, and any existing content is cleared.
        #pdf.write(response.content) writes the content of the response (which is the PDF file) 
        #to the newly created file.
        #pdf.close() closes the file to ensure all data is properly written 
        #and resources are released
        pdf = open("pdf"+str(i)+".pdf", 'wb')
        pdf.write(response.content)
        pdf.close()
        print("File ", i, " downloaded")
 
print("All PDF files downloaded")

All PDF files downloaded


In [3]:
#Debugging above code

#Requests URL and get response object
#Check if the Request is Successful
loi_response = requests.get(loi_url)
print(loi_response.status_code)  # Should be 200 for a successful request

#200!

200


In [4]:
#Verify the HTML Content
print(loi_response.text)

#this looks correct
#<h1 class="pdf-title">Journal officiel électronique authentifié n° 0238 du 14/10/2014</h1>
#<object data="/download/file/Sxg3EgwOTTiCEoslFw974wlgj8aUOv1MZCf1HPdWY3s=/JOE_TEXTE">
#<p>Vous n'avez pas de plugin PDF mais vous pouvez 
#<a href="/download/file/Sxg3EgwOTTiCEoslFw974wlgj8aUOv1MZCf1HPdWY3s=/JOE_TEXTE">télécharger le fichier.
#</a></p></object></div>

<!DOCTYPE html><html lang="fr" class="no-js" dir="ltr"><head><title>Légifrance - Publications officielles - Journal officiel - JORF n° 0238 du 14/10/2014</title><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1"><meta name="description" content="Légifrance"><meta name="author" content=""><meta name="format-detection" content="telephone=no"><meta name="robots" content="noindex, nofollow"><meta name="_csrf" content="${_csrf.token}"/><meta name="_csrf_header" content="${_csrf.headerName}"/><link rel="Shortcut icon" type="image/x-icon" href="/resources/images/favicon.ico"><link rel="icon" sizes="16x16 32x32 48x48 64x64" href="/resources/images/favicon.ico"><link rel="apple-touch-icon" href="/resources/images/favicon-152.png"><!--  		Optional: IE10 Tile.  --><meta name="msapplication-TileColor" content="#FFFFFF"><meta name="msapplication-TileImage" content="/resources/images/favicon-144.png"><!-- 		O

In [5]:
#Check the Parsed HTML

#Purpose: Converts HTML content into a Beautiful Soup object
soup = BeautifulSoup(loi_response.text, 'html.parser')
print(soup.prettify()) # Pretty-print the HTML

<!DOCTYPE html>
<html class="no-js" dir="ltr" lang="fr">
 <head>
  <title>
   Légifrance - Publications officielles - Journal officiel - JORF n° 0238 du 14/10/2014
  </title>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Légifrance" name="description"/>
  <meta content="" name="author"/>
  <meta content="telephone=no" name="format-detection"/>
  <meta content="noindex, nofollow" name="robots"/>
  <meta content="${_csrf.token}" name="_csrf">
   <meta content="${_csrf.headerName}" name="_csrf_header">
    <link href="/resources/images/favicon.ico" rel="Shortcut icon" type="image/x-icon"/>
    <link href="/resources/images/favicon.ico" rel="icon" sizes="16x16 32x32 48x48 64x64"/>
    <link href="/resources/images/favicon-152.png" rel="apple-touch-icon"/>
    <!--  		Optional: IE10 Tile.  -->
    <meta content="#FFFFFF" name="msapplication-TileColor"/>
    <meta con

In [6]:
#Verify the Links
links = soup.find_all('a')
print(len(links))  # Print the number of <a> tags found
#18

18


In [7]:
#Debug the Link Processing
#Print the href attributes of all links to see if any contain .pdf
for link in links:
    href = link.get('href', [])
    print(href)
    if '.pdf' in href:
        print('pdf link found:', href)
        
#pdf not found so trying another link
    

/
/download/file/Sxg3EgwOTTiCEoslFw974wlgj8aUOv1MZCf1HPdWY3s=/JOE_TEXTE
/
/contenu/menu/publications-officielles
/jorf/jo
/download/file/Sxg3EgwOTTiCEoslFw974wlgj8aUOv1MZCf1HPdWY3s=/JOE_TEXTE
https://sgmap.sphinxdeclic.com/d/s/j9c6ad
/contenu/pied-de-page/a-propos-de-cette-version
/contenu/pied-de-page/mentions-legales
/contenu/pied-de-page/politique-de-confidentialite
/contenu/pied-de-page/plan-du-site
/contenu/pied-de-page/open-data-et-api
/contenu/pied-de-page/accessibilite-partiellement-conforme
https://www.service-public.fr/
https://www.vie-publique.fr/
https://www.data.gouv.fr/fr/
https://code.travail.gouv.fr/
https://www.info.gouv.fr/


In [8]:
# Le Loi pdf URL 
loi_url_2 = "https://www.legifrance.gouv.fr/jorf/id/JORFTEXT000030024742"

#Requests URL and get response object
#Check if the Request is Successful
loi_response2 = requests.get(loi_url_2)
print(loi_response2.status_code)  # Should be 200 for a successful request

#200!

200


In [9]:
#Verify the HTML Content
print(loi_response2.text)

<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:eli="http://data.europa.eu/eli/ontology#"><html lang="fr" class="no-js" dir="ltr"><head><title>LOI n° 2014-1170 du 13 octobre 2014 d&#39;avenir pour l&#39;agriculture, l&#39;alimentation et la forêt (rectificatif) - Légifrance</title><meta charset="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="viewport" content="width=device-width, initial-scale=1"/><meta name="description" content="LOI n° 2014-1170 du 13 octobre 2014 d&#39;avenir pour l&#39;agriculture, l&#39;alimentation et la forêt (rectificatif)"/><meta name="author" content=""/><meta name="format-detection" content="telephone=no"/><meta name="_csrf" content="700c0aa2-3547-45be-a538-e34c5caf4930"/><meta name="_csrf_header" content="X-CSRF-TOKEN"/><meta name="robots" content="index, follow"><link rel="Shortcut icon" type="image/x-icon" href="/resources/images/favicon.ico"/><link rel="icon" sizes="16x16 32x32 48x48 64x64" href="/resources/imag

In [15]:
#Check the Parsed HTML

#Purpose: Converts HTML content into a Beautiful Soup object
soup = BeautifulSoup(loi_response2.text, 'html.parser')
print(soup.prettify()) # Pretty-print the HTML

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:eli="http://data.europa.eu/eli/ontology#">
 <html class="no-js" dir="ltr" lang="fr">
  <head>
   <title>
    LOI n° 2014-1170 du 13 octobre 2014 d'avenir pour l'agriculture, l'alimentation et la forêt (rectificatif) - Légifrance
   </title>
   <meta charset="utf-8"/>
   <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
   <meta content="width=device-width, initial-scale=1" name="viewport"/>
   <meta content="LOI n° 2014-1170 du 13 octobre 2014 d'avenir pour l'agriculture, l'alimentation et la forêt (rectificatif)" name="description"/>
   <meta content="" name="author"/>
   <meta content="telephone=no" name="format-detection"/>
   <meta content="99034ba5-c800-456d-8ba2-629286efacf2" name="_csrf"/>
   <meta content="X-CSRF-TOKEN" name="_csrf_header"/>
   <meta content="index, follow" name="robots"/>
   <link href="/resources/images/favicon.ico" rel="Shortcut icon" type="image/x-icon"/>
   <link href="/resources/images/

In [16]:
#Verify the Links
links = soup.find_all('a')
print(len(links))  # Print the number of <a> tags found
#146

146


In [17]:
for link in links:
    href = link.get('href', [])
    print(href)
    if '.pdf' in href:
        print("PDF link found:", href)

#main
#navigation
#navigation-mobile
#search-container
/
/contenu/menu/droit-national-en-vigueur
/contenu/menu/droit-national-en-vigueur/constitution
/loda/id/JORFTEXT000000571356/
/contenu/menu/droit-national-en-vigueur/constitution/declaration-des-droits-de-l-homme-et-du-citoyen-de-1789
/contenu/menu/droit-national-en-vigueur/constitution/preambule-de-la-constitution-du-27-octobre-1946
/contenu/menu/droit-national-en-vigueur/constitution/charte-de-l-environnement
/liste/code?etatTexte=VIGUEUR&etatTexte=VIGUEUR_DIFF&page=1#code
/search/lois?tab_selection=lawarticledecree&searchField=ALL&query=&page=1&init=true&dateSignature=&datePublication=
/contenu/menu/droit-national-en-vigueur/jurisprudence
/search/constit?tab_selection=constit&searchField=ALL&query=&page=1&init=true
/search/cetat?tab_selection=cetat&searchField=ALL&query=&page=1&init=true
https://www.legifrance.gouv.fr/ceta/planclassement
/search/juri?tab_selection=juri&searchField=ALL&query=&page=1&init=true
https://www.legifran

In [10]:
# Le Loi pdf URL 
#try #3 with FAO webpage
loi_url_3 = "https://www.fao.org/faolex/results/details/fr/c/LEX-FAOC143035/"

#Requests URL and get response object
#Check if the Request is Successful
loi_response3 = requests.get(loi_url_3)
print(loi_response3.status_code)  # Should be 200 for a successful request

#200!

200


In [11]:
#Verify the HTML Content
print(loi_response3.text)

<!DOCTYPE html>
<html lang="fr" xmlns="http://www.w3.org/1999/xhtml">
<head>

<meta charset="utf-8">
<!-- 
	FAO: Food and Agriculture Organization of the United Nations, for a world without hunger

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2015 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->

<base href="https://www.fao.org/">
<link rel="shortcut icon" href="https://www.fao.org/fileadmin/templates/faoweb/images/icons/favicon.ico" type="image/x-icon; charset=binary">
<link rel="icon" href="https://www.fao.org/fileadmin/templates/faoweb/images/icons/favicon.ico" type="image/x-icon; charset=binary">

<meta name="generator" content="TYPO3 CMS">

<link rel="stylesheet" type="text/css" href="/typo3temp/stylesheet_5d370599a3.css?1460386828" media="

In [12]:
#Check the parsed HTML
soup = BeautifulSoup(loi_response3.text, 'html.parser')
print(soup.prettify())  # Pretty-print the HTML

<!DOCTYPE html>
<html lang="fr" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta charset="utf-8"/>
  <!-- 
	FAO: Food and Agriculture Organization of the United Nations, for a world without hunger

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2015 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
  <base href="https://www.fao.org/"/>
  <link href="https://www.fao.org/fileadmin/templates/faoweb/images/icons/favicon.ico" rel="shortcut icon" type="image/x-icon; charset=binary"/>
  <link href="https://www.fao.org/fileadmin/templates/faoweb/images/icons/favicon.ico" rel="icon" type="image/x-icon; charset=binary"/>
  <meta content="TYPO3 CMS" name="generator"/>
  <link href="/typo3temp/stylesheet_5d370599a3.css?1460386828" media="all" rel="stylesh

In [13]:
#Check if any <a> tags are found:
links = soup.find_all('a')
print(len(links))  # Print the number of <a> tags found

#51

51
