In [2]:
from lxml import html
import requests
import re
import os

In [2]:
# Fetch the PROFECO open-data index page and parse it into an lxml tree.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
page = requests.get('https://datos.profeco.gob.mx/datos_abiertos/qqp.php', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

In [66]:
# Collect the href of every anchor inside the main content container.
# Restricting the query to <a> elements keeps this list aligned with the
# anchor texts extracted via '//a/text()', since the two lists are later
# paired by position.
links = tree.xpath('/html/body/main/div/div//a/@href')
links

['file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'file.php?t=4ecfa981c01e742a5461bf543a7b4108']

In [17]:
# Text nodes of every anchor found under /html/body/main/div/div.
anchor_text_query = '/html/body/main/div/div//a/text()'
names = tree.xpath(anchor_text_query)
names

['Base de Datos Histórica Quién es Quién en los Precios 2024',
 'Base de Datos Histórica Quién es Quién en los Precios 2023',
 'Base de Datos Histórica Quién es Quién en los Precios 2022',
 'Base de Datos Histórica Quién es Quién en los Precios 2021',
 'Base de Datos Histórica Quién es Quién en los Precios 2020',
 'Base de Datos Histórica Quién es Quién en los Precios 2019',
 'Base de Datos Histórica Quién es Quién en los Precios 2018',
 'Base de Datos Histórica Quién es Quién en los Precios 2017',
 'Base de Datos Histórica Quién es Quién en los Precios 2016',
 'Base de Datos Histórica Quién es Quién en los Precios 2015']

In [37]:
def _year_label(title):
    """Return 'QQP_' followed by the first run of digits found in ``title``."""
    digit_runs = re.findall(r'\d+', title)
    return 'QQP_' + digit_runs[0]


# One short label per link title, in the same order as `names`.
clean_year = list(map(_year_label, names))
clean_year

['QQP_2024',
 'QQP_2023',
 'QQP_2022',
 'QQP_2021',
 'QQP_2020',
 'QQP_2019',
 'QQP_2018',
 'QQP_2017',
 'QQP_2016',
 'QQP_2015']

In [54]:
# Pair each year label with its download link. The two lists come from
# separate XPath queries, so check that they line up first: zip() silently
# drops unmatched entries, and duplicate labels would overwrite each other
# in the resulting dict.
assert len(clean_year) == len(links), (
    f"Found {len(clean_year)} year labels but {len(links)} links"
)
assert len(set(clean_year)) == len(clean_year), "Duplicate year labels found"
pages = dict(zip(clean_year, links))
pages

{'QQP_2024': 'file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'QQP_2023': 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'QQP_2022': 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'QQP_2021': 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'QQP_2020': 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'QQP_2019': 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'QQP_2018': 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'QQP_2017': 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'QQP_2016': 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'QQP_2015': 'file.php?t=4ecfa981c01e742a5461bf543a7b4108'}

In [56]:
def create_folder_if_not_exists(folder_name):
    """Create ``folder_name`` (including missing parents) and report the outcome.

    Args:
        folder_name: Path of the directory to create.

    Prints one line saying whether the folder was created or already existed.
    Returns None.
    """
    try:
        # Attempt the creation directly rather than checking for existence
        # first: a separate check leaves a window in which the path can
        # appear between the check and the makedirs call.
        os.makedirs(folder_name)
    except FileExistsError:
        print(f"The folder '{folder_name}' already exists.")
    else:
        print(f"The folder '{folder_name}' has been created.")

In [65]:
# Show each label next to its relative link, one entry per line.
for year, link in pages.items():
    print(year, "->", link)

QQP_2024 -> file.php?t=ac07fbc4a9fd1d925384aff634f11071
QQP_2023 -> file.php?t=f3270d227f2966e6138a3ed41a9bbfb7
QQP_2022 -> file.php?t=af88f42c5cb82c6c35dd962b1ae69051
QQP_2021 -> file.php?t=493b83b886f0266909d783fc8f776b11
QQP_2020 -> file.php?t=4df382eefa26f1f0d28d3a11aaf41add
QQP_2019 -> file.php?t=09939d92d2afcde64dbc06e057877e16
QQP_2018 -> file.php?t=01fafa951fb6c82e6e4bb491af8f1688
QQP_2017 -> file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70
QQP_2016 -> file.php?t=c388a30cb3f4b4c4fa29302618ef5557
QQP_2015 -> file.php?t=4ecfa981c01e742a5461bf543a7b4108


In [None]:
# Base URL that the relative 'file.php?t=...' links are appended to when
# download_files builds each download URL.
home = 'https://datos.profeco.gob.mx/datos_abiertos/'

In [None]:
def download_files(home, json_pages, timeout=60, chunk_size=1024 * 1024):
    """Download every file listed in ``json_pages`` into ``files/<key>/<key>.rar``.

    Args:
        home: Base URL that each relative link is appended to.
        json_pages: Mapping of label (used as folder and file name) to the
            relative link of the file to download.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the loop forever.
        chunk_size: Number of bytes read from the response per write.

    Each entry is handled independently: a failure is reported with a printed
    message and the loop moves on to the next entry. Returns None.
    """
    for year, relative_link in json_pages.items():
        url = home + relative_link
        print('Trying to download {} -> {}'.format(url, year))
        try:
            # stream=True avoids holding the whole archive in memory.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Failed to download the file (HTTP {response.status_code}).")
                    continue
                folder = "files/{year}".format(year=year)
                create_folder_if_not_exists(folder)
                target = "{folder}/{year}.rar".format(folder=folder, year=year)
                # Write to a temporary name and rename only after the body
                # has been fully received, so an interrupted transfer never
                # leaves a truncated file under the final name. A leftover
                # '.part' file may remain after a failure.
                partial = target + ".part"
                with open(partial, "wb") as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)
                os.replace(partial, target)
            print("File downloaded successfully!")
        except Exception as e:
            # Best-effort loop: report the problem and continue with the rest.
            print(f"Failed to download {url}. \n Error: {e}")

In [67]:
# Download every entry in `pages`; each file is written to files/<label>/<label>.rar.
download_files(home, pages)

Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=ac07fbc4a9fd1d925384aff634f11071
The folder 'files/QQP_2024' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=f3270d227f2966e6138a3ed41a9bbfb7
The folder 'files/QQP_2023' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=af88f42c5cb82c6c35dd962b1ae69051
The folder 'files/QQP_2022' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=493b83b886f0266909d783fc8f776b11
The folder 'files/QQP_2021' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4df382eefa26f1f0d28d3a11aaf41add
The folder 'files/QQP_2020' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=09939d92d2a