In [3]:
from lxml import html
import requests
import re
import os

In [4]:
def create_folder_if_not_exists(folder_name):
    """Create ``folder_name`` (and any missing parents) unless it already exists.

    Prints a message stating whether the folder was created or was already
    present.

    Args:
        folder_name: Path of the directory to create.
    """
    try:
        # Attempt the creation directly instead of calling os.path.exists()
        # first: with check-then-create, another process can create the path
        # between the check and os.makedirs(), which then raises
        # FileExistsError out of this helper.
        os.makedirs(folder_name)
    except FileExistsError:
        print(f"The folder '{folder_name}' already exists.")
    else:
        print(f"The folder '{folder_name}' has been created.")

In [5]:
def download_files(home, json_pages, timeout=60, chunk_size=1024 * 1024):
    """Download every archive in ``json_pages`` to ``./files/<name>/<name>.rar``.

    Each download is best-effort: a failure is reported on stdout and the
    loop continues with the next entry.

    Args:
        home: Base URL that each relative link is appended to.
        json_pages: Mapping of output name (used for both the folder and the
            file name) to the link relative to ``home``.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the loop indefinitely.
        chunk_size: Number of bytes requested per chunk while streaming the
            response body to disk.
    """
    for year in json_pages:
        url = home + json_pages[year]
        print('Trying to download {} -> {}'.format(url, year))
        folder = "./files/{year}".format(year = year)
        target = "./files/{year}/{year}.rar".format(year = year)
        partial = target + ".part"
        try:
            # stream=True writes the body to disk chunk by chunk instead of
            # holding the whole archive in memory via response.content; the
            # context manager releases the connection afterwards.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print("Failed to download the file.")
                    continue
                create_folder_if_not_exists(folder)
                with open(partial, "wb") as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)
            # Rename only after the transfer completed, so an interrupted
            # download never leaves a truncated .rar behind.
            os.replace(partial, target)
            print("File downloaded successfully!")
        except Exception as e:
            # Deliberately broad: one failing archive must not abort the
            # remaining downloads.
            try:
                os.remove(partial)
            except OSError:
                pass  # no partial file was written, or it cannot be removed
            print(f"Failed to download the file. \n Error: {e}")

In [6]:
# Fetch the index page and parse it into an lxml element tree.
# The timeout keeps this cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page.
page = requests.get('https://datos.profeco.gob.mx/datos_abiertos/qqp.php', timeout=60)
page.raise_for_status()
tree = html.fromstring(page.content)

In [7]:
# Collect the href of every anchor under /html/body/main/div/div.
# Restricted to <a> elements so this list lines up one-to-one with the anchor
# texts selected from the same container (both lists are zipped together
# further down); a bare //@href would also pick up hrefs on non-anchor
# elements and shift the pairing.
links = tree.xpath('/html/body/main/div/div//a/@href')
links

['file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'file.php?t=4ecfa981c01e742a5461bf543a7b4108']

In [8]:
# Text nodes of every anchor under /html/body/main/div/div; in the captured
# output each title ends with the year it covers.
names_xpath = '/html/body/main/div/div//a/text()'
names = tree.xpath(names_xpath)
names

['Base de Datos Histórica Quién es Quién en los Precios 2024',
 'Base de Datos Histórica Quién es Quién en los Precios 2023',
 'Base de Datos Histórica Quién es Quién en los Precios 2022',
 'Base de Datos Histórica Quién es Quién en los Precios 2021',
 'Base de Datos Histórica Quién es Quién en los Precios 2020',
 'Base de Datos Histórica Quién es Quién en los Precios 2019',
 'Base de Datos Histórica Quién es Quién en los Precios 2018',
 'Base de Datos Histórica Quién es Quién en los Precios 2017',
 'Base de Datos Histórica Quién es Quién en los Precios 2016',
 'Base de Datos Histórica Quién es Quién en los Precios 2015']

In [9]:
# Build one 'QQP_<digits>' label per title from the first run of digits in it.
digit_run = re.compile(r'\d+')
clean_year = ['QQP_' + digit_run.findall(title)[0] for title in names]
clean_year

['QQP_2024',
 'QQP_2023',
 'QQP_2022',
 'QQP_2021',
 'QQP_2020',
 'QQP_2019',
 'QQP_2018',
 'QQP_2017',
 'QQP_2016',
 'QQP_2015']

In [10]:
# Map each 'QQP_<year>' label to its relative download link.
# zip() silently truncates to the shorter input and dict() silently keeps only
# the last value for a repeated key, so verify the pairing explicitly instead
# of downloading a mislabelled or incomplete set of archives.
if len(clean_year) != len(links):
    raise ValueError(
        "Scraped {} titles but {} links; they cannot be paired reliably.".format(
            len(clean_year), len(links)
        )
    )
pages = dict(zip(clean_year, links))
if len(pages) != len(clean_year):
    raise ValueError("Duplicate labels in clean_year; some links would be dropped.")
pages

{'QQP_2024': 'file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'QQP_2023': 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'QQP_2022': 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'QQP_2021': 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'QQP_2020': 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'QQP_2019': 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'QQP_2018': 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'QQP_2017': 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'QQP_2016': 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'QQP_2015': 'file.php?t=4ecfa981c01e742a5461bf543a7b4108'}

In [11]:
# Show each label next to the relative link it maps to.
for year, link in pages.items():
    print(year, "->", link)

QQP_2024 -> file.php?t=ac07fbc4a9fd1d925384aff634f11071
QQP_2023 -> file.php?t=f3270d227f2966e6138a3ed41a9bbfb7
QQP_2022 -> file.php?t=af88f42c5cb82c6c35dd962b1ae69051
QQP_2021 -> file.php?t=493b83b886f0266909d783fc8f776b11
QQP_2020 -> file.php?t=4df382eefa26f1f0d28d3a11aaf41add
QQP_2019 -> file.php?t=09939d92d2afcde64dbc06e057877e16
QQP_2018 -> file.php?t=01fafa951fb6c82e6e4bb491af8f1688
QQP_2017 -> file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70
QQP_2016 -> file.php?t=c388a30cb3f4b4c4fa29302618ef5557
QQP_2015 -> file.php?t=4ecfa981c01e742a5461bf543a7b4108


In [12]:
# Base URL that download_files() prepends to each relative link in `pages`.
home = 'https://datos.profeco.gob.mx/datos_abiertos/'

In [13]:
# List the current working directory; the relative ./files path used by
# download_files() is resolved against it.
!ls

bin	   dev	   home   lib64       media  proc  sbin  sys  var
boot	   etc	   lib	  libx32      mnt    root  snap  tmp
copyright  hadoop  lib32  lost+found  opt    run   srv	 usr


In [14]:
%%time
# Download every archive listed in `pages`; %%time reports the duration of the whole cell.
download_files(home, pages)

Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=ac07fbc4a9fd1d925384aff634f11071 -> QQP_2024
The folder './files/QQP_2024' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=f3270d227f2966e6138a3ed41a9bbfb7 -> QQP_2023
The folder './files/QQP_2023' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=af88f42c5cb82c6c35dd962b1ae69051 -> QQP_2022
The folder './files/QQP_2022' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=493b83b886f0266909d783fc8f776b11 -> QQP_2021
The folder './files/QQP_2021' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4df382eefa26f1f0d28d3a11aaf41add -> QQP_2020
The folder './files/QQP_2020' has been created.
File downloaded successfully!
Trying to downl

In [15]:
# Check free disk space after the downloads, before the archives are extracted.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
/dev/root        97G   20G   78G  20% /
tmpfs           7.9G     0  7.9G   0% /dev/shm
tmpfs           3.2G  1.2M  3.2G   1% /run
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs           4.0M     0  4.0M   0% /sys/fs/cgroup
efivarfs         56K   24K   27K  48% /sys/firmware/efi/efivars
/dev/sda15      105M  6.1M   99M   6% /boot/efi
tmpfs           1.6G  4.0K  1.6G   1% /run/user/115
tmpfs           1.6G  4.0K  1.6G   1% /run/user/117
tmpfs           1.6G  4.0K  1.6G   1% /run/user/116
tmpfs           1.6G  4.0K  1.6G   1% /run/user/119
tmpfs           1.6G  4.0K  1.6G   1% /run/user/126
tmpfs           1.6G  4.0K  1.6G   1% /run/user/120


In [16]:
# Fetch the helper shell script that the next cell runs on the `files` directory.
# NOTE(review): the URL points at the `main` branch, so the script that gets
# executed can change between runs — consider pinning it to a commit.
! wget "https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/unrar_files.sh" -O unrar_files.sh

--2024-05-28 14:05:32--  https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/unrar_files.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 933 [text/plain]
Saving to: ‘unrar_files.sh’


2024-05-28 14:05:32 (65.8 MB/s) - ‘unrar_files.sh’ saved [933/933]



In [29]:
%%time
# Make the fetched helper executable and run it on the `files` directory; the
# captured output shows it extracting each .rar into a csv/ subfolder.
! chmod +x unrar_files.sh
! ./unrar_files.sh files

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
rar is already the newest version (2:5.5.0-1.1).
unrar is already the newest version (1:6.1.5-1).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.

UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from files/QQP_2015/QQP_2015.rar

Extracting  files/QQP_2015/csv/012015.csv                                0  OK 
Extracting  files/QQP_2015/csv/022015.csv                                2  OK 
Extracting  files/QQP_2015/csv/032015.csv                                3  OK 
Extracting  files/QQP_2015/csv/042015.csv                                5  OK 
Extracting  files/QQP_2015/csv/052015.csv                                7  OK 
Extracting  files/QQP_2015/csv/062015.csv                                9  OK 
Extracting  files/QQP_2015/csv/072015.csv                               10  OK 
Extracting  files/QQP_2015/csv/082015.csv                    

In [18]:
# Fetch the helper shell script that the next cell runs to copy the CSVs to HDFS.
# NOTE(review): the URL points at the `main` branch, so the script that gets
# executed can change between runs — consider pinning it to a commit.
!wget "https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/hdfs_files.sh" -O hdfs_files.sh

--2024-05-28 14:05:33--  https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/hdfs_files.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 755 [text/plain]
Saving to: ‘hdfs_files.sh’


2024-05-28 14:05:33 (52.9 MB/s) - ‘hdfs_files.sh’ saved [755/755]



In [None]:
%%time
# Make the fetched helper executable and run it with the local `files`
# directory and /user/QQP as arguments; the captured output shows each CSV
# being copied to HDFS under /user/QQP.
!chmod a+x hdfs_files.sh
!./hdfs_files.sh files /user/QQP  

Copiando files/QQP_2022/csv/052022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/062022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/032022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/272022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/452022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/262022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/072022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/162022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/082022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/372022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/182022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/242022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/022022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/292022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/502022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/352022.csv a HDFS en /user/QQP
Copiando files/QQP_2022/csv/312022.csv a HDFS en /user/Q

In [None]:
# Check disk usage again after the extraction and the HDFS copy.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
/dev/root        97G   60G   38G  62% /
tmpfs           7.9G     0  7.9G   0% /dev/shm
tmpfs           3.2G  1.2M  3.2G   1% /run
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs           4.0M     0  4.0M   0% /sys/fs/cgroup
efivarfs         56K   24K   27K  48% /sys/firmware/efi/efivars
/dev/sda15      105M  6.1M   99M   6% /boot/efi
tmpfs           1.6G  4.0K  1.6G   1% /run/user/115
tmpfs           1.6G  4.0K  1.6G   1% /run/user/117
tmpfs           1.6G  4.0K  1.6G   1% /run/user/116
tmpfs           1.6G  4.0K  1.6G   1% /run/user/119
tmpfs           1.6G  4.0K  1.6G   1% /run/user/126
tmpfs           1.6G  4.0K  1.6G   1% /run/user/120
