In [1]:
from lxml import html
import requests
import re
import os

In [2]:
def create_folder_if_not_exists(folder_name):
    """Create ``folder_name`` (including missing parents) unless it already exists.

    Prints one line stating whether the folder was created or was already there.

    Args:
        folder_name: Path of the directory to create.

    Returns:
        None.
    """
    try:
        # Attempt the creation directly instead of testing os.path.exists() first:
        # a separate check can race with another process creating the same path.
        os.makedirs(folder_name)
    except FileExistsError:
        print(f"The folder '{folder_name}' already exists.")
    else:
        print(f"The folder '{folder_name}' has been created.")

In [14]:
def download_files(home, json_pages, timeout=60):
    """Download one archive per entry of ``json_pages`` into ``./files``.

    For every ``label -> relative link`` pair the link is appended to ``home``
    and fetched. A 200 response is saved as ``./files/<label>/<label>.rar``.
    Failures are reported with ``print`` and the loop moves on to the next
    entry, so one bad link does not abort the whole batch.

    Args:
        home: Base URL each relative link is appended to. Plain string
            concatenation is used, so it should end with ``/``.
        json_pages: Mapping of folder/file label to relative download link.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            it a stalled server would block the loop indefinitely.

    Returns:
        None.
    """
    for year, link in json_pages.items():
        url = home + link
        print('Trying to download {} -> {}'.format(url, year))
        folder = "./files/{year}".format(year = year)
        target = "{folder}/{year}.rar".format(folder = folder, year = year)
        # Write to a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final .rar name.
        partial = target + ".part"
        try:
            # stream=True avoids holding the whole archive in memory at once.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print("Failed to download the file.")
                    continue
                create_folder_if_not_exists(folder)
                with open(partial, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
            os.replace(partial, target)
            print("File downloaded successfully!")
        except Exception as e:
            # Best-effort cleanup: the partial file may not exist yet.
            try:
                os.remove(partial)
            except OSError:
                pass
            print(f"Failed to download the file. \n Error: {e}")

In [3]:
# Fetch the listing page and parse it into an lxml tree for the XPath queries below.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get('https://datos.profeco.gob.mx/datos_abiertos/qqp.php', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

In [4]:
# Collect every href attribute found anywhere below /html/body/main/div/div.
links_xpath = '/html/body/main/div/div//@href'
links = tree.xpath(links_xpath)
links

['file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'file.php?t=4ecfa981c01e742a5461bf543a7b4108']

In [5]:
# Collect the text nodes of every <a> element below /html/body/main/div/div.
names_xpath = '/html/body/main/div/div//a/text()'
names = tree.xpath(names_xpath)
names

['Base de Datos Histórica Quién es Quién en los Precios 2024',
 'Base de Datos Histórica Quién es Quién en los Precios 2023',
 'Base de Datos Histórica Quién es Quién en los Precios 2022',
 'Base de Datos Histórica Quién es Quién en los Precios 2021',
 'Base de Datos Histórica Quién es Quién en los Precios 2020',
 'Base de Datos Histórica Quién es Quién en los Precios 2019',
 'Base de Datos Histórica Quién es Quién en los Precios 2018',
 'Base de Datos Histórica Quién es Quién en los Precios 2017',
 'Base de Datos Histórica Quién es Quién en los Precios 2016',
 'Base de Datos Histórica Quién es Quién en los Precios 2015']

In [6]:
# Build one 'QQP_<digits>' label per title, using the first run of digits in each title.
clean_year = [
    'QQP_' + digit_runs[0]
    for digit_runs in (re.findall(r'\d+', title) for title in names)
]
clean_year

['QQP_2024',
 'QQP_2023',
 'QQP_2022',
 'QQP_2021',
 'QQP_2020',
 'QQP_2019',
 'QQP_2018',
 'QQP_2017',
 'QQP_2016',
 'QQP_2015']

In [7]:
# Map each 'QQP_<year>' label to its relative download link.
# The labels and links come from two separate XPath queries, and zip() would
# silently drop the surplus items if their lengths ever differed, so fail loudly.
if len(clean_year) != len(links):
    raise ValueError(
        "Scraped {} labels but {} links; the page layout may have changed.".format(
            len(clean_year), len(links)
        )
    )
pages = dict(zip(clean_year, links))
# A repeated label would overwrite an earlier link in the dict.
if len(pages) != len(links):
    raise ValueError("Duplicate labels found; some links would be lost.")
pages

{'QQP_2024': 'file.php?t=ac07fbc4a9fd1d925384aff634f11071',
 'QQP_2023': 'file.php?t=f3270d227f2966e6138a3ed41a9bbfb7',
 'QQP_2022': 'file.php?t=af88f42c5cb82c6c35dd962b1ae69051',
 'QQP_2021': 'file.php?t=493b83b886f0266909d783fc8f776b11',
 'QQP_2020': 'file.php?t=4df382eefa26f1f0d28d3a11aaf41add',
 'QQP_2019': 'file.php?t=09939d92d2afcde64dbc06e057877e16',
 'QQP_2018': 'file.php?t=01fafa951fb6c82e6e4bb491af8f1688',
 'QQP_2017': 'file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70',
 'QQP_2016': 'file.php?t=c388a30cb3f4b4c4fa29302618ef5557',
 'QQP_2015': 'file.php?t=4ecfa981c01e742a5461bf543a7b4108'}

In [8]:
# Show each label next to the relative link it maps to.
for year, link in pages.items():
    print(year, "->", link)

QQP_2024 -> file.php?t=ac07fbc4a9fd1d925384aff634f11071
QQP_2023 -> file.php?t=f3270d227f2966e6138a3ed41a9bbfb7
QQP_2022 -> file.php?t=af88f42c5cb82c6c35dd962b1ae69051
QQP_2021 -> file.php?t=493b83b886f0266909d783fc8f776b11
QQP_2020 -> file.php?t=4df382eefa26f1f0d28d3a11aaf41add
QQP_2019 -> file.php?t=09939d92d2afcde64dbc06e057877e16
QQP_2018 -> file.php?t=01fafa951fb6c82e6e4bb491af8f1688
QQP_2017 -> file.php?t=059e79ffa462f6f51ed3aa1dbfa83a70
QQP_2016 -> file.php?t=c388a30cb3f4b4c4fa29302618ef5557
QQP_2015 -> file.php?t=4ecfa981c01e742a5461bf543a7b4108


In [10]:
# Base URL that download_files() prepends to each relative 'file.php?t=...' link.
# It must end with '/' because the two parts are joined by plain concatenation.
home = 'https://datos.profeco.gob.mx/datos_abiertos/'

In [13]:
# List the current working directory (presumably to check for the 'files' folder
# that download_files() writes into).
!ls

bin	   dev	  hadoop  lib32   lost+found  opt   run   srv  usr
boot	   etc	  home	  lib64   media       proc  sbin  sys  var
copyright  files  lib	  libx32  mnt	      root  snap  tmp  war_tweets.txt


In [15]:
%%time
# Download every archive listed in `pages`; %%time reports how long the cell took.
download_files(home, pages)

Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=ac07fbc4a9fd1d925384aff634f11071 -> QQP_2024
The folder './files/QQP_2024' already exists.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=f3270d227f2966e6138a3ed41a9bbfb7 -> QQP_2023
The folder './files/QQP_2023' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=af88f42c5cb82c6c35dd962b1ae69051 -> QQP_2022
The folder './files/QQP_2022' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=493b83b886f0266909d783fc8f776b11 -> QQP_2021
The folder './files/QQP_2021' has been created.
File downloaded successfully!
Trying to download https://datos.profeco.gob.mx/datos_abiertos/file.php?t=4df382eefa26f1f0d28d3a11aaf41add -> QQP_2020
The folder './files/QQP_2020' has been created.
File downloaded successfully!
Trying to downloa

In [16]:
# Show used/available space per filesystem in human-readable units.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
/dev/root        97G   44G   54G  45% /
tmpfs           7.9G     0  7.9G   0% /dev/shm
tmpfs           3.2G  1.2M  3.2G   1% /run
tmpfs           5.0M     0  5.0M   0% /run/lock
tmpfs           4.0M     0  4.0M   0% /sys/fs/cgroup
efivarfs         56K   24K   27K  48% /sys/firmware/efi/efivars
/dev/sda15      105M  6.1M   99M   6% /boot/efi
tmpfs           1.6G  4.0K  1.6G   1% /run/user/117
tmpfs           1.6G  4.0K  1.6G   1% /run/user/116
tmpfs           1.6G  4.0K  1.6G   1% /run/user/115
tmpfs           1.6G  4.0K  1.6G   1% /run/user/119
tmpfs           1.6G  4.0K  1.6G   1% /run/user/126
tmpfs           1.6G  4.0K  1.6G   1% /run/user/120


In [None]:
# Fetch the helper shell script (presumably the one that extracts the downloaded .rar files).
# NOTE(review): the URL points at the `main` branch, so the script executed in the
# next cell can change between runs — consider pinning it to a specific commit.
! wget "https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/unrar_files.sh" -O unrar_files.sh

In [27]:
%%time
# Make the downloaded script executable, then run it with `files` as its argument.
# NOTE(review): the recorded output shows it installing the `rar`/`unrar` packages,
# so it presumably needs root privileges and network access — confirm.
! chmod a+x unrar_files.sh
! ./unrar_files.sh files

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  rar unrar
0 upgraded, 2 newly installed, 0 to remove and 3 not upgraded.
Need to get 485 kB of archives.
After this operation, 1541 kB of additional disk space will be used.
Get:1 http://us-central1.gce.archive.ubuntu.com/ubuntu jammy/multiverse amd64 rar amd64 2:5.5.0-1.1 [341 kB]
Get:2 http://us-central1.gce.archive.ubuntu.com/ubuntu jammy/multiverse amd64 unrar amd64 1:6.1.5-1 [145 kB]
Fetched 485 kB in 0s (4596 kB/s)[0m[33m

7[0;23r8[1ASelecting previously unselected package rar.
(Reading database ... 224852 files and directories currently installed.)
Preparing to unpack .../rar_2%3a5.5.0-1.1_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m[30mProgress: [ 11%][49m[39m [######....................................................] 8Unpacking rar 

In [None]:
# Fetch the helper shell script (presumably the one that uploads the extracted files to HDFS).
# NOTE(review): unpinned `main` branch URL — the script run in the next cell can
# change between runs.
!wget "https://raw.githubusercontent.com/Anonymate054/MCD-BigData/main/Scrapper/notebooks/shells/hdfs_files.sh" -O hdfs_files.sh

In [None]:
%%time
# Make the script executable, then run it with the local `files` directory and
# `/user/QQP` as arguments (the recorded output shows CSVs being copied to that HDFS path).
!chmod a+x hdfs_files.sh
!./hdfs_files.sh files /user/QQP  

Copying files/QQP_2022/csv/052022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/062022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/032022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/272022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/452022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/262022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/072022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/162022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/082022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/372022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/182022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/242022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/022022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/292022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/502022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/352022.csv to HDFS at /user/QQP
Copying files/QQP_2022/csv/312022.csv to HDFS at /user/Q

In [None]:
# Show disk usage again (presumably to compare with the earlier reading).
!df -h