# 1. Bibliotecas

In [0]:
%run /Workspace/Users/matheusfelipebasso@hotmail.com/Datum/utils/Now

In [0]:
import os
import time
import tempfile

---
# 2. Classe

In [0]:
class GetKaggleData(Now):
    """Download a Kaggle dataset into a DBFS "bronze" directory.

    The Kaggle credentials are read from the ``Datum`` Databricks secret scope
    and exported through the ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` environment
    variables. ``run`` downloads the dataset when the bronze directory is empty
    or when Kaggle reports a newer ``lastUpdated`` than the newest stored file.

    NOTE(review): ``Now`` (providing ``log_message``), ``datetime`` and
    ``dbutils`` are not defined in this cell; they are presumably supplied by
    the ``%run``-ed ``Now`` notebook and the Databricks runtime - confirm.
    """

    # Forwarded as ``show=`` to every ``log_message`` call.
    _SHOW_LOG = True
    # Class-level fallback only; ``__init__`` refreshes it per instance so the
    # reported execution time does not include idle time since this class
    # definition was evaluated.
    _START_TIME = datetime.now()

    # ######################################################################################################################
    def __init__(self, username: str = 'DatumKaggleUsername', 
                 key: str = 'DatumKaggleKey', 
                 dataset: str = 'olistbr/brazilian-ecommerce'):
        """Authenticate against Kaggle and validate the requested dataset.

        Args:
            username: Name of the secret (scope ``Datum``) holding the Kaggle username.
            key: Name of the secret (scope ``Datum``) holding the Kaggle API key.
            dataset: Kaggle dataset reference in ``owner/slug`` form.

        Raises:
            ValueError: If no dataset with exactly this reference is found.
        """
        # Start the clock for the execution-time report printed by ``run``.
        self._START_TIME = datetime.now()

        # ------------------------------------------------------------------------------------------------------------------
        print('=' * 120)
        print(f"{' '*20} ___   _ _____ _   _ __  __   _  __  _   ___  ___ _    ___   ___   _ _____ _   ")
        print(f"{' '*20}|   \\ /_|_   _| | | |  \\/  | | |/ / /_\\ / __|/ __| |  | __| |   \\ /_|_   _/_\\  ")
        print(f"{' '*20}| |) / _ \\| | | |_| | |\\/| | | ' < / _ | (_ | (_ | |__| _|  | |) / _ \\| |/ _ \\ ")
        print(f"{' '*20}|___/_/ \\_|_|  \\___/|_|  |_| |_|\\_/_/ \\_\\___|\\___|____|___| |___/_/ \\_|_/_/ \\_\\     version 0.0.2")
        print('=' * 120)
        print('\n')
        # ------------------------------------------------------------------------------------------------------------------
        # Export the credentials before the Kaggle client is imported/authenticated in ``_set_api``.
        self._user = dbutils.secrets.get(scope='Datum', key=username)
        os.environ['KAGGLE_USERNAME'] = self._user
        # ------------------------------------------------------------------------------------------------------------------
        self._key = dbutils.secrets.get(scope='Datum', key=key)
        os.environ['KAGGLE_KEY'] = self._key
        # ------------------------------------------------------------------------------------------------------------------
        self._api = self._set_api()
        self._dataset = self._validate_dataset(dataset)
        # ------------------------------------------------------------------------------------------------------------------
        # Only the secret *names* are echoed here, never the secret values.
        print(f' Verificacao de Parâmetros Informados '.center(120,'='))
        print(f'OK | Username: {username}')
        print(f'OK | Key: {key}')
        print(f'OK | Dataset: {dataset}')
        print(f'OK | API')
        print('=' * 120)
    
    # ######################################################################################################################
    def _set_api(self):
        """Create and authenticate a ``KaggleApi`` client and return it."""

        # Imported here rather than at the top of the notebook so that the
        # credentials set in ``__init__`` are already in the environment.
        # NOTE(review): the kaggle package appears to authenticate on import - confirm.
        from kaggle.api.kaggle_api_extended import KaggleApi

        self.log_message(show=self._SHOW_LOG, message='Autenticando API', start=True)

        k_api = KaggleApi()
        k_api.authenticate()

        self.log_message(show=self._SHOW_LOG, message='Autenticando API | OK', end=True)

        return k_api
    
    # ######################################################################################################################
    def _validate_dataset(self, dataset):
        """Return the Kaggle dataset object whose ``ref`` equals ``dataset``.

        Args:
            dataset: Dataset reference to look up; also used as the search term.

        Raises:
            ValueError: If the search results contain no exact ``ref`` match.
        """

        # ------------------------------------------------------------------------------------------------------------------
        self.log_message(show=self._SHOW_LOG, message=f'Autenticando Dataset | {dataset}', start=True)

        # The search may return similar datasets, so only an exact ``ref`` match is accepted.
        k_data_sets = self._api.dataset_list(search=dataset) or []
        for k_dataset in k_data_sets:
            if k_dataset.ref == dataset:
                self.log_message(show=self._SHOW_LOG, message=f'Autenticando Dataset | {dataset} | OK', end=True)
                return k_dataset
        
        raise ValueError(f'Dataset "{dataset}" não localizado!')

    # ######################################################################################################################
    @staticmethod
    def _check_dirs(dbfs_location: str, auto_create_dir: bool) -> tuple:
        """Derive and verify the bronze directory and its "OldFiles" sibling.

        ``dbfs:/a/b/bronze`` becomes ``/dbfs/a/b/bronze`` and the second
        directory is ``/dbfs/a/bOldFiles/bronze``.

        Args:
            dbfs_location: Target location; its first path component must
                contain ``dbfs`` and its last one must contain ``bronze``
                (case-insensitive). A trailing ``/`` is ignored.
            auto_create_dir: Whether a missing directory may be created.

        Returns:
            A ``(bronze_dir, bronze_old_files_dir)`` tuple of path strings.

        Raises:
            ValueError: If the location does not match the expected layout, a
                directory is missing and may not be created, or creation fails.
        """

        # ------------------------------------------------------------------------------------------------------------------
        # A trailing slash would leave an empty last component and fail the bronze check below.
        _location_parts = dbfs_location.rstrip('/').split('/')
        if 'dbfs' not in _location_parts[0]:
            raise ValueError(f'dbfs não localizado no dir {dbfs_location}.')
        _bronze_dir = "/".join(['/dbfs'] + _location_parts[1:])
        # ------------------------------------------------------------------------------------------------------------------        
        _bronze_parts = _bronze_dir.split('/')
        if 'bronze' not in str(_bronze_parts[-1]).lower():
            raise ValueError(f'bronze não localizado ao final do dir {dbfs_location}')
        # Keep the leading empty component ([:-2], not [1:-2]) so this path is
        # absolute like ``_bronze_dir`` instead of a relative "dbfs/..." path.
        _bronze_dir_old_files = '/'.join(
            _bronze_parts[:-2] + [_bronze_parts[-2] + 'OldFiles', _bronze_parts[-1]])
        # ------------------------------------------------------------------------------------------------------------------        
        for k_d_dir in [_bronze_dir, _bronze_dir_old_files]:
            try:
                dbutils.fs.ls(k_d_dir)
                print(f'OK | DIR: "{k_d_dir}"')
            except Exception as e:
                # Only a missing directory is recoverable; anything else is re-raised untouched.
                if 'FileNotFoundException' not in str(e):
                    raise

                if not auto_create_dir:
                    raise ValueError('Dir não localizado, sem autorização para criação automática.')

                print('Dir não localizado, criando automaticamente.')
                if dbutils.fs.mkdirs(k_d_dir):
                    print(f'Dir {k_d_dir} criado com sucesso!')
                else:
                    raise ValueError(f'Erro ao criar o dir {k_d_dir}')
        # ------------------------------------------------------------------------------------------------------------------

        return _bronze_dir, _bronze_dir_old_files
    
    # ######################################################################################################################
    def _download_dataset(self, dbfs_path: str) -> int:
        """Download and unzip the dataset, then copy every file to ``dbfs_path``.

        Args:
            dbfs_path: Destination directory passed to ``dbutils.fs.cp``.

        Returns:
            The number of entries found in the download directory and copied.
        """

        # The download goes to a local temporary directory that is removed when the block exits.
        with tempfile.TemporaryDirectory() as tempdir:
            # Use the explicit ``ref`` (already relied upon in ``_validate_dataset``)
            # instead of depending on the dataset object's string representation.
            self._api.dataset_download_files(self._dataset.ref, path=tempdir, unzip=True)

            files = os.listdir(tempdir)

            for n, file in enumerate(files, start=1):
                tempdir_file = os.path.join(tempdir, file)
                dbfs_dir_file = os.path.join(dbfs_path, file)
                # The ``file:`` scheme marks the source as a local (driver) path.
                dbutils.fs.cp(f"file:{tempdir_file}",dbfs_dir_file)
                print(f"{n} | Arquivo {file} salvo | {dbfs_dir_file} | OK")

            return len(files)

    # ######################################################################################################################
    def run(self, dbfs_location: str = 'dbfs:/FileStore/Datum/KaggleOlistBrData/bronze', auto_create_dir: bool = True):
        """Make sure the bronze directory holds the current dataset files.

        Args:
            dbfs_location: Bronze location, validated by ``_check_dirs``.
            auto_create_dir: Whether missing directories may be created.

        Returns:
            ``True`` when the dataset was downloaded (directory was empty or
            Kaggle had a newer version), ``False`` when nothing was downloaded.
        """

        print('\n')
        self.log_message(show=self._SHOW_LOG, message=f'Início Execução', start=True, sep='=')
        print(f'{"="*120}\n')
        
        # ------------------------------------------------------------------------------------------------------------------
        self.log_message(show=self._SHOW_LOG, message=f'Verificando dirs', start=True)
        b_dir, b_old_dir = self._check_dirs(dbfs_location=dbfs_location, auto_create_dir=auto_create_dir)
        self.log_message(show=self._SHOW_LOG, message=f'Verificando dirs | OK', end=True)
        # ------------------------------------------------------------------------------------------------------------------

        self.log_message(show=self._SHOW_LOG, message=f'Verificando arquivos', start=True)

        # One listing serves both the emptiness check and the freshness check.
        bronze_files = dbutils.fs.ls(b_dir)
        n_files = len(bronze_files)
        _return = True

        if n_files == 0:
            self.log_message(show=self._SHOW_LOG, message=f'Verificando arquivos | NOT OK | Sem dados')
            n_files = self._download_dataset(dbfs_path=b_dir)
            self.log_message(show=self._SHOW_LOG, message=f'Verificando arquivos | OK', end=True)

        else:
            self.log_message(show=self._SHOW_LOG, message=f'Verificando última atualização')
            # Newest modification time among the files of the *requested* bronze
            # directory (previously a hard-coded default path was inspected here).
            # ``modificationTime`` is divided by 1000, i.e. treated as milliseconds.
            # NOTE(review): this yields a naive local datetime; the comparison assumes
            # ``lastUpdated`` is naive and in the same timezone - confirm.
            date_last_updated = max(
                datetime.fromtimestamp(file.modificationTime / 1000) for file in bronze_files)
            if self._dataset.lastUpdated > date_last_updated:
                self.log_message(show=self._SHOW_LOG, message=f'Novos registros')
                n_files = self._download_dataset(dbfs_path=b_dir)
                self.log_message(show=self._SHOW_LOG, message=f'Novos registros | OK')
            else:
                self.log_message(show=self._SHOW_LOG, message=f'Verificando última atualização | OK')
                self.log_message(show=self._SHOW_LOG, message=f'Verificando arquivos | OK', end=True)
                _return = False

        print(f'\n{"="*120}')
        self.log_message(show=self._SHOW_LOG, message=f'Final Execução', end=True, sep='=')

        # ``total_seconds`` keeps whole days, which ``timedelta.seconds`` would drop.
        elapsed_seconds = max(0, int((datetime.now() - self._START_TIME).total_seconds()))
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)

        print('\n')
        print(": Report :".center(120, '.'))
        print(f"Tempo de Execução: {hours:02d}:{minutes:02d}:{seconds:02d}")
        print(f"Arquivos Disponíveis na Bronze: {n_files}")
        print('.'*120)

        return _return


In [0]:
# Run the ingestion with the default secrets, dataset and bronze location.
# run() returns True when files were downloaded and False when nothing was downloaded.
val_return = GetKaggleData().run()

                     ___   _ _____ _   _ __  __   _  __  _   ___  ___ _    ___   ___   _ _____ _   
                    |   \ /_|_   _| | | |  \/  | | |/ / /_\ / __|/ __| |  | __| |   \ /_|_   _/_\  
                    | |) / _ \| | | |_| | |\/| | | ' < / _ | (_ | (_ | |__| _|  | |) / _ \| |/ _ \ 
                    |___/_/ \_|_|  \___/|_|  |_| |_|\_/_/ \_\___|\___|____|___| |___/_/ \_|_/_/ \_\     version 0.0.2


------------------------------------------------------------------------------------------------------------------------
2024-04-14T17:31:34 | Autenticando API |--------------------------------------------------------------------------------
2024-04-14T17:31:34 | Autenticando API | OK |---------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------

In [0]:
# When run() reported that nothing was downloaded, exit the notebook with an explanatory message.
if not val_return:
    dbutils.notebook.exit("Não há dados novos")