In [None]:
#| default_exp datasets

In [None]:
%load_ext autoreload
%autoreload 2

# ez_kaggle.datasets

> Helpers to check for, create, download, and push Kaggle datasets

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|export
import json, os
from pathlib import Path
from ez_kaggle.core import *
from ez_kaggle.competition import *


In [None]:
from fastcore.foundation import L

In [None]:
#| export
def ds_exists(dataset_slug, # Dataset slug (ie "zillow/zecon")
                   path='.'): # Scratch directory for the metadata probe; must not already contain dataset-metadata.json
    '''Returns True if fetching metadata for `dataset_slug` succeeds, False if the failure mentions 404.

    Any other exception is re-raised.  The metadata file left in `path` by a successful
    probe is deleted again before returning.
    '''
    metadata_file = Path(path)/'dataset-metadata.json'
    # Refuse to run where a metadata file already lives: the cleanup below would delete it.
    assert not metadata_file.exists(),'dataset-metadata.json already exists. Use a path that is not a kaggle dataset'
    try:
        kaggle_api = import_kaggle()
        kaggle_api.dataset_metadata(dataset_slug,path)
        # Presumably written by the call above -- remove it so `path` is left untouched.
        metadata_file.unlink()
    except Exception as err:
        # Only a "404" in the error text is treated as "dataset does not exist".
        if '404' not in str(err): raise err
        return False
    return True

In [None]:
# Smoke test (presumably hits the live kaggle API -- needs credentials/network):
# one slug is expected to be found, a made-up one is expected to be reported missing.
assert ds_exists('isaacflath/library-fastkaggle')
assert not ds_exists('not/real/dataset')

In [None]:
#| export
def mk_dataset(dataset_path, # Local path to create dataset in
               title, # Name of the dataset
               force=False, # Should it overwrite or error if exists?
               upload=True, # Should it upload and create on kaggle
               cfg_path='.', # Location handed to `get_config_values`
               **kwargs # Config dict to overwrite or replace fastkaggle.json
              ):
    '''Creates minimal dataset metadata needed to push new dataset to kaggle

    Initialises `dataset_path` with a `dataset-metadata.json`, fills in `title` and
    the slug part of `id`, and -- only when `upload` is true -- adds a placeholder
    `empty.txt` and creates the dataset on kaggle.

    Raises `FileExistsError` (from `mkdir`) when `dataset_path` already exists and
    `force` is false.
    '''
    # Return value is not needed here; the call is kept so that config loading
    # (and any error it raises) still happens exactly as before.
    get_config_values(cfg_path,**kwargs)
    dataset_path = Path(dataset_path)
    dataset_path.mkdir(exist_ok=force,parents=True)
    api = import_kaggle()
    api.dataset_initialize(dataset_path)
    md_path = dataset_path/'dataset-metadata.json'
    # Context managers so the metadata file is closed (and flushed) deterministically.
    with open(md_path) as f: md = json.load(f)
    md['title'] = title
    md['id'] = md['id'].replace('INSERT_SLUG_HERE',title)
    with open(md_path,'w') as f: json.dump(md,f)
    if upload:
        # The placeholder file and the remote creation both belong to the upload
        # step; with upload=False nothing is sent to kaggle.
        (dataset_path/'empty.txt').touch()
        api.dataset_create_new(str(dataset_path),public=True,dir_mode='zip',quiet=True)

In [None]:
# Create the metadata locally (upload=False), check title/id were filled in, then clean up.
mk_dataset('./testds','mytestds',force=True,upload=False)
path = Path('./testds/dataset-metadata.json')
md = json.load(open(path))
assert md['title'] == 'mytestds'
assert md['id'].endswith('/mytestds')
path.unlink()
# rmdir only succeeds on an empty directory, so the metadata file must be the only thing in it.
path.parent.rmdir()

Data package template written to: testds/dataset-metadata.json


In [None]:
#| export
def get_dataset(dataset_slug, # Dataset slug (ie "zillow/zecon")
                dataset_path, # Local path to download dataset to
                unzip=True, # Should it unzip after downloading?
                force=False # Should it overwrite or error if dataset_path exists?
               ):
    '''Fetches a kaggle dataset's metadata and files into `dataset_path`.

    With `unzip` the downloaded `<slug-name>.zip` is extracted into `dataset_path`
    and the archive is removed afterwards.  Unless `force` is set, an existing
    `dataset_path` triggers an `AssertionError`.
    '''
    assert force or not Path(dataset_path).exists()
    kaggle_api = import_kaggle()
    destination = str(dataset_path)
    kaggle_api.dataset_metadata(dataset_slug,destination)
    kaggle_api.dataset_download_files(dataset_slug,destination)
    if not unzip: return
    import zipfile
    # The archive is expected under the last component of the slug.
    archive = Path(dataset_path)/f"{dataset_slug.split('/')[-1]}.zip"
    with zipfile.ZipFile(archive, 'r') as bundle:
        bundle.extractall(Path(dataset_path))
    archive.unlink()

In [None]:
# Download a dataset (presumably from the live kaggle API), check the extracted contents, then clean up.
dataset_path = Path('./data-science-job-salaries')
get_dataset('ruchi798/data-science-job-salaries',dataset_path, force=True)

files = os.listdir(dataset_path)

# After unzip only the metadata file and the extracted csv should remain (the zip is deleted).
assert L(files).sorted() == ['dataset-metadata.json', 'ds_salaries.csv']

# Remove every file first: rmdir only succeeds on an empty directory.
for f in Path(dataset_path).ls(): f.unlink()
Path(dataset_path).rmdir()

In [None]:
#| export
def push_dataset(dataset_path, # Local path where dataset is stored 
                 version_comment, # Comment associated with this dataset update
                quiet=True # Forwarded unchanged as the kaggle client's `quiet` flag
                ):
    '''Uploads the contents of `dataset_path` as a new version of an existing kaggle dataset.

    `dataset_path` must contain the dataset metadata file.  The call is made with
    `dir_mode='zip'`.
    '''
    kaggle_api = import_kaggle()
    upload_opts = dict(dir_mode='zip',quiet=quiet)
    kaggle_api.dataset_create_version(str(dataset_path),version_comment,**upload_opts)

## Export -

In [None]:
#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()