# Herbarium 2022 - FGVC9

We will begin by downloading files one by one from the competition using the Kaggle API.

Reference: https://technowhisp.com/kaggle-api-python-documentation/

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and call authenticate() on it before any other API call.
api = KaggleApi()
api.authenticate()

# get the competition
# comp = api.competitions_list(search='herbarium-2022-fgvc9')[0]
# print(comp.ref,comp.reward,comp.userRank,sep=',')

# Competition reference passed to the file-listing and download calls in this notebook.
# NOTE(review): the notebook title and the commented-out search above mention Herbarium 2022,
# but this value points at the Sorghum competition -- confirm which one is intended.
comp = 'sorghum-id-fgvc-9'

Format of a single file entry returned by the API

In [None]:
# Ask the API for the competition's data files and show the first entry to inspect its fields.
filelist = api.competitions_data_list_files(comp)
filelist[0]

{'nameNullable': 'test/1001197278.png',
 'descriptionNullable': None,
 'urlNullable': 'https://www.kaggle.com/',
 'ref': 'test/1001197278.png',
 'name': 'test/1001197278.png',
 'hasName': True,
 'description': '',
 'hasDescription': False,
 'totalBytes': 1741711,
 'url': 'https://www.kaggle.com/',
 'hasUrl': True,
 'creationDate': '2022-03-24T19:31:38.563247Z'}

In [None]:
from pathlib import Path
import zipfile

# Local directory (relative to the working directory) under which competition files are stored.
dataf = Path('data')

def unzip_if_zipped(fpath):
    """Extract ``<fpath>.zip`` into its own directory, then delete the archive.

    The ``.zip`` suffix is appended to ``fpath``. When no such archive exists
    the call is a silent no-op.

    Args:
        fpath: Path (``str`` or ``Path``) of the file as it is named once unzipped.

    Returns:
        None.
    """
    archive = Path(f'{fpath}.zip')

    try:
        with zipfile.ZipFile(archive, 'r') as bundle:
            # Unpack into the directory that holds the archive.
            bundle.extractall(archive.parent)

        # Extraction finished; the archive itself is no longer needed.
        archive.unlink()
    except FileNotFoundError:
        # Nothing to unpack.
        pass

# Annoyingly, the kaggle API doesn't offer auto-unzip for competition downloads when
# the file is served zipped: https://github.com/Kaggle/kaggle-api/pull/231
# so the archive is unpacked here right after the download.
def downloadf(dataf, comp, fref):
    """Fetch one competition file into ``dataf`` unless it is already present.

    Args:
        dataf: Local data directory (a ``Path``).
        comp: Competition reference handed to the Kaggle API.
        fref: File reference inside the competition, e.g. ``'test/1001197278.png'``.

    Returns:
        True when a download was performed, False when the file already
        existed locally and the download was skipped.
    """
    target = dataf / fref
    if target.exists():
        return False

    # Mirror the file's sub-directory layout below the data directory.
    dest_dir = dataf / Path(fref).parent
    api.competition_download_file(comp, fref, path=dest_dir)
    unzip_if_zipped(target)
    return True

# Example: fetch a single test image. The return value is True if it was downloaded
# and False if it was already on disk.
downloadf(dataf, comp, 'test/1001197278.png')

False

Download test metadata

In [None]:
# downloadf(dataf, comp, 'test_metadata.json')

False

In [None]:
# Fetch the cultivar mapping CSV (skipped when it is already on disk).
downloadf(dataf, comp, 'train_cultivar_mapping.csv')

False

Load `train_cultivar_mapping.csv` into a pandas DataFrame

In [None]:
import pandas as pd

# Read the downloaded cultivar mapping into a DataFrame.
mapping_csv = dataf / 'train_cultivar_mapping.csv'
with mapping_csv.open() as mapping_file:
    train_metadata = pd.read_csv(mapping_file)

# Leave the frame as the last expression so the notebook renders it.
train_metadata

Unnamed: 0,image,cultivar
0,2017-06-16__12-24-20-930.png,PI_257599
1,2017-06-02__16-48-57-866.png,PI_154987
2,2017-06-12__13-18-07-707.png,PI_92270
3,2017-06-22__13-18-06-841.png,PI_152651
4,2017-06-26__12-56-48-642.png,PI_176766
...,...,...
22189,2017-06-16__12-27-16-266.png,PI_170787
22190,2017-06-28__11-19-57-454.png,PI_156393
22191,2017-06-28__10-20-32-417.png,PI_152923
22192,2017-06-28__12-47-02-714.png,PI_257599


Download the training images (**⚠️ WARNING: VERY LARGE! ⚠️**) in no particular order; the download is capped at the first `test_m` entries of the mapping

In [None]:
from kaggle.rest import ApiException 
import time

# size of our test data set
# we're imposing a limit for now since the full data set is way too large to be fit on our computer
test_m = 400

# Only the first `test_m` rows of the mapping are considered for download.
download_plan = train_metadata.head(test_m)

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# growing a DataFrame one `.loc` assignment at a time re-allocates on every row.
subset_rows = []

try:
    for i, (imgref, cultivar) in enumerate(
        zip(download_plan['image'], download_plan['cultivar'])
    ):
        print(f'Downloading image #{i}')
        # that's the file structure they went for for this competition
        if not downloadf(dataf, comp, f"train_images/{imgref}"):
            # the file was already on disk, so nothing was downloaded
            print('skipped')

        # add this entry to the subset metadata
        subset_rows.append({'image': imgref, 'cultivar': cultivar})
finally:
    # Built even when the loop is interrupted (e.g. KeyboardInterrupt), so the
    # entries handled so far are still available to the following cells.
    train_metadata_subset = pd.DataFrame(subset_rows, columns=['image', 'cultivar'])

Downloading image #0
skipped
Downloading image #1
skipped
Downloading image #2
skipped
Downloading image #3
skipped
Downloading image #4
skipped
Downloading image #5
skipped
Downloading image #6
skipped
Downloading image #7
skipped
Downloading image #8
skipped
Downloading image #9
skipped
Downloading image #10
skipped
Downloading image #11
skipped
Downloading image #12
skipped
Downloading image #13
skipped
Downloading image #14
skipped
Downloading image #15
skipped
Downloading image #16
skipped
Downloading image #17
skipped
Downloading image #18
skipped
Downloading image #19
skipped
Downloading image #20
skipped
Downloading image #21
skipped
Downloading image #22
skipped
Downloading image #23
skipped
Downloading image #24
skipped
Downloading image #25
skipped
Downloading image #26
skipped
Downloading image #27
skipped
Downloading image #28
skipped
Downloading image #29
skipped
Downloading image #30
skipped
Downloading image #31
skipped
Downloading image #32
skipped
Downloading image #3

KeyboardInterrupt: 

In [None]:
# Show the collected subset, then write it into the data directory as CSV
# (index=False leaves the row index out of the file).
print(train_metadata_subset)
train_metadata_subset.to_csv(dataf / 'train_metadata_subset.csv', index=False)

                            image   cultivar
0    2017-06-16__12-24-20-930.png  PI_257599
1    2017-06-02__16-48-57-866.png  PI_154987
2    2017-06-12__13-18-07-707.png   PI_92270
3    2017-06-22__13-18-06-841.png  PI_152651
4    2017-06-26__12-56-48-642.png  PI_176766
..                            ...        ...
160  2017-06-01__10-41-04-558.png  PI_152751
161  2017-06-21__11-34-51-977.png  PI_251672
162  2017-06-02__18-15-54-053.png   PI_52606
163  2017-06-18__14-41-59-638.png  PI_152771
164  2017-06-19__15-57-59-480.png  PI_152771

[165 rows x 2 columns]


In [None]:
# Presumably pushes the current directory ('.') to Kaggle as the kernel 'kevinlee333/rookie-draft-0'.
# NOTE(review): confirm that kernels_push accepts these arguments in the installed kaggle package version.
api.kernels_push('kevinlee333/rookie-draft-0', path='.', metadata=True)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d841b2e3-7f2f-42e6-ae8e-6cea1c0a3631' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>