In [1]:
import pandas as pd
import json
from pathlib import Path
from IPython.display import display

In [168]:
# The project root is the parent of the notebook's working directory;
# the raw dataset files are read from <root>/data/raw.
root = Path.cwd().parent
data = root.joinpath('data', 'raw')

In [175]:
def parse_json(filepath, istest=False, categories=None):
    """Load one dataset JSON file into pandas DataFrames.

    Parameters
    ----------
    filepath : str or path-like
        JSON file with top-level ``'info'`` and ``'images'`` sections and,
        for non-test files, an ``'annotations'`` section (plus
        ``'categories'`` when requested).
    istest : bool, default False
        When true only ``info`` and ``images`` are read and returned.
    categories : object, default None
        Truthy flag: when set (and ``istest`` is false) the ``'categories'``
        section is parsed and returned as a fourth frame.

    Returns
    -------
    tuple of DataFrame
        ``(info, images)`` when ``istest`` is true,
        ``(info, images, annotations, categories)`` when ``categories`` is
        truthy, otherwise ``(info, images, annotations)``.
        Every frame except ``info`` is indexed by its ``'id'`` field.

    Raises
    ------
    KeyError
        If a required top-level section (or an ``'id'`` field) is missing.
    """
    # Read as UTF-8 explicitly: the files contain non-ASCII text (e.g. rights
    # holder names) and the platform default encoding is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        res = json.load(f)

    info = pd.DataFrame.from_dict(res['info'], orient='index')
    images = pd.DataFrame(res['images']).set_index('id')
    if istest:
        return info, images

    annotations = pd.DataFrame(res['annotations']).set_index('id')
    if categories:
        # Separate local name so the boolean-like flag is not rebound to a frame.
        category_frame = pd.DataFrame(res['categories']).set_index('id')
        return info, images, annotations, category_frame
    return info, images, annotations

def join_dataframes(images, annotations, categories, locations=None):
    """Join category, annotation, image (and optional location) tables for Fungi.

    Parameters
    ----------
    images : DataFrame
        Image metadata indexed by image id.
    annotations : DataFrame
        Rows with ``'image_id'`` and ``'category_id'`` columns.
    categories : DataFrame
        Category table with a ``'supercategory'`` column. It is joined on its
        *index*, so the index must hold the category id.
    locations : DataFrame, optional
        Per-image location metadata indexed by image id; merged in when given.

    Returns
    -------
    DataFrame
        Inner join of the inputs restricted to ``supercategory == 'Fungi'``,
        with bookkeeping columns removed when present.
    """
    fungi = categories[categories['supercategory'] == 'Fungi']
    fungi = fungi.rename(columns={'id': 'category_id'})

    # Build the join once; the location table is simply one more optional step.
    df = fungi.merge(annotations, right_on='category_id', left_index=True, how='inner')
    df = df.merge(images, left_on='image_id', right_index=True, how='inner')
    if locations is not None:
        df = df.merge(locations, left_on='image_id', right_index=True, how='inner')

    # Not every dataset year carries all of these columns (e.g. 'valid' and
    # 'user_id' come from the location table), so drop whichever are present.
    # This replaces a try/except whose `return` inside `finally` swallowed
    # errors and could hand back the frame with nothing dropped.
    unwanted = ['supercategory', 'kingdom', 'image_id', 'valid', 'license', 'rights_holder', 'user_id']
    return df.drop(columns=unwanted, errors='ignore')

## Parse JSON Data

In [170]:
# --- 2018 release: train / validation / test splits plus per-image locations ---
tinfo2018, timages2018, tanno2018 = parse_json(data / '2018' / 'train2018.json')
vinfo2018, vimages2018, vanno2018 = parse_json(data / '2018' / 'val2018.json')
testinfo2018, testimages2018 = parse_json(data / '2018' / 'test2018.json', istest=True)
tloc = pd.read_json(data / '2018' / 'inat2018_locations' / 'train2018_locations.json').set_index('id')
vloc = pd.read_json(data / '2018' / 'inat2018_locations' / 'val2018_locations.json').set_index('id')

# The 2018 category table is read from its own file. It keeps 'id' as a regular
# column and the default positional index.
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding; build the frame once the file handle is closed.
with open(data / '2018' / 'categories.json', 'r', encoding='utf-8') as f:
    res = json.load(f)
cats = pd.DataFrame(res)

# --- 2019 release: training split only, categories embedded in the file ---
tinfo2019, timages2019, tanno2019, tcat2019 = parse_json(data / '2019' / 'train2019.json', categories=True)

# --- 2021 release: categories are embedded in the train/val files ---
tinfo2021, timages2021, tanno2021, tcat2021 = parse_json(data / '2021' / 'train.json', categories=True)
vinfo2021, vimages2021, vanno2021, vcat2021 = parse_json(data / '2021' / 'val.json', categories=True)
testinfo2021, testimages2021 = parse_json(data / '2021' / 'public_test.json', istest=True)

## 2018 Dataset

In [60]:
# Preview the 2018 inputs in order: locations, images, annotations, categories.
display(tloc, timages2018, tanno2018, cats)

Unnamed: 0_level_0,loc_uncert,date,valid,user_id,lat,date_c,lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,2016-05-20,True,15723,38.913837,0.3819,-76.942313
2,42,2015-01-16,True,42760,50.578740,0.0412,6.374398
3,12,2016-09-08,True,121416,49.269860,0.6868,-123.260819
4,500,2011-02-05,True,16823,35.131031,0.0962,-106.682600
5,173,2015-05-15,True,40730,19.503475,0.3681,-99.142810
...,...,...,...,...,...,...,...
437509,91,2016-06-01,True,178946,46.232355,0.4148,-123.398609
437510,4,2015-07-13,True,22589,32.559797,0.5302,-97.108282
437511,124,2016-02-14,True,54888,20.870175,0.1209,-87.087176
437512,0,2013-06-10,True,11792,39.599833,0.4396,-82.630000


Unnamed: 0_level_0,license,file_name,rights_holder,height,width
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,train_val2018/Plantae/7477/3b60c9486db1d2ee875...,Jonathan Carpenter,600,800
2,3,train_val2018/Aves/2820/22c733d9199c35d2578232...,Marion Zöller,600,800
3,3,train_val2018/Aves/2757/e5641ce8fd794bf149d49f...,phylocode,640,800
4,1,train_val2018/Mammalia/4219/60c8365f55c2f4e790...,J. N. Stuart,533,800
5,3,train_val2018/Aves/3942/872fef47a85beb90efc301...,Alfonso Gutiérrez Aldana,533,800
...,...,...,...,...,...
437509,3,train_val2018/Plantae/6586/a5b14994c80cdfa5a49...,Kathleen Sayce,800,600
437510,3,train_val2018/Plantae/7444/bbfb68b25f5debd876b...,Sam Kieschnick,800,600
437511,3,train_val2018/Aves/3555/68e5cabe14b6d767b25824...,Luis Guillermo,452,800
437512,3,train_val2018/Plantae/7245/df6b20139f4a14e31e9...,kylejones,800,597


Unnamed: 0_level_0,image_id,category_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,7477
2,2,2820
3,3,2757
4,4,4219
5,5,3942
...,...,...
437509,437509,6586
437510,437510,7444
437511,437511,3555
437512,437512,7245


Unnamed: 0,kingdom,name,family,supercategory,phylum,id,order,genus,class
0,Animalia,Hermodice carunculata,Amphinomidae,Animalia,Annelida,0,Phyllodocida,Hermodice,Polychaeta
1,Animalia,Phragmatopoma californica,Sabellariidae,Animalia,Annelida,1,Sabellida,Phragmatopoma,Polychaeta
2,Animalia,Eudistylia vancouveri,Sabellidae,Animalia,Annelida,2,Sabellida,Eudistylia,Polychaeta
3,Animalia,Galeolaria hystrix,Serpulidae,Animalia,Annelida,3,Sabellida,Galeolaria,Polychaeta
4,Animalia,Serpula columbiana,Serpulidae,Animalia,Annelida,4,Sabellida,Serpula,Polychaeta
...,...,...,...,...,...,...,...,...,...
8137,Plantae,Neogastroclonium subarticulatum,Champiaceae,Plantae,Rhodophyta,8137,Rhodymeniales,Neogastroclonium,Florideophyceae
8138,Protozoa,Lycogala epidendrum,Tubiferaceae,Protozoa,Mycetozoa,8138,Liceales,Lycogala,Myxomycetes
8139,Protozoa,Fuligo septica,Physaraceae,Protozoa,Mycetozoa,8139,Physarales,Fuligo,Myxomycetes
8140,Protozoa,Leocarpus fragilis,Physaraceae,Protozoa,Mycetozoa,8140,Physarales,Leocarpus,Myxomycetes


In [177]:
# Join the 2018 validation and training tables (Fungi only) and tag each row
# with the split it came from.
val = join_dataframes(vimages2018, vanno2018, cats, vloc)
val['set'] = "validation"

train = join_dataframes(timages2018, tanno2018, cats, tloc)
train['set'] = 'train'
df = pd.concat([train, val]).reset_index(drop=True)

# Species names become underscore-separated so they can be used in directory names.
df['name'] = df['name'].str.replace(' ', '_', regex=False)
# Target directory per species: Fungi_<phylum>_<class>_<order>_<family>_<name>.
df['new_dirs'] = "Fungi_" + df['phylum'] + '_' + df['class'] + '_' + df['order'] + '_' + df['family'] + '_' + df['name']
df['new_paths'] = df['new_dirs'] + '/' + df['file_name'].str.split('/').str[-1]

# Disabled one-off step that moved the image files into the new directories;
# kept for reference only.
# target = data / '2018' / 'train_val'
# target.mkdir(exist_ok=True)
# for group in df.groupby('new_dirs'):
#     target_dir = target / group[0]
#     target_dir.mkdir(exist_ok=True)
#     for file in group[1]['file_name']:
#         file = data / '2018' / Path(file)
#         parent_dir = file.parent
#         file.rename(target_dir / group[1]['new_paths'])
#     parent_dir.rmdir()

# Keep only the bare file name; the directory part is carried by image_dir_name.
df.loc[:, 'file_name'] = df['file_name'].str.split('/').str[-1]
df = df.drop(['category_id', 'new_paths'], axis=1)
# 'name' is underscore-separated at this point, so split on '_' (splitting on a
# space returned the whole binomial and duplicated the genus in image_dir_name).
df['specific_epithet'] = df['name'].str.split('_').str[-1]
df['image_dir_name'] = "Fungi_" + df['phylum'] + '_' + df['class'] + '_' + df['order'] + '_' + df['family'] + '_' + df['genus'] + '_' + df['specific_epithet']
df = df.rename(columns={'lon': 'longitude', 'lat': 'latitude', 'loc_uncert': 'location_uncertainty'})
# NOTE(review): no 'dataset' column is set for these rows, whereas the 2021 rows
# get dataset='2021' — confirm whether 2018 rows should be labelled as well.
# df.to_csv(data / '2018' / 'train_val.csv', index=False)

In [180]:
# This just fixes a screw-up made when joining the 2018 and 2021 datasets:
# image files left directly in data/train are moved into the directory named by
# their 'new_dirs' entry. Files with no matching row are printed and left alone.
# `train_dir` replaces the former variable `all`, which shadowed the builtin.
train_dir = Path.cwd().parent / 'data' / 'train'

# Build the file-name -> directory lookup once instead of scanning the frame
# for every file; the first row wins when a file name occurs more than once.
dir_by_file = (
    df.dropna(subset=['new_dirs'])
      .drop_duplicates('file_name')
      .set_index('file_name')['new_dirs']
      .to_dict()
)

# Snapshot the listing because directories are created inside it while looping.
for f in list(train_dir.iterdir()):
    if not f.is_file():
        continue
    target_name = dir_by_file.get(f.name)
    if target_name is None:
        print(f.name)
        continue
    (train_dir / target_name).mkdir(exist_ok=True)
    f.rename(train_dir / target_name / f.name)

## 2021 Dataset

In [138]:
# Preview the 2021 training inputs in order: images, annotations, categories.
display(timages2021, tanno2021, tcat2021)

Unnamed: 0_level_0,width,height,file_name,license,rights_holder,date,latitude,longitude,location_uncertainty
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,500,500,train/02912_Animalia_Chordata_Actinopterygii_S...,0,Ken-ichi Ueda,2010-07-14 20:19:00+00:00,43.83486,-71.22231,77.0
1,500,333,train/04831_Animalia_Chordata_Mammalia_Rodenti...,0,Michelle S. Koo,2010-07-06 22:17:00+00:00,38.86995,-120.19326,
2,500,375,train/05015_Animalia_Chordata_Reptilia_Squamat...,1,105615097470186309865,2009-05-04 00:00:00+00:00,35.14218,-116.10415,
3,500,375,train/05163_Animalia_Chordata_Reptilia_Testudi...,1,biosam,2009-05-04 00:00:00+00:00,35.09829,-116.02979,28734.0
4,500,375,train/04983_Animalia_Chordata_Reptilia_Squamat...,1,biosam,2009-05-05 00:00:00+00:00,35.01099,-115.47336,
...,...,...,...,...,...,...,...,...,...
2686838,500,281,train/08843_Plantae_Tracheophyta_Magnoliopsida...,1,Marilyn falcón Llacctas,2017-01-07 23:08:00+00:00,-12.16751,-76.92011,6578.0
2686839,500,335,train/04661_Animalia_Chordata_Mammalia_Artioda...,1,Glenn Caspers,2018-09-23 14:47:00+00:00,37.70054,-123.01332,2649.0
2686840,500,354,train/02617_Animalia_Arthropoda_Insecta_Orthop...,1,Greg Holland,2011-09-05 07:20:00+00:00,40.04478,-105.18851,244.0
2686841,500,335,train/04438_Animalia_Chordata_Aves_Piciformes_...,1,Glenn Caspers,2018-09-16 13:26:00+00:00,37.86510,-119.53833,9798.0


Unnamed: 0_level_0,image_id,category_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2912
1,1,4831
2,2,5015
3,3,5163
4,4,4983
...,...,...
2686838,2686838,8843
2686839,2686839,4661
2686840,2686840,2617
2686841,2686841,4438


Unnamed: 0_level_0,name,common_name,supercategory,kingdom,phylum,class,order,family,genus,specific_epithet,image_dir_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Lumbricus terrestris,Common Earthworm,Animalia,Animalia,Annelida,Clitellata,Haplotaxida,Lumbricidae,Lumbricus,terrestris,00000_Animalia_Annelida_Clitellata_Haplotaxida...
1,Sabella spallanzanii,Mediterranean Fanworm,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Sabellidae,Sabella,spallanzanii,00001_Animalia_Annelida_Polychaeta_Sabellida_S...
2,Serpula columbiana,Serpula columbiana,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Serpulidae,Serpula,columbiana,00002_Animalia_Annelida_Polychaeta_Sabellida_S...
3,Spirobranchus cariniferus,Blue Tube Worm,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Serpulidae,Spirobranchus,cariniferus,00003_Animalia_Annelida_Polychaeta_Sabellida_S...
4,Eratigena duellica,Giant House Spider,Arachnids,Animalia,Arthropoda,Arachnida,Araneae,Agelenidae,Eratigena,duellica,00004_Animalia_Arthropoda_Arachnida_Araneae_Ag...
...,...,...,...,...,...,...,...,...,...,...,...
9995,Psilotum nudum,skeleton fork fern,Plants,Plantae,Tracheophyta,Polypodiopsida,Psilotales,Psilotaceae,Psilotum,nudum,09995_Plantae_Tracheophyta_Polypodiopsida_Psil...
9996,Tmesipteris elongata,Tmesipteris elongata,Plants,Plantae,Tracheophyta,Polypodiopsida,Psilotales,Psilotaceae,Tmesipteris,elongata,09996_Plantae_Tracheophyta_Polypodiopsida_Psil...
9997,Azolla filiculoides,water fern,Plants,Plantae,Tracheophyta,Polypodiopsida,Salviniales,Salviniaceae,Azolla,filiculoides,09997_Plantae_Tracheophyta_Polypodiopsida_Salv...
9998,Salvinia minima,water spangles,Plants,Plantae,Tracheophyta,Polypodiopsida,Salviniales,Salviniaceae,Salvinia,minima,09998_Plantae_Tracheophyta_Polypodiopsida_Salv...


In [152]:
# Join the 2021 tables (no separate location file is passed) and tag each
# split, then stack them into one frame with a fresh positional index.
train2 = join_dataframes(timages2021, tanno2021, tcat2021).assign(set='train')
val2 = join_dataframes(vimages2021, vanno2021, vcat2021).assign(set='validation')
df2 = pd.concat([train2, val2], ignore_index=True)

In [110]:
# Rename each 2021 training directory from '<number>_<taxonomy...>' to
# '<taxonomy...>' by moving its files into the un-prefixed directory.
tpath = data / '2021' / 'train'
# Snapshot the listing: new directories are created in tpath during the loop.
for p in list(tpath.iterdir()):
    parts = p.name.split('_', 1)
    # Only touch directories that still carry the numeric prefix, so running
    # this cell again does not strip a taxonomy token from renamed directories.
    if not p.is_dir() or len(parts) != 2 or not parts[0].isdigit():
        continue
    new_name = parts[1]
    new_dir = tpath / new_name
    # The target may already exist (e.g. after an interrupted earlier run).
    new_dir.mkdir(exist_ok=True)
    for f in p.iterdir():
        f.rename(new_dir / f.name)
    p.rmdir()

In [200]:
# Remove the leading '<number>_' from the directory names. Matching only a
# numeric prefix keeps this cell safe to re-run; the previous split/join
# removed one more token on every execution.
df2['image_dir_name'] = df2['image_dir_name'].str.replace(r'^\d+_', '', regex=True)
# errors='ignore' so a re-run after the column is already gone does not raise.
df2 = df2.drop(columns='category_id', errors='ignore')
df2['dataset'] = '2021'
# Keep only the bare file name; the directory is carried by image_dir_name.
df2['file_name'] = df2['file_name'].str.split('/').str[-1]

In [209]:
# Stack the 2018 and 2021 frames and derive the local and cloud image paths.
# NOTE: this cell rebinds `df`, so running it twice appends df2 a second time.
df = pd.concat([df, df2], ignore_index=True)
df['file_path'] = 'Mushroom-Classifier/data/train/' + df['image_dir_name'] + '/' + df['file_name']
# The bucket prefix must not contain a space, otherwise every object path is invalid.
df['gcs_path'] = 'gs://mush-img-repo/train/' + df['image_dir_name'] + '/' + df['file_name']
# One integer id per species. 2018 names are underscore-separated while 2021
# names use spaces, so group on a normalised key to give the same species the
# same id in both datasets.
df['class_id'] = df.groupby(df['name'].str.replace('_', ' ', regex=False)).ngroup()
df.to_csv(root / 'data' / 'train_val.csv', index=False)