In [None]:
import os
from s3fs.core import S3FileSystem
from xml.etree import ElementTree as ET
import pandas as pd


# AWS_CONFIG_FILE must be set before the filesystem object is created so the
# credentials/config lookup sees it. The relative path is resolved against the
# notebook's current working directory.
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

# Authenticated (non-anonymous) S3 handle shared by every cell below.
fs = S3FileSystem(anon=False)
bucket = 'treetracker-training-images'
# S3 keys always use "/" as the separator; os.path.join would insert "\" on
# Windows and produce a key prefix that does not exist.
datadir = bucket + "/PlantCLEF_2016"


In [None]:

def parse_xml_s3path(s3path, filesystem=None):
    '''
    Parse a PlantCLEF XML metadata file and return its fields as a tuple.

    Parameters
    ----------
    s3path : str
        Path of the file to read. Only paths whose extension is exactly
        ".xml" are parsed.
    filesystem : object, optional
        Any object exposing ``open(path)`` that returns a readable file
        object usable as a context manager. Defaults to the notebook-level
        ``fs`` filesystem.

    Returns
    -------
    tuple or None
        ``(role, obs_id, class_id, date, family, genus, species, latitude,
        longitude)`` taken from the LearnTag, ObservationId, ClassId, Date,
        Family, Genus, Species, Latitude and Longitude elements. A field is
        ``None`` when its element is missing or has no text.
        ``None`` is returned when ``s3path`` does not end in ".xml".
    '''
    if os.path.splitext(s3path)[1] != ".xml":
        return None

    if filesystem is None:
        filesystem = fs

    with filesystem.open(s3path) as file:
        tree = ET.parse(file)

    def text_of(tag):
        # find() returns None for an absent element; guard so that one
        # incomplete record does not abort a whole metadata run.
        node = tree.find(tag)
        return None if node is None else node.text

    # Order matters: it is the column order of the generated CSV.
    tags = ("LearnTag", "ObservationId", "ClassId", "Date", "Family",
            "Genus", "Species", "Latitude", "Longitude")
    return tuple(text_of(tag) for tag in tags)

def _write_plantclef_batch(write_csv, records, columns, append):
    '''
    Write one batch of ``{image id: metadata tuple}`` rows to ``write_csv``.

    When ``append`` is false the file is (re)created with a header row;
    otherwise the rows are appended without a header.
    '''
    batch = pd.DataFrame.from_dict(records, orient="index", columns=columns)
    if append:
        batch.to_csv(write_csv, mode='a', header=False)
    else:
        batch.to_csv(write_csv, mode='w')


def generate_plantclef_csv(write_csv, batch_size=100):
    '''
    Collect PlantCLEF 2016 metadata and write it to a CSV file in batches.

    Walks ``datadir`` on the notebook-level ``fs`` filesystem, and for every
    ".jpg" image parses the ".xml" file that shares its base name. Rows are
    keyed by that base name and flushed to ``write_csv`` every ``batch_size``
    parsed images, plus once more at the end for the remainder.

    If ``write_csv`` already exists the run resumes: ids found in its first
    column are skipped and new rows are appended instead of overwriting.

    Parameters
    ----------
    write_csv : str
        Destination path of the CSV file.
    batch_size : int, optional
        Number of parsed images to accumulate before each write.

    Returns
    -------
    pandas.DataFrame
        The contents of ``write_csv`` read back with ``pd.read_csv``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    columns = ["role", "obs_id", "class_id", "date", "family", "genus",
               "species", "latitude", "longitude"]

    csv_exists = fs.exists(write_csv)
    if csv_exists:
        # The first CSV column holds the image ids. Read it as text so the ids
        # compare equal to the string base names taken from the file listing.
        already_done = set(pd.read_csv(write_csv, usecols=[0], dtype=str).iloc[:, 0])
    else:
        already_done = set()

    linkings_dict = {}
    count = 0
    for root, _dirs, files in fs.walk(datadir):
        for path in files:
            base, ext = os.path.splitext(path)
            if ext != ".jpg":  # only image files drive the lookup
                continue
            if base in already_done:  # processed by an earlier run
                continue

            # S3 keys use "/" regardless of the local OS.
            xmlpath = root.rstrip("/") + "/" + base + ".xml"
            if not fs.exists(xmlpath):
                print ("File ", xmlpath , " does not exist!")
                continue

            linkings_dict[base] = parse_xml_s3path(xmlpath)
            count += 1

            if len(linkings_dict) >= batch_size:
                _write_plantclef_batch(write_csv, linkings_dict, columns, append=csv_exists)
                csv_exists = True  # every later batch must append
                linkings_dict.clear()
                print ("%d done" % count)

    # Flush the final, partially filled batch.
    if linkings_dict:
        _write_plantclef_batch(write_csv, linkings_dict, columns, append=csv_exists)
        linkings_dict.clear()
        print ("%d done" % count)

    return pd.read_csv(write_csv)

            
            
    
    

In [None]:
# Build the metadata CSV inside the dataset folder; the returned frame is this cell's output.
generate_plantclef_csv(write_csv="s3://{}/plantclef.csv".format(datadir))

In [17]:
# Reload the generated CSV from S3 and preview its first five rows.
df = pd.read_csv("s3://{}/plantclef.csv".format(datadir))
df.head()

Unnamed: 0.1,Unnamed: 0,role,obs_id,class_id,date,family,genus,species,latitude,longitude
0,1,Train,21782,5810,2013-3-21,Amaryllidaceae,Narcissus,Narcissus dubius Gouan,42.98235,3.01486
1,10,Train,11786,6448,2014-8-22,Orchidaceae,Epipactis,Epipactis atrorubens (Hoffm.) Besser,45.34137,5.81565
2,1000,Train,40480,30040,2011-4-8,Fabaceae,Cercis,Cercis siliquastrum L.,,
3,10000,Train,14235,3529,2013-9-12,Lamiaceae,Clinopodium,Clinopodium nepeta (L.) Kuntze,,
4,100001,Train,33815,4736,2013-6-5,Rosaceae,Cydonia,Cydonia oblonga Mill.,48.84059,2.36158
