In [61]:
import os
from s3fs.core import S3FileSystem
from xml.etree import ElementTree as ET
import pandas as pd


os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

fs = S3FileSystem(anon=False)
bucket = 'treetracker-training-images'
datadir = os.path.join(bucket, "PlantCLEF_2016")


In [62]:

def parse_xml_s3path(s3path):
    '''
    Parse and return a PlantCLEF XML file given a valid s3path
    '''
    if os.path.splitext(s3path)[1] == "xml": 
        with fs.open(s3path) as file:
            tree = ET.parse(file)
            species = tree.find("Species").text
            family = tree.find("Family").text
            genus = tree.find("Genus").text
            class_id = tree.find("ClassId").text
            obs_id = tree.find("ObservationId").text
            date = tree.find("Date").text
            role = tree.find("LearnTag").text
            latitude = tree.find("Latitude").text
            longitude = tree.find("Longitude").text
            return (role, obs_id, class_id, date, family, genus, species, latitude, longitude)
    else:
        pass

def generate_plantclef_csv():
    linkings = {}
    for f, e, d in fs.walk(datadir):
        for path in d:
            if os.path.splitext(path)[1] == ".jpg": # Image file
                base = os.path.splitext(path)[0]
                print (base , " being processed")
                fullpath = os.path.join(f, base)
                xmlpath = fullpath + ".xml"
                if not fs.exists(xmlpath):
                    print ("File ", xmlpath , " does not exist!")
                linkings [base] = parse_xml_s3path(xmlpath)

    linkings = pd.DataFrame(linkings, columns=["role", "obs_id", "class_id", "date", "family", "genus", "species", "latitude", "longitude"])
    linkings.to_csv("s3://" + datadir + "/plantclef.csv")
                
                

            
            
    
    

In [None]:
generate_plantclef_csv()

1  being processed
10  being processed
1000  being processed
10000  being processed
100001  being processed
100003  being processed
100004  being processed
100005  being processed
100007  being processed
100009  being processed
10001  being processed
100011  being processed
100012  being processed
100014  being processed
100016  being processed
100017  being processed
100018  being processed
100019  being processed
10002  being processed
100022  being processed
100023  being processed
100025  being processed
100026  being processed
100028  being processed
10003  being processed
100030  being processed
100031  being processed
100032  being processed
100033  being processed
100034  being processed
100037  being processed
100038  being processed
100039  being processed
10004  being processed
100040  being processed
100041  being processed
100042  being processed
100043  being processed
100044  being processed
100046  being processed
100047  being processed
100048  being processed
100049  

In [60]:
df = pd.read_csv("s3://" + datadir + "/plantclef.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,role,obs_id,class_id,date,family,genus,species,latitude,longitude
