In [7]:
import cloud.serialization.cloudpickle as pickle

DEBUG:Cloud:Log file (/home/amir/.picloud/cloud.log) opened


# Sort the taxonomy in a dictionary

In [33]:
# Build a nested dict  phylum -> group -> genus -> [species]  from
# data/taxonomy.csv and save it both as a pickle and as plain text.
#
# This depends on the order of the lines: the script relies on the
# unusual layout of taxonomy.csv, in which a value applies to all
# subsequent lines until it is replaced.
# An alternative script, aimed at a more sensible format, follows.

# Read inside a `with` block so the handle is closed right away.
with open('data/taxonomy.csv', 'r') as taxonomy_file:
    taxonomy_file_lines = taxonomy_file.readlines()

taxonomy = {}

current_phylum = None
current_group = None
current_genus = None

for line in taxonomy_file_lines:
    # Lines containing 'group' are skipped (presumably the header row --
    # TODO confirm against the actual file).
    if 'group' in line:
        continue

    # Strip only the line terminator ('\r' as well, in case the file has
    # Windows line endings) so that empty trailing fields survive.
    part = line.rstrip('\r\n').split('\t')
    # Pad to four fields so blank or short lines do not raise IndexError.
    part += [''] * (4 - len(part))

    # first value is phylum.
    # It applies to all subsequent lines
    # until there is a new line with a first value
    if part[0]:
        current_phylum = part[0]
        taxonomy[current_phylum] = {}

    # second value is 'group' (order, super family or family)
    # it applies to all subsequent lines
    # until there is a new line with second value
    elif part[1]:
        current_group = part[1]
        taxonomy[current_phylum][current_group] = {}

    # third value is genus if there is no fourth value
    # it applies to all subsequent lines
    # until there is a new line with third value and no fourth value
    elif part[2] and not part[3]:
        current_genus = part[2]
        # A group left over from a previous phylum is not a key of the
        # current phylum: file the genus under 'unspecified' instead.
        if current_group not in taxonomy[current_phylum]:
            current_group = 'unspecified'
            if current_group not in taxonomy[current_phylum]:
                taxonomy[current_phylum][current_group] = {}
        taxonomy[current_phylum][current_group][current_genus] = []

    # If there are both third and fourth values, it is a species
    # it belongs to the latest identified genus
    elif part[2] and part[3]:
        sp = "%s %s" % (part[2], part[3])
        taxonomy[current_phylum][current_group][current_genus].append(sp)

# pickle file
pickle_file_name = 'data/taxonomy.pkl'
with open(pickle_file_name, 'wb') as output:
    pickle.dump(taxonomy, output)

# text file
with open('data/taxonomy_dict.txt', 'wt') as tax:
    tax.write(str(taxonomy))

## Alternative - Sort the taxonomy in a dictionary
I did not use this script; it is aimed at a more sensible input format than the one I had above. It expects one line per species, containing four tab-separated levels of taxonomy, with a blank field when a level is skipped. Example:

<pre>
Amphipoda<font color=red>\tCorophiidae\t</font>Corophium\tCorophium curvispinum\n
Amphipoda<font color=red>\tCorophiidae\t</font>Corophium\tCorophium insidiosum\n
Crangonyctidae<font color=red>\t\t</font>Crangonyx\tCrangonyx pseudogracilis\n
Crangonyctidae<font color=red>\t\t</font>Crangonyx\tCrangonyx subterraneus\n
</pre>

In [None]:
# This is just an example; I never ran it, so it may need debugging
# or may need to be customized.
#
# Expects one species per line with four tab-separated fields:
# high-level group, low-level group (may be blank), genus, species.
# Builds  taxonomy[hi_group][lo_group][genus] -> [species, ...].

with open('some_file.tsv', 'r') as taxonomy_file:
    taxonomy_file_lines = taxonomy_file.readlines()

taxonomy = {}

for line in taxonomy_file_lines:

    # Strip only the line terminator: a bare rstrip() would also remove
    # trailing tabs and thereby drop empty trailing fields.
    parts = line.rstrip('\r\n').split('\t')
    # Pad so that short lines do not raise IndexError.
    parts += [''] * (4 - len(parts))
    hi_group, lo_group, genus, species = parts[:4]

    # Skip blank lines and lines without a high-level group.
    if not hi_group:
        continue

    # Genera with no low-level group are filed under 'unspecified', as in
    # the script for taxonomy.csv. Every taxonomy[hi_group][...] value is
    # then a dict of genus -> species list, never a bare list, so the
    # dict can be walked uniformly (phylum, group, genus, species).
    group_key = lo_group or 'unspecified'

    species_list = (taxonomy
                    .setdefault(hi_group, {})
                    .setdefault(group_key, {})
                    .setdefault(genus, []))
    if species:
        species_list.append(species)

In [34]:
taxonomy

{'Acanthocephala': {},
 'Amphipoda': {'Corophiidae': {'Corophium': ['Corophium curvispinum',
    'Corophium insidiosum',
    'Corophium lacustre',
    'Corophium multisetosum',
    'Corophium volutator']},
  'Crangonyctidae': {'Crangonyx': ['Crangonyx pseudogracilis',
    'Crangonyx subterraneus']},
  'Gammaridae': {'Dikerogammarus': ['Dikerogammarus villosus'],
   'Echinogammarus': ['Echinogammarus berilloni'],
   'Gammarus': ['Gammarus chevreuxi',
    'Gammarus duebeni',
    'Gammarus lacustris',
    'Gammarus locusta',
    'Gammarus oceanicus',
    'Gammarus pulex',
    'Gammarus salinus',
    'Gammarus tigrinus',
    'Gammarus zaddachi']},
  'Gammaridae (including Crangonyctidae and Niphargidae)': {},
  'Gammaroidea': {},
  'Niphargidae': {'Microniphargus': ['Microniphargus leruthi'],
   'Niphargus': ['Niphargus aquilex',
    'Niphargus fontanus',
    'Niphargus kochianus',
    'Niphargus wexfordensis',
    'Niphargus glenniei']},
  'Talitridae': {'Orchestia': ['Orchestia cavimana'

In [35]:
# Reversing the dict is more useful,
# i.e. look up the taxonomy of a species:
# reverse_taxonomy[species] -> [phylum, group, genus]
# The result is saved both as a pickle and as plain text.

reverse_taxonomy = {}

for phylum, groups in taxonomy.items():
    for group, genera in groups.items():
        for genus, species_names in genera.items():
            for species in species_names:
                # A species name listed more than once keeps its last entry.
                reverse_taxonomy[species] = [phylum, group, genus]

# `with` closes the files even if dumping or writing raises.
pickle_file_name = 'data/reverse_taxonomy.pkl'
with open(pickle_file_name, 'wb') as output:
    pickle.dump(reverse_taxonomy, output)

with open('data/reverse_taxonomy_dict.txt', 'wt') as tax:
    tax.write(str(reverse_taxonomy))

In [36]:
reverse_taxonomy

{'Ablabesmyia longistyla': ['Diptera', 'Tanypodinae', 'Ablabesmyia'],
 'Ablabesmyia monilis': ['Diptera', 'Tanypodinae', 'Ablabesmyia'],
 'Ablabesmyia phatta': ['Diptera', 'Tanypodinae', 'Ablabesmyia'],
 'Acamptocladius reissi': ['Diptera', 'Orthocladiinae', 'Acamptocladius'],
 'Acamptocladius submontanus': ['Diptera', 'Orthocladiinae', 'Acamptocladius'],
 'Acanthocnema (Acanthocnema) nigrimana': ['Diptera',
  'Scathophagidae',
  'Acanthocnema (Acanthocnema)'],
 'Acanthocnema (Clinoceroides) glaucescens': ['Diptera',
  'Scathophagidae',
  'Acanthocnema (Clinoceroides)'],
 'Acanthocyclops robustus': ['Copepoda', 'Cyclopidae', 'Acanthocyclops'],
 'Acanthocyclops sensitivus': ['Copepoda', 'Cyclopidae', 'Acanthocyclops'],
 'Acanthocyclops venustus': ['Copepoda', 'Cyclopidae', 'Acanthocyclops'],
 'Acanthocyclops vernalis': ['Copepoda', 'Cyclopidae', 'Acanthocyclops'],
 'Acantholeberis  curvirostris': ['Cladocera',
  'Acantholeberidae',
  'Acantholeberis '],
 'Acartia bifilosa': ['Copepoda',

In [37]:
# test:
# the genus in the key (from the binomial name)
# should match the genus from the value (third item in the list of phylum, group, genus).
# Only the first whitespace-separated word of each is compared.

for species, lineage in reverse_taxonomy.items():
    if species.split()[0] != lineage[2].split()[0]:
        # Formatted explicitly so the output is identical under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (species, lineage))

# Nothing printed, success.