# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Sad-old-things" data-toc-modified-id="Sad-old-things-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Sad old things</a></div>

In [1]:
import os
import pandas as pd
import re

In [2]:
# Machine-specific root directory of the image data. Other cells build full
# paths by plain string concatenation (`LOCAL_PATH + <sub path>`), so the
# trailing slash is required.
LOCAL_PATH = '/Users/jacob/Downloads/hackathon/'

In [3]:
def list_file_paths(path, extension='.tif'):
    """List the paths, relative to `path`, of every file under `path` ending with `extension`.

    The directory tree rooted at `path` is walked recursively.

    path: Root directory to search. A trailing path separator is optional.
    extension: Suffix a file name must end with to be included.

    Returns a list of paths relative to `path`; files located directly in
    `path` are returned as bare file names.
    """
    file_paths = []

    for dir_path, _, file_names in os.walk(path):
        # relpath only removes the leading root. A plain str.replace would
        # strip every occurrence of `path` inside dir_path and would leave a
        # leading separator (turning the result into an absolute-looking path)
        # whenever `path` is given without a trailing slash.
        relative_path = os.path.relpath(dir_path, path)
        if relative_path == os.curdir:
            # Files in the root itself get no directory prefix.
            relative_path = ''

        file_paths.extend(
            os.path.join(relative_path, file_name)
            for file_name in file_names
            if file_name.endswith(extension)
        )

    return file_paths

In [4]:
# Write the distinct parent folders of every image found under the two dataset
# roots to `folder_names.csv`. The folder is everything before the last '/'
# of the relative path (an empty string when the path contains no '/').
pd.DataFrame(
    [image_path.rpartition('/')[0]
     for dataset_root in ('Streamline process/measured/', 'Search engine/test/')
     for image_path in list_file_paths(LOCAL_PATH + dataset_root)]
).drop_duplicates().to_csv('folder_names.csv', index=False)

In [5]:
# Per-folder metadata table. It must contain a `folder` column, which
# `merge_metadata` uses as the join key.
folder_metadata = pd.read_csv('data/folder_taxa_specimen_tissue.csv')

In [6]:
def merge_metadata(path):
    """Attach the per-folder metadata to every image file found under `path`.

    path: Directory, relative to `LOCAL_PATH`, to search for image files. It is
        prepended to each returned file path by string concatenation, so it
        should end with a '/'.

    Returns a DataFrame with a `path` column (`path` + file path relative to
    it) plus the columns of `folder_metadata` other than `folder`. The merge
    is an inner join on the file's parent folder, so files whose folder is
    absent from `folder_metadata` are dropped.
    """
    d = pd.DataFrame(list_file_paths(LOCAL_PATH + path), columns=['path'])

    # The join key is the file's parent folder: everything before the last '/'.
    d['folder'] = d['path'].apply(lambda x: '/'.join(x.split('/')[:-1]))
    d['path'] = path + d['path']
    d = d.merge(folder_metadata, on='folder')

    # `columns=` replaces the positional `axis` argument (`drop('folder', 1)`),
    # which pandas deprecated in 1.3 and removed in 2.0.
    return d.drop(columns='folder')

In [7]:
# Build the image database: merge folder metadata onto the files of each
# dataset root, stack the results, and write them to `data/image_database.csv`.
pd.concat(
    [merge_metadata(dataset_root)
     for dataset_root in ('Search engine/test/', 'Streamline process/measured/')]
).to_csv('data/image_database.csv', index=False)

# Sad old things

In [2]:
def remove_specimen_ids(row):
    """
    The taxa and tissue columns both may have extra copies of the specimen id. We remove these.

    row: Record (e.g. a DataFrame row or a dict) with the keys `taxa`, `tissue`,
        `specimen_id_taxa` and `specimen_id_tissue`. It is modified in place.

    Returns the same `row`, with every occurrence of `specimen_id_taxa` removed
    from `taxa` and every occurrence of `specimen_id_tissue` removed from
    `tissue`. A field is left untouched when it or its specimen id is null.
    """
    # Guard both the value and the id: a null value has no `.replace`.
    if pd.notnull(row['taxa']) and pd.notnull(row['specimen_id_taxa']):
        row['taxa'] = row['taxa'].replace(row['specimen_id_taxa'], '')

    if pd.notnull(row['tissue']) and pd.notnull(row['specimen_id_tissue']):
        row['tissue'] = row['tissue'].replace(row['specimen_id_tissue'], '')

    return row

In [3]:
def find_consensus_specimen_id(row):
    """
    Combine the specimen ids extracted from the taxa and tissue parts of a path into one id.

    row: Record (e.g. a DataFrame row or a dict) with the keys
        `specimen_id_taxa` and `specimen_id_tissue`. An id counts as missing
        when it is null or an empty string.

    Returns '' when both ids are missing, the only available id when one is
    missing, the shared id when both agree, and `<tissue id>-<taxa id>` when
    they disagree (the conflicting row is printed for inspection).
    """
    taxa_id = row['specimen_id_taxa']
    tissue_id = row['specimen_id_tissue']

    # An id is missing when it is null/NaN or empty. (Using `bool(value)` as
    # the "is null" flag is inverted and makes every branch return ''.)
    taxa_isnull = bool(pd.isnull(taxa_id)) or not taxa_id
    tissue_isnull = bool(pd.isnull(tissue_id)) or not tissue_id

    if tissue_isnull and taxa_isnull:
        return ''
    if tissue_isnull:
        return taxa_id
    if taxa_isnull:
        return tissue_id

    if taxa_id == tissue_id:
        return tissue_id

    # The two ids disagree: surface the row and keep both ids.
    print(row)
    return tissue_id + '-' + taxa_id

In [53]:
def extract_image_metadata(local_path, intermediate_path):
    """
    Build a table of image files with metadata parsed from each file's relative path.

    local_path: Path unique to your machine.
    intermediate_path: Path to concatenate to local path to get to the files labeled with the
        taxa (genus/species) and (sometimes) specimen id

    Returns a DataFrame with one row per file found under
    `local_path + intermediate_path` and the columns `path` (relative to that
    directory), `specimen_id_taxa`, `specimen_id_tissue`, `taxa` and `tissue`.
    A column is null for a row whose path does not match its pattern.
    """

    file_paths = list_file_paths(local_path + intermediate_path)
    d = pd.DataFrame(file_paths, columns=["path"])

    # Raw strings keep `\d` a regex escape instead of an invalid string escape.
    # First run of capitals, an optional space and digits that ends right before a '/'.
    d['specimen_id_taxa'] = d.path.str.extract(r'([A-Z]+ ?\d+)/', expand=False)
    # Capitals (plus optional space and digits) closing a path segment, after an
    # optional leading capital and any lowercase letters/spaces.
    d['specimen_id_tissue'] = d.path.str.extract(r'/[A-Z]?[a-z ]*([A-Z]+ ?\d*)/', expand=False)
    # Everything before the first '/', minus one trailing '_', '!' or '@'.
    d['taxa'] = d.path.str.extract(r'(.*?)[_!@]?/', expand=False)
    # Everything between the first and the last '/'.
    d['tissue'] = d.path.str.extract(r'/(.*)/', expand=False)

    # after regexing on the more standard paths, prepend the intermediate path
    # NOTE: this step is currently disabled, so `path` stays relative.
#     d.path = intermediate_path + d.path

    return d

In [50]:
def main(path):
    """
    Extract and clean the image metadata for the files under `LOCAL_PATH + path`.

    path: Directory, relative to `LOCAL_PATH`, passed on to `extract_image_metadata`.

    Returns the extracted DataFrame with the specimen ids removed from `taxa`
    and `tissue`, both specimen id columns stripped of spaces (nulls become
    ''), a `specimen_id` column computed by `find_consensus_specimen_id`,
    `taxa` lowercased with underscores removed, and `tissue` lowercased and
    stripped of surrounding whitespace.
    """
    frame = extract_image_metadata(LOCAL_PATH, path).apply(remove_specimen_ids, axis=1)

    # The raw ids were needed verbatim to clean taxa/tissue above; only now
    # normalize them (null -> '', spaces removed).
    for id_column in ('specimen_id_taxa', 'specimen_id_tissue'):
        frame[id_column] = frame[id_column].fillna('').str.replace(' ', '')

    frame['specimen_id'] = frame.apply(find_consensus_specimen_id, axis=1)

    frame['taxa'] = frame['taxa'].str.lower().str.replace('_', '')
    frame['tissue'] = frame['tissue'].str.lower().str.strip()
    return frame