In [286]:
import os
import pandas as pd
from pathlib import Path
import re
from shutil import copyfile
import xml.etree.ElementTree as ET

In [115]:
pd.options.display.max_colwidth =100

In [7]:
def list_file_paths(path, extension='.tif'):
    """List the file paths for every file ending with `extension` in directory `path`"""
    file_paths = []
    
    for (dir_path, _, file_names) in os.walk(path):
        
        file_path = [os.path.join(dir_path, file_name) for file_name in file_names
                     if file_name.endswith(extension)]

        file_paths.extend(file_path)

    return file_paths

In [3]:
LOCAL_PATH = '/Users/jacob/Downloads/hackathon/'

In [5]:
canonical_folder_paths = ['Streamline process/measured/', 'Search engine/test/']

# Add taxa, specimen, tissue 

Create a list of folder names,which contain the taxa, specimen and tissue identifiers.

In [202]:
def extract_folders():
    folders = []

    for cfp in canonical_folder_paths:
        for f in list_file_paths(LOCAL_PATH + cfp):
            folders.append(str(Path(f.replace(LOCAL_PATH + cfp, '')).parent))

    pd.DataFrame(folders, columns=['folder']).drop_duplicates().to_csv('data/folders.csv', index=False)
    
extract_folders()

After manually extracting the labels, reimport those labels.

In [29]:
folder_metadata = pd.read_csv('data/folder_taxa_specimen_tissue.csv')

In [83]:
def merge_metadata(path):
    d = pd.DataFrame(list_file_paths(LOCAL_PATH + path), columns=['path'])
    d['path'] = d.path.str.replace(LOCAL_PATH, '')
    d['folder'] = d.path.str.replace(path, '').apply(Path).apply(lambda x: x.parent).astype(str)

    d = d.merge(folder_metadata, on='folder')
    d.drop('folder', 1, inplace=True)
    return d

In [88]:
pd.concat(list(map(merge_metadata, canonical_folder_paths))).to_csv('image_database.csv', index=False)

# Add has_anx, has_cal_xml, has_eax, has_lmd, has_sbx

In [175]:
d = pd.read_csv('image_database.csv')

In [181]:
def generate_metadata_path(row, extension):
    # Some images have a number of the form " (1)". The metadata files are found by nontrivially 
    # appending extensions in this case.
    number = re.search('( \(\d+\)).tif', row['path'])
    if number:
        number = number.group(1)
    else:
        number = ''
    
    # Add the number in the appropriate location in the extension
    extension = extension.format(number)
    
    metadata_path = row['path'].replace(number + '.tif', '.tif') + extension
    
    if metadata_path in all_file_paths:
        return metadata_path
    else:
        return ''

In [177]:
def extract_all_file_paths():
    all_file_paths = []
    for cfp in canonical_folder_paths:

        all_file_paths.extend([path.replace(LOCAL_PATH, '') for \
                               path in list_file_paths(LOCAL_PATH + cfp, extension='')])

    return set(all_file_paths)

all_file_paths = extract_all_file_paths()

In [183]:
metadata_path_cols = []

for ext in ['{}.anx', '.cal{}.xml', '{}.eax', '{}.lmd', '{}.sbx']:    
    label = 'path' + ext.format('').replace('.', '_')
    metadata_path_cols.append(label)
    d[label] = d.apply(generate_metadata_path, extension=ext, axis=1)

In [199]:
d[d[metadata_path_cols].eq('').any(axis=1)]

Unnamed: 0,path,taxa,specimen,tissue,path_anx,path_cal_xml,path_eax,path_lmd,path_sbx
145,Streamline process/measured/AnthopleuraPAN49_/filament/image0014.tif,Anthopleura,PAN49,filament,Streamline process/measured/AnthopleuraPAN49_/filament/image0014.tif.anx,Streamline process/measured/AnthopleuraPAN49_/filament/image0014.tif.cal.xml,Streamline process/measured/AnthopleuraPAN49_/filament/image0014.tif.eax,,Streamline process/measured/AnthopleuraPAN49_/filament/image0014.tif.sbx
244,Streamline process/measured/AnthopleuraPAN59_/filament/image0009 (2).tif,Anthopleura,PAN59,filament,Streamline process/measured/AnthopleuraPAN59_/filament/image0009.tif (2).anx,Streamline process/measured/AnthopleuraPAN59_/filament/image0009.tif.cal (2).xml,Streamline process/measured/AnthopleuraPAN59_/filament/image0009.tif (2).eax,Streamline process/measured/AnthopleuraPAN59_/filament/image0009.tif (2).lmd,
436,Streamline process/measured/AnthopleuraPAN61_/filament/image0004 (3).tif,Anthopleura,PAN61,filament,Streamline process/measured/AnthopleuraPAN61_/filament/image0004.tif (3).anx,Streamline process/measured/AnthopleuraPAN61_/filament/image0004.tif.cal (3).xml,Streamline process/measured/AnthopleuraPAN61_/filament/image0004.tif (3).eax,Streamline process/measured/AnthopleuraPAN61_/filament/image0004.tif (3).lmd,
481,Streamline process/measured/cavusactis/filament/image0020.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0020.tif.anx,Streamline process/measured/cavusactis/filament/image0020.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0020.tif.eax,,Streamline process/measured/cavusactis/filament/image0020.tif.sbx
489,Streamline process/measured/cavusactis/filament/image0027.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0027.tif.anx,Streamline process/measured/cavusactis/filament/image0027.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0027.tif.eax,,Streamline process/measured/cavusactis/filament/image0027.tif.sbx
491,Streamline process/measured/cavusactis/filament/image0028 (3).tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0028.tif (3).anx,Streamline process/measured/cavusactis/filament/image0028.tif.cal (3).xml,Streamline process/measured/cavusactis/filament/image0028.tif (3).eax,,Streamline process/measured/cavusactis/filament/image0028.tif (3).sbx
494,Streamline process/measured/cavusactis/filament/image0031.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0031.tif.anx,Streamline process/measured/cavusactis/filament/image0031.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0031.tif.eax,,Streamline process/measured/cavusactis/filament/image0031.tif.sbx
497,Streamline process/measured/cavusactis/filament/image0038.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0038.tif.anx,Streamline process/measured/cavusactis/filament/image0038.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0038.tif.eax,,Streamline process/measured/cavusactis/filament/image0038.tif.sbx
509,Streamline process/measured/cavusactis/filament/image0062.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0062.tif.anx,Streamline process/measured/cavusactis/filament/image0062.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0062.tif.eax,,Streamline process/measured/cavusactis/filament/image0062.tif.sbx
512,Streamline process/measured/cavusactis/filament/image0071.tif,Cavusactis,,filament,Streamline process/measured/cavusactis/filament/image0071.tif.anx,Streamline process/measured/cavusactis/filament/image0071.tif.cal.xml,Streamline process/measured/cavusactis/filament/image0071.tif.eax,,Streamline process/measured/cavusactis/filament/image0071.tif.sbx


In [200]:
d.to_csv('image_database.csv', index=False)

# Add canonical name

In [267]:
d = pd.read_csv('image_database.csv')

In [268]:
d = d.merge(d.groupby(['taxa', 'specimen', 'tissue']).cumcount().to_frame(name='image_number'),
            left_index=True, right_index=True)

In [273]:
d['canonical_name'] = d.taxa.fillna('').str.replace(' ', '_') + '_' + \
                      d.specimen.fillna('').str.replace(' ', '_') + '_' + \
                      d.tissue.fillna('').str.replace(' ', '_') + '_' + \
                      d.image_number.astype(str)
d = d.drop('image_number', axis=1)

In [275]:
d.to_csv('image_database.csv', index=False)

In [258]:
def copy_and_rename_file(row):
    source = LOCAL_PATH + row['path']
    dest_folder = LOCAL_PATH + 'clean_images/' + row['taxa'].replace(' ', '_') 
    
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    
    dest_file = '/' + row['canonical_name'] + '.tif'

    os.symlink(source, dest_folder + dest_file)

In [259]:
d.apply(copy_and_rename_file, axis=1)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
3689    None
3690    None
3691    None
3692    None
3693    None
3694    None
3695    None
3696    None
3697    None
3698    None
3699    None
3700    None
3701    None
3702    None
3703    None
3704    None
3705    None
3706    None
3707    None
3708    None
3709    None
3710    None
3711    None
3712    None
3713    None
3714    None
3715    None
3716    None
3717    None
3718    None
Length: 3719, dtype: object

# Extract length and width 

Divide the raw numbers by 10,000 to get to microns

In [276]:
d = pd.read_csv('image_database.csv')

In [316]:
def getCoordsFromPoint(el):
        x = None
        y = None
        for e in el:
            if e.tag == "PointIndex" and e.text == '2':
                return None
            if e.tag == "PositionX":
                x = e.text
            if e.tag == "PositionY":
                y = e.text

        return [x,y]

def getPointsFromLMD(filename):
    tree = ET.parse(filename)
    points = []
    for c in tree.getroot():
        for c2 in c:
            if c2.tag == "LMMeasurementItemPoints":
                coords = getCoordsFromPoint(c2)
                if coords:
                    points.append([float(i) for i in coords])
                    
    return np.array(points)

In [315]:
import numpy as np

In [308]:
from scipy.spatial.distance import euclidean

In [338]:
def extract_length_width(row):
    to_micrometers = 10000
    
    if pd.isnull(row['path_lmd']):
        return (float('nan'), float('nan'))
    
    points = getPointsFromLMD(LOCAL_PATH + row['path_lmd']) / to_micrometers
    
    if len(points) != 4:
        return (float('nan'), float('nan'))
    
    distances = [euclidean(points[0], points[1]), euclidean(points[2], points[3])]
    
    width = min(distances)
    length = max(distances)
    return pd.Series([length, width])

In [340]:
d = d.merge(d.apply(extract_length_width, axis=1), left_index=True, right_index=True)

In [345]:
d.columns = d.rename({0: 'length', 1: 'width'}, axis=1)

TypeError: rename() got an unexpected keyword argument "axis"

In [289]:
for row in d.iterrows():
    print()
    break

(0, path                      Streamline process/measured/Anthopleura_MEX13@/image0010.tif
taxa                                                                       Anthopleura
specimen                                                                         MEX13
tissue                                                                             NaN
path_anx              Streamline process/measured/Anthopleura_MEX13@/image0010.tif.anx
path_cal_xml      Streamline process/measured/Anthopleura_MEX13@/image0010.tif.cal.xml
path_eax              Streamline process/measured/Anthopleura_MEX13@/image0010.tif.eax
path_lmd              Streamline process/measured/Anthopleura_MEX13@/image0010.tif.lmd
path_sbx              Streamline process/measured/Anthopleura_MEX13@/image0010.tif.sbx
canonical_name                                                    Anthopleura_MEX13__0
Name: 0, dtype: object)
[[51449.2305769387, 78666.6252684309], [-113768.181901897, 156927.499809978], [-16666.7462926151, 1395