https://www.biorxiv.org/content/10.1101/744193v3.full.pdf

In [1]:
from itertools import chain
import os
import pandas as pd
import re

Open the list of all submitted files under `/uod/idr/filesets/idr0099-jain-beetlelightsheet/20201001-ftp`. Set `rootdir` to the relative path to `/uod/idr/filesets/idr0099-jain-beetlelightsheet` so that the symlinks can be opened even when the `/uod/idr` symlink is not present, e.g. on an HPC cluster.

In [2]:
# The pattern files are expected to live at
# /uod/idr/filesets/idr0099-jain-beetlelightsheet/DATE-patterns/dataset/*.pattern
# so a relative root of '../../' is joined in front of each listed path when
# the symlink sources are built further down.
rootdir = '../../'

# One submitted path per line.
with open('idr0099-jain-beetlelightsheet-20201001-ftp.filelist.txt') as listing:
    listing_text = listing.read()
filelist = listing_text.splitlines()

Get the list of unique parent directories

In [3]:
# Distinct parent directories of the listed files, in sorted order.
parent_dirs = {os.path.dirname(path) for path in filelist}
figuredirs = sorted(parent_dirs)
figuredirs

['20201001-ftp/Akanksha_Jain_16-4-15_LifeAct-eGFP',
 '20201001-ftp/Akanksha_Jain_22-06-16_Tc-Squash-eGFP',
 '20201001-ftp/Akanksha_Jain_4-3-15_nGFP',
 '20201001-ftp/Akanksha_Jain_8-6-19_ZenKD_GAP43-eYFP',
 '20201001-ftp/Akanksha_Jain_9-3-15_Histone-eGFP']

In [4]:
# For each directory in figuredirs (same order), collect the basenames of the
# files located directly in it. The parent directory is compared exactly
# rather than with a string-prefix test, so a directory whose name is a prefix
# of another one (or that has subdirectories) only receives its own files.
filelists = [
    [os.path.basename(path) for path in filelist if os.path.dirname(path) == figdir]
    for figdir in figuredirs
]

Split each filename into tokens and check which components of the filename vary.

In [5]:
def tokenise(s):
    """Split a filename into tokens on underscores, commas and dots.

    Args:
        s: The string (filename) to split.

    Returns:
        List of substrings between the delimiters; consecutive delimiters
        produce empty strings, and an empty input yields [''].
    """
    # Raw string and an unescaped '.' inside the character class: avoids the
    # invalid '\.' escape sequence warning while matching the same characters.
    return re.split(r'[_,.]', s)

# For every directory, tokenise all of its filenames into a table (one row per
# file, one column per token position) and report the token positions that
# take more than one value, with up to five sample values.
# The loop variables are deliberately not called `filelist`, so the full file
# list loaded earlier is not overwritten.
for figdir, names in zip(figuredirs, filelists):
    print(f'{figdir} ({names[0]} ...)')
    tokens = pd.DataFrame([tokenise(name) for name in names])
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for position, column in tokens.items():
        distinct = column.unique()
        if len(distinct) > 1:
            print(f'  {position} {len(distinct)}/{len(column)}, {distinct[:5]}...')

20201001-ftp/Akanksha_Jain_16-4-15_LifeAct-eGFP (img_TL0.tif ...)
  1 798/798, ['TL0' 'TL125' 'TL1' 'TL10' 'TL127']...
20201001-ftp/Akanksha_Jain_22-06-16_Tc-Squash-eGFP (TP0_Ch0_Ill0_Ang1,2,3,4,5.tif ...)
  0 212/212, ['TP0' 'TP133' 'TP100' 'TP135' 'TP101']...
20201001-ftp/Akanksha_Jain_4-3-15_nGFP (TP0_Chgreen_Ill0_Ang0,1,2.tif ...)
  0 527/527, ['TP0' 'TP11' 'TP100' 'TP120' 'TP101']...
20201001-ftp/Akanksha_Jain_8-6-19_ZenKD_GAP43-eYFP (fused_tp_0_ch_0.tif ...)
  2 142/142, ['0' '100' '101' '124' '102']...
20201001-ftp/Akanksha_Jain_9-3-15_Histone-eGFP (TP0_Chgreen_Ill0_Ang0,1,2.tif ...)
  0 539/539, ['TP0' 'TP122' 'TP100' 'TP123' 'TP101']...


Based on this, create pattern files for varying `T`: `0..N-1` and symlink the original files into the directory used for each pattern file. Following https://docs.openmicroscopy.org/bio-formats/5.9.0/formats/pattern-file.html, the files in `8-6-19_ZenKD_GAP43-eYFP`, whose names contain `tp_`, will need to be renamed to remove the `_`:

In [6]:
# Matches the timepoint token of a filename: a two-letter marker (TL/TP in
# either case), an optional underscore, then the timepoint digits.
re_pat = re.compile(r'([Tt][LPp])_?(\d+)')
# Leading part of every entry in figuredirs; what follows it is used as the
# name of the local directory that receives the pattern file and symlinks.
dir_prefix = '20201001-ftp/Akanksha_Jain_'

for figdir, names in zip(figuredirs, filelists):
    first_name = names[0]
    # Fail loudly instead of silently slicing a wrong directory name.
    assert figdir.startswith(dir_prefix), f'unexpected directory {figdir}'
    dataset_dir = figdir[len(dir_prefix):]
    assert re_pat.search(first_name), f'no timepoint token in {first_name}'

    # Pattern line: the timepoint digits (and any underscore before them) are
    # replaced by the range <0-(N-1)>, lower-cased. This assumes timepoints
    # are numbered 0..N-1 without gaps; that is not checked here.
    pattern = re_pat.sub(f'\\1<0-{len(names) - 1}>', first_name).lower()
    # Pattern file name: the first filename with the timepoint digits dropped.
    pattern_file = re_pat.sub(r'\1', first_name) + '.pattern'
    print(first_name, pattern, pattern_file)

    # exist_ok so that re-running the cell does not raise FileExistsError.
    os.makedirs(dataset_dir, exist_ok=True)
    with open(os.path.join(dataset_dir, pattern_file), 'w') as pattern_fh:
        pattern_fh.write(pattern + '\n')

    for name in names:
        src = os.path.join(rootdir, figdir, name)
        # Link name: lower-cased, with the underscore between marker and
        # digits removed so it matches the pattern line written above.
        dst = os.path.join(dataset_dir, re_pat.sub(r'\1\2', name).lower())
        # Replace a link left over from a previous run; a regular file at dst
        # is left alone and os.symlink raises FileExistsError for it.
        if os.path.islink(dst):
            os.unlink(dst)
        os.symlink(src, dst)


img_TL0.tif img_tl<0-797>.tif img_TL.tif.pattern
TP0_Ch0_Ill0_Ang1,2,3,4,5.tif tp<0-211>_ch0_ill0_ang1,2,3,4,5.tif TP_Ch0_Ill0_Ang1,2,3,4,5.tif.pattern
TP0_Chgreen_Ill0_Ang0,1,2.tif tp<0-526>_chgreen_ill0_ang0,1,2.tif TP_Chgreen_Ill0_Ang0,1,2.tif.pattern
fused_tp_0_ch_0.tif fused_tp<0-141>_ch_0.tif fused_tp_ch_0.tif.pattern
TP0_Chgreen_Ill0_Ang0,1,2.tif tp<0-538>_chgreen_ill0_ang0,1,2.tif TP_Chgreen_Ill0_Ang0,1,2.tif.pattern


`22-06-16_Tc-Squash-eGFP/TP_Ch0_Ill0_Ang1,2,3,4,5.tif.pattern` needs a custom fix due to a missing file `Akanksha_Jain_22-06-16_Tc-Squash-eGFP/TP18_Ch0_Ill0_Ang1,2,3,4,5.tif`

In [7]:
# Rewrite the range placeholder in this one pattern file as an explicit
# comma-separated list of timepoints 0..212 that leaves out 18.
present_timepoints = [*range(18), *range(19, 213)]
timepoint_list = ','.join(map(str, present_timepoints))
with open('22-06-16_Tc-Squash-eGFP/TP_Ch0_Ill0_Ang1,2,3,4,5.tif.pattern', 'r+') as pattern_fh:
    current_text = pattern_fh.read()
    updated_text = current_text.replace('<0-211>', f'<{timepoint_list}>')
    # Overwrite from the start and cut off anything left beyond the new text.
    pattern_fh.seek(0)
    pattern_fh.write(updated_text)
    pattern_fh.truncate()