Datasets and images are named slightly differently from the originals.
This notebook checks that the imported datasets/images match those in the annotation CSVs, and updates the dataset/image names in the annotations.

In [1]:
import json
import numpy as np
import pandas as pd
import re

In [2]:
# Load the import file list. As used further down, `images` maps an
# experiment path ('idr0071-feldman-crisprko/experiment<X>') to a mapping of
# dataset name -> collection of image names.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open('imagelist.json', encoding='utf-8') as f:
    images = json.load(f)

In [3]:
def dataset_name(name):
    """Translate an annotation dataset name into the imported dataset name.

    The name is split at its last underscore into a prefix and a suffix
    (the suffix is presumably the well, e.g. ``A1`` -- confirm against the
    annotation CSVs). The result is ``'<prefix> <kind> <suffix> raw'`` where
    ``<kind>`` is ``phenotype`` for names starting with ``c0-DAPI`` and
    ``sbs`` for everything else.

    Raises ValueError (from tuple unpacking) if ``name`` has no underscore.
    """
    prefix, suffix = name.rsplit('_', 1)
    kind = 'phenotype' if name.startswith('c0-DAPI') else 'sbs'
    return f'{prefix} {kind} {suffix} raw'

def image_name(name):
    """Return ``name`` without a trailing ``.tif`` extension.

    Names that do not end in ``.tif`` are returned unchanged; only one
    trailing extension is removed.
    """
    suffix = '.tif'
    return name[:-len(suffix)] if name.endswith(suffix) else name

def processed_name(name):
    """Derive the processed-data file prefix from a raw image name.

    The name must start with ``10X_c<digits>-`` and later contain
    ``_<A|B|C><digits>_Tile-<digits>``; the ``<A|B|C><digits>_Tile-<digits>``
    part is kept and prefixed with ``10X_``.

    Raises AssertionError (carrying ``name``) when the pattern does not match.
    """
    found = re.match(r'10X_c\d+-.+_([ABC]\d+_Tile-\d+)', name)
    assert found, name
    well_and_tile = found.group(1)
    return '10X_' + well_and_tile

def processed_annotation_values(series):
    """Collect the annotation values shared by all rows of ``series``.

    ``series`` is indexed by column name (the callers in this notebook pass a
    DataFrame slice). For each of the organism / cell line columns the rows
    must agree on exactly one value, which is copied into the returned dict
    under the same column name.

    Raises AssertionError (carrying the distinct values) if a column holds
    zero or more than one distinct value.
    """
    def sole_value(column):
        distinct = series[column].unique()
        assert len(distinct) == 1, distinct
        return distinct[0]

    shared_columns = (
        'Characteristics [Organism]',
        'Characteristics [Cell Line]',
        'Comment [Cell Line]',
    )
    return {column: sole_value(column) for column in shared_columns}

# Original annotation table of every experiment, keyed by experiment letter.
# convert_dtypes() moves the columns to pandas' NA-aware dtypes.
experiment_anns = {
    exp: pd.read_csv(
        f'../experiment{exp}/idr0071-experiment{exp}-annotation.csv'
    ).convert_dtypes()
    for exp in 'ABCDEF'
}

# Compare the (renamed) datasets and images of each original annotation CSV
# with the import file list and print whatever appears on only one side.
for exp in 'ABCDEF':
    anns = experiment_anns[exp]
    annotated_datasets = anns['Dataset Name'].unique()

    # Datasets present in only one of: annotations (after renaming), import list.
    renamed_datasets = {dataset_name(n) for n in annotated_datasets}
    imported = images[f'idr0071-feldman-crisprko/experiment{exp}']
    dataset_diff = renamed_datasets.symmetric_difference(imported.keys())
    print(f'\nUnmatched datasets: {exp}')
    print('\t' + '\n\t'.join(sorted(dataset_diff)))

    for dname in annotated_datasets:
        dataset_rows = anns.loc[anns['Dataset Name'] == dname]
        images_in_ann = {image_name(n) for n in dataset_rows['Image Name']}
        images_in_dataset = imported[dataset_name(dname)]
        images_diff = images_in_ann.symmetric_difference(images_in_dataset)
        if not images_diff:
            continue
        print(f'\nUnmatched images: {dname}')
        print('\t' + '\n\t'.join(sorted(images_diff)))


Unmatched datasets: A
	aligned A1 process
	aligned A2 process
	aligned A3 process
	aligned B1 process
	aligned B2 process
	aligned B3 process
	cells A1 process
	cells A2 process
	cells A3 process
	cells B1 process
	cells B2 process
	cells B3 process
	log A1 process
	log A2 process
	log A3 process
	log B1 process
	log B2 process
	log B3 process
	nuclei A1 process
	nuclei A2 process
	nuclei A3 process
	nuclei B1 process
	nuclei B2 process
	nuclei B3 process
	phenotype_aligned A1 process
	phenotype_aligned A2 process
	phenotype_aligned A3 process
	phenotype_aligned B1 process
	phenotype_aligned B2 process
	phenotype_aligned B3 process

Unmatched datasets: B
	aligned A1 process
	aligned A2 process
	aligned A3 process
	aligned B1 process
	aligned B2 process
	aligned B3 process
	cells A1 process
	cells A2 process
	cells A3 process
	cells B1 process
	cells B2 process
	cells B3 process
	log A1 process
	log A2 process
	log A3 process
	log B1 process
	log B2 process
	log B3 process
	nuclei A1 p

We need to modify the annotations CSV:
- Update the `Image Name` to match the name set in the import filelist
- Fix the `Cycle Number` (always `0` in the CSV) by extracting it from the image name
- Change the `Processed Data File` to match the file prefix (`10X_<WELL>_Tile-<NUMBER>`) used to name the processed image files in the import filelist (`10X_<WELL>_Tile-<NUMBER>.<aligned|cells|log|nuclei>`). Experiments `A`, `B`, `C` and `D` also have `phenotype_aligned` files.
- Set the channel names for the processed images:
  - `aligned` and `log`: `DAPI:Cy3:A594:Cy5:Cy7`
  - `phenotype_aligned`: `DAPI:p65`
  - `cells`: `Cells`
  - `nuclei`: `Nuclei`

In [4]:
# Channel names to record for each type of processed image.
# 'aligned' and 'log' share the same five-channel list.
processed_channels = dict.fromkeys(('aligned', 'log'), 'DAPI:Cy3:A594:Cy5:Cy7')
processed_channels.update(
    phenotype_aligned='DAPI:p65',
    cells='Cells',
    nuclei='Nuclei',
)

def _cycle_number(name):
    """Return the cycle number encoded in an image name as an int.

    Image names are expected to start with ``10X_c<CYCLE>-``, the same prefix
    that ``processed_name`` requires. All digits of the cycle are read, so a
    cycle of 10 or more is not truncated to its first digit.

    Raises ValueError if ``name`` does not start with that prefix.
    """
    m = re.match(r'10X_c(\d+)-', name)
    if not m:
        raise ValueError(f'Cannot extract cycle number from image name: {name}')
    return int(m.group(1))


# Rewrite each experiment's annotation table so that it matches the import
# file list, add one row per processed image, and save it as
# idr0071-experiment<X>-annotation-updated.csv.
for exp in 'ABCDEF':
    print(f'Updating {exp}')
    anns = experiment_anns[exp].copy()
    anns['Dataset Name'] = anns['Dataset Name'].apply(dataset_name)
    anns['Image Name'] = anns['Image Name'].apply(image_name)
    # Ensure column remains nullable since we're appending empty values for the processed images
    anns['Cycle Number'] = anns['Image Name'].apply(_cycle_number).astype(pd.Int8Dtype())
    anns['Processed Data File'] = anns['Image Name'].apply(processed_name)

    # The processed image types and the expected raw channel lists depend only
    # on the experiment, so work them out once instead of once per tile.
    processed_types = ['aligned', 'cells', 'log', 'nuclei']
    expected_channels = {'DAPI:Cy3:A594:Cy5:Cy7'}
    if exp in 'ABCD':
        processed_types.append('phenotype_aligned')
        expected_channels.add('DAPI:p65')

    # Sanity check on the raw rows. The result of this comparison used to be
    # discarded; report a mismatch without aborting the update.
    raw_channels = set(anns['Channels'].dropna().unique())
    if raw_channels != expected_channels:
        print(f'WARNING: unexpected raw channels in {exp}: {sorted(raw_channels)}')

    toappend = []
    # sort=False keeps the tiles in order of first appearance, which is the
    # order that iterating over .unique() produced.
    for p, selected in anns.groupby('Processed Data File', sort=False):
        d = processed_annotation_values(selected)
        # p looks like '10X_<WELL>_Tile-<NUMBER>' (see processed_name).
        well = p.split('_')[1]
        for processed_type in processed_types:
            toappend.append({
                **d,
                'Dataset Name': f'{processed_type} {well} process',
                'Image Name': f'{p}.{processed_type}',
                'Channels': processed_channels[processed_type],
            })

    # DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat
    # is the supported way to add the new rows.
    anns = pd.concat([anns, pd.DataFrame(toappend)], ignore_index=True)
    anns.to_csv(f'../experiment{exp}/idr0071-experiment{exp}-annotation-updated.csv', index=False)

Updating A
Updating B
Updating C
Updating D
Updating E
Updating F


In [5]:
# Updated annotation table of every experiment, keyed by experiment letter.
updated_anns = {
    exp: pd.read_csv(
        f'../experiment{exp}/idr0071-experiment{exp}-annotation-updated.csv'
    )
    for exp in 'ABCDEF'
}

# Repeat the consistency check on the updated CSVs: their dataset and image
# names are compared with the import file list as they are, without renaming.
for exp in 'ABCDEF':
    anns = updated_anns[exp]
    annotated_datasets = anns['Dataset Name'].unique()

    # Datasets present in only one of: updated annotations, import list.
    annotated_set = set(annotated_datasets)
    imported = images[f'idr0071-feldman-crisprko/experiment{exp}']
    dataset_diff = annotated_set.symmetric_difference(imported.keys())
    print(f'\nUnmatched datasets: {exp}')
    print('\t' + '\n\t'.join(sorted(dataset_diff)))

    for dname in annotated_datasets:
        dataset_rows = anns.loc[anns['Dataset Name'] == dname]
        images_in_ann = set(dataset_rows['Image Name'].unique())
        images_in_dataset = imported[dname]
        images_diff = images_in_ann.symmetric_difference(images_in_dataset)
        if not images_diff:
            continue
        print(f'\nUnmatched images: {dname}')
        print('\t' + '\n\t'.join(sorted(images_diff)))

  if (await self.run_code(code, result,  async_=asy)):



Unmatched datasets: A
	

Unmatched datasets: B
	

Unmatched datasets: C
	

Unmatched datasets: D
	

Unmatched datasets: E
	

Unmatched datasets: F
	
