### Pix2Pix modeling

Model repo: https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix

Paper: https://arxiv.org/pdf/1611.07004.pdf

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from cellvision_lib import train_test_val
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import math
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd

def train_test_val4(folder_path, channels = 1, train_pp = .6, test_pp = .2, val_pp = .2, set_seed = 1):
    """Split the image files of ``folder_path`` into train / test / val groups.

    Files are expected to be named ``sample<S>_channel<C>_z<Z>[.<ext>]``. All
    files sharing the same sample number and z-plane form one unit, and each
    unit is assigned to exactly one split. Files that do not follow the naming
    scheme, and anything inside sub-directories, are ignored.

    Args:
        folder_path: directory holding the image files.
        channels: channel number to pair with channel 6 in every unit.
        train_pp: fraction of units assigned to the training split.
        test_pp: fraction of units assigned to the test split.
        val_pp: fraction for validation; only used for the sum check, the
            validation split is whatever remains after train and test.
        set_seed: seed passed to ``np.random.seed`` before each draw.

    Returns:
        Tuple ``(train, test, val)``. Each element is a list with one entry
        per unit; an entry is the sorted list of ``'<folder>/<file>'`` paths
        whose channel is ``channels`` or 6.

    Raises:
        Exception: if the three fractions do not sum to 1, or if ``channels``
            is outside the accepted range.
        FileNotFoundError: if ``folder_path`` cannot be listed.
    """
    # Compare with a tolerance: e.g. .7 + .2 + .1 evaluates to 0.9999999999999999.
    if not math.isclose(train_pp + test_pp + val_pp, 1.0, abs_tol=1e-9):
        raise Exception("train, test, val percentages must equal 1")

    # The requested channel is always paired with channel 6.
    channel_set = [channels, 6]
    # NOTE(review): 6 also passes this check although the message says 1-5;
    # kept permissive so existing callers are unaffected - confirm intent.
    for c in channel_set:
        if c not in range(1, 7):
            raise Exception("channels can only be in range 1-5")

    # Only the top-level directory is listed; nested folders are ignored.
    try:
        root_path, _, root_files = next(os.walk(folder_path))
    except StopIteration:
        raise FileNotFoundError("cannot list directory: {}".format(folder_path))

    # Group files by "<sample>_<zplane>". Sorting the names first makes the
    # key order, and therefore the seeded draw, independent of the order in
    # which the filesystem returns entries.
    name_pattern = re.compile(r'^sample(\d+)_channel(\d+)_z(\d+)(?:\..*)?$')
    samples = defaultdict(list)
    for file in sorted(root_files):
        match = name_pattern.match(file)
        if match is None:
            continue
        s_n, c_n, z_n = match.groups()
        samples['{}_{}'.format(s_n, z_n)].append((c_n, file))
    samples_list = list(samples.keys())

    def _draw(pool, size):
        # Seeded draw without replacement; an empty draw never touches numpy.
        if size <= 0 or not pool:
            return []
        np.random.seed(set_seed)
        return [str(key) for key in np.random.choice(pool, size=size, replace=False)]

    # Select the training units.
    n_units = len(samples_list)
    train = _draw(samples_list, min(round(n_units * train_pp), n_units))

    # Remove them while keeping list order (a set difference would reorder
    # the pool unpredictably between interpreter runs).
    train_keys = set(train)
    remaining = [key for key in samples_list if key not in train_keys]

    # Rounding can ask for more test units than are left, so clamp.
    test = _draw(remaining, min(round(n_units * test_pp), len(remaining)))

    # Validation is everything not drawn for train or test.
    test_keys = set(test)
    val = [key for key in remaining if key not in test_keys]

    def finalize_paths(split_set):
        # One sorted path list per unit, restricted to the selected channels.
        final = []
        for key in split_set:
            temp = ['{}/{}'.format(root_path, channel_file)
                    for c_n, channel_file in samples[key]
                    for c in channel_set
                    if c_n == str(c)]
            final.append(sorted(temp))
        return final

    train_final = finalize_paths(train)
    test_final = finalize_paths(test)
    val_final = finalize_paths(val)

    return (train_final, test_final, val_final)

In [3]:
# Preprocessing the data for the pix2pix model
import os
import glob
from shutil import copyfile

MAX_DEPTH = 100
NUM_SAMPLES = 109

# folder_path = '/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized'
# train, test, val = train_test_val(folder_path, channels = 1, train_pp = .67, test_pp = .165, val_pp = .165, set_seed = 1)

# train[0:10]

def clear_test_files(pix2pix_path):
    """Remove everything matched by ``<pix2pix_path>/<A|B>/<test|train|val>/*``.

    Each of the six directories is printed right before its entries are
    deleted. The directories themselves are left in place.
    """
    # Visiting order: A/test, A/train, A/val, then the same three under B.
    targets = [(side, subset)
               for side in ('A', 'B')
               for subset in ('test', 'train', 'val')]
    for side, subset in targets:
        print('{root}/{split}/{inner}'.format(root=pix2pix_path, split=side, inner=subset))
        pattern = '{root}/{split}/{inner}/*'.format(root=pix2pix_path, split=side, inner=subset)
        for stale in glob.glob(pattern):
            os.remove(stale)
                
def _copy_pairs_for_split(image_pairs, pix_folder_path, split, verbose=False):
    """Copy ``(comp, ref)`` path pairs into ``A/<split>`` and ``B/<split>``."""
    for side in ('A', 'B'):
        os.makedirs('{}/{}/{}'.format(pix_folder_path, side, split), exist_ok=True)

    # Each split numbers its own files from 0; the same index is used under
    # A and B so the two halves of a pair end up with the same file name.
    for i, (comp, ref) in enumerate(image_pairs):
        if verbose:
            print(i)
            print(comp)
            print(ref)
        # NOTE(review): sources are .tif files but are stored with an '.img'
        # suffix - confirm the pix2pix data loader picks up that extension.
        new_comp_path = pix_folder_path + '/A/{}/{}.img'.format(split, i)
        new_ref_path = pix_folder_path + '/B/{}/{}.img'.format(split, i)
        copyfile(comp, new_comp_path)
        copyfile(ref, new_ref_path)


def setup_images_for_pix2pix(
        pix_folder_path='/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing',
        folder_path='/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized',
        max_images=10):
    """Populate the pix2pix ``A``/``B`` folders with a small train/test/val sample.

    Existing files under ``pix_folder_path`` are cleared first. The source
    folder is then split with ``train_test_val`` (channel 1, 67 / 16.5 / 16.5
    percent, seed 1) and the first ``max_images`` pairs of every split are
    copied: the first path of a pair goes to ``A/<split>/<i>.img`` and the
    second to ``B/<split>/<i>.img``.

    Args:
        pix_folder_path: root of the pix2pix dataset folders to fill.
        folder_path: folder with the source images handed to ``train_test_val``.
        max_images: maximum number of pairs copied per split.

    Returns:
        None. The work is done through file-system side effects and prints.
    """
    clear_test_files(pix_folder_path)
    train, test, val = train_test_val(folder_path,
                                      channels = 1,
                                      train_pp = .67,
                                      test_pp = .165,
                                      val_pp = .165,
                                      set_seed = 1)
    train_images = train[0:max_images]
    test_images = test[0:max_images]
    val_images = val[0:max_images]
    print(train_images)

    _copy_pairs_for_split(train_images, pix_folder_path, 'train', verbose=True)
    _copy_pairs_for_split(test_images, pix_folder_path, 'test')
    _copy_pairs_for_split(val_images, pix_folder_path, 'val')


setup_images_for_pix2pix()
# List the copied files. The pattern needs the trailing '/*': globbing a bare
# directory path only matches the directory itself, not its contents.
print(sorted(glob.glob('/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/A/train/*')))
print(sorted(glob.glob('/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/B/train/*')))

# print(len(channel1_comps))

/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/A/test
/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/A/train
/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/A/val
/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/B/test
/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/B/train
/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-pix2pix/testing/B/val
[['/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample68_channel1_z93.tif', '/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample68_channel6_z93.tif'], ['/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample57_channel1_z33.tif', '/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample57_channel6_z33.tif'], ['/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample74_channel1_z35.tif', '/gpfs/data/lionnetlab/cellvision/pilotdata/20181009-normalized/sample74_channel6_z3