### Prepare fcs files for deep learning
This is a small example for formatting data from fcs files into numpy array, and save the metaData, marker names and the numpy array into allData.obj file. Use the script as a template to prepare your own fcs files for deep learning. 

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import rpy2 as rp
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import os 
import rpy2.robjects as ro
import pickle
from collections import Counter
from pathlib import Path
import csv
import multiprocessing as mp

# import R's "flowCore" package
utils = importr('flowCore')

In [2]:
##### list fcs files #####
cytof_files = pd.read_csv("metaData.csv")
print(cytof_files)
fn = [os.path.join(os.getcwd(),f) for f in cytof_files.name]

          name study_accession  CMV_Ab
0  sample1.fcs          study1    True
1  sample2.fcs          study2   False
2  sample3.fcs          study3    True


In [3]:
aging_dir = Path('/home/ubuntu/a/aging/data')
mfest = aging_dir/Path('SDY420-DR40_Subject_2_Flow_cytometry_result.txt')
fcs_dir = aging_dir/Path('ResultFiles/Flow_cytometry_result')
prediction_target = 'Subject Age'
fcs_files = []
with mfest.open() as f:
    reader = csv.DictReader(f, delimiter='\t', lineterminator='\n')
    for d in reader:
        stain = d['Expsample Treatement']
        if stain != 'Unstim':
            continue
        fp = fcs_dir/d['File Name']
        fcs_files.append((int(d[prediction_target]), fp.resolve()))
fn = [x[1] for x in fcs_files]

In [4]:
def import_fcs(fcs_path):
    fcs_path = str(fcs_path)
    r = rp.robjects.r
    r_code = ("library(flowCore);"+
          "library(MetaCyto);"+
          "fn = '"+ fcs_path+ "'; "+
          "fcs = read.FCS(fn,truncate_max_range = FALSE);"+
          "expr = fcs@exprs;"+
          "markers = markerFinder(fcs);"+
          "colnames(expr) = markers;"+
          "expr = as.data.frame(expr);"+
         # subsample 10,000 cells
         "expr = expr[sample(1:nrow(expr),10000,replace = TRUE),]")
    expr =  r(r_code)
    return expr

In [5]:
pool = mp.Pool()
expr_list = pool.map(import_fcs, fn)
pool.close()
pool.terminate()

In [6]:
expr_list[0]

FSC-A,FSC-H,FSC-W,...,PSTAT3,CD4 / CD19,TIME
96148.921875,94316.000000,66809.617188,...,525.909973,4142.600098,5480.200195
262143.000000,207621.000000,82745.992188,,1063.369995,5144.369629,5784.899902
71296.500000,60788.000000,76865.289062,,780.779968,4697.000000,7269.399902
107906.101562,101668.000000,69557.125000,,269.500000,1683.219971,2911.000000
...,...,...,,...,...,...
97579.203125,85367.000000,74911.273438,,1115.729980,4070.219971,1259.400024
82043.609375,76722.000000,70081.726562,,453.529999,4227.299805,5085.299805
52709.820312,45561.000000,75819.031250,,847.769958,3373.369873,765.700012
108647.343750,103233.000000,68973.218750,,257.179993,1293.599976,6191.700195


In [7]:
expr_list[0].colnames

0,1,2,3,4,5,6
'FSC-A','FSC-H','FSC-W',...,'PSTAT3','CD4 / CD...,'TIME'


In [20]:
len(expr_list)

227

In [31]:
##### get common markers #####
markers = [_ for df in expr_list for _ in df.colnames]
marker_counts = Counter(markers)
markers = [k for k, c in marker_counts.items() if c == len(expr_list)]
print(markers)
sorted(marker_counts.items(), key=lambda _:_[1], reverse=True)

['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'TIME']


[('FSC-A', 227),
 ('FSC-H', 227),
 ('FSC-W', 227),
 ('SSC-A', 227),
 ('SSC-H', 227),
 ('SSC-W', 227),
 ('TIME', 227),
 ('CD66B', 223),
 ('CD33', 223),
 ('PSTAT1', 164),
 ('PSTAT5', 164),
 ('PSTAT3', 164),
 ('CD4 / CD19', 158),
 ('APC-ALEXA 750-A', 112),
 ('PACIFIC ORANGE-A', 112),
 ('CD3 / CD14', 87),
 ('CD3/CD14', 76),
 ('CD4/CD19', 65),
 ('PSTAT-1', 59),
 ('PSTAT-5', 59),
 ('PSTAT-3', 59),
 ('CD14/CD3', 24),
 ('CD14 / CD3', 24),
 ('CD3/ CD14', 12),
 ('FITC-A', 4),
 ('PE-A', 4),
 ('PERCP-CY5-5-A', 4),
 ('PE-CY7-A', 4),
 ('PACIFIC BLUE-A', 4),
 ('APC-A', 4),
 ('ALEXA 700-A', 4)]

In [6]:
##### get common markers #####
markers = []
for i in range(len(expr_list)):
    markers.extend(expr_list[i].colnames)

# markers = Counter(markers)
# markers = [k for k, c in markers.items() if c == 3]
print(markers)

for i in range(0,len(expr_list)):
    t1 = expr_list[i] 
    with localconverter(ro.default_converter + pandas2ri.converter):
        t1 = ro.conversion.rpy2py(t1)
    expr_list[i] = t1.loc[:,markers]

[]


In [7]:
##### transform and format into numpy array
def arcsinh(x):
    return(np.arcsinh(x/5))

coln = expr_list[0].columns.drop("TIME")
for i in range(len(expr_list)):
    t1 = expr_list[i].drop(columns="TIME")
    t1 = t1.apply(arcsinh)
    t1 = t1.values
    shape1 = list(t1.shape)+[1]
    t1 = t1.reshape(shape1)
    expr_list[i] = t1
    
expr_list = np.stack(expr_list)
print("The dimenstion of the data is: ", expr_list.shape)

KeyError: "['TIME'] not found in axis"

In [7]:
allData = {"cytof_files":cytof_files, 
            "expr_list" : expr_list,
            "marker_names" : coln}

with open("allData.obj", "wb") as f:
    pickle.dump(allData, f)