### Prepare fcs files for deep learning
This is a small example of how to format data from fcs files into a numpy array and save the metadata, the marker names and the numpy array into the file allData.obj. Use this notebook as a template for preparing your own fcs files for deep learning. 

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import rpy2 as rp
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import os 
import rpy2.robjects as ro
import pickle
from collections import Counter


# import R's "flowCore" package
# NOTE(review): the returned handle `utils` is not referenced again in the
# visible cells (the R snippet below calls library(flowCore) itself), so this
# line presumably only serves as an early check that flowCore is installed.
utils = importr('flowCore')

In [2]:
##### list fcs files #####
# metaData.csv holds one row per sample; its "name" column is the fcs file name.
cytof_files = pd.read_csv("metaData.csv")
print(cytof_files)
# Build the full path of every fcs file, relative to the current working directory.
working_dir = os.getcwd()
fn = [os.path.join(working_dir, fcs_name) for fcs_name in cytof_files["name"]]

          name study_accession  CMV_Ab
0  sample1.fcs          study1    True
1  sample2.fcs          study2   False
2  sample3.fcs          study3    True


In [3]:
##### read fcs file using the flowCore R package #####
# flowCore is a very well maintained R package for reading and analyzing fcs
# files, while many of the fcs readers available in python are a little buggy.
# Therefore, it is worth the trouble to read the fcs files using R.
# markerFinder() (loaded with MetaCyto) provides the column names.

N_CELLS = 10000   # number of cells drawn (with replacement) from every fcs file
R_SEED = 0        # seed of R's random generator, so the draw is reproducible

r = ro.r
# Attach the R packages once instead of once per file.
r("library(flowCore); library(MetaCyto)")
r("set.seed(%d)" % R_SEED)

# The file name is passed as a function argument rather than pasted into the R
# source, so paths containing quotes or backslashes are handled correctly.
read_fcs_sample = r("""
function(fn, n_cells) {
    fcs <- read.FCS(fn, truncate_max_range = FALSE)
    expr <- fcs@exprs
    colnames(expr) <- markerFinder(fcs)
    expr <- as.data.frame(expr)
    # sample.int() stops with an error on an empty file, whereas
    # sample(1:nrow(expr), ...) would silently index rows 1 and 0
    expr[sample.int(nrow(expr), n_cells, replace = TRUE), ]
}
""")

# One R data.frame with N_CELLS rows per fcs file, in the order of `fn`.
expr_list = [read_fcs_sample(fn_i, N_CELLS) for fn_i in fn]

In [4]:
# Display the first entry of expr_list, i.e. the table R returned for the first fcs file.
expr_list[0]

TIME,EVENT_LENGTH,VIABILITY,...,PT195DI,CD45_CONV,CD11B
582181.750000,27.000000,1.506658,...,0.000000,0.000000,67.702408
559240.750000,42.000000,16.723312,,0.000000,0.757744,34.693893
17675.599609,30.000000,0.000000,,10.260152,0.391091,23.878590
124038.945312,33.000000,0.055182,,1.671392,7.317442,463.809601
...,...,...,,...,...,...
668463.187500,46.000000,0.000000,,11.606791,0.000000,23.184088
560332.312500,53.000000,0.000000,,0.000000,0.000000,0.690570
944064.187500,43.000000,10.153085,,1.330973,5.900362,243.774139
5112.617188,25.000000,0.000000,,3.251579,2.731082,1.664887


In [5]:
# Column names of the first entry; these are the marker names assigned in the R snippet.
expr_list[0].colnames

0,1,2,3,4,5,6
'TIME','EVENT_LE...,'VIABILITY',...,'PT195DI','CD45_CONV','CD11B'


In [19]:
##### get common markers #####
# Count in how many files each marker occurs. Every marker is counted at most
# once per file: dict.fromkeys() removes repeated column names while keeping
# their order, so a name duplicated inside one file cannot inflate the count.
marker_counts = Counter()
for sample_expr in expr_list:
    marker_counts.update(list(dict.fromkeys(sample_expr.colnames)))

# A marker is "common" when it is present in every file. The threshold is
# derived from the data instead of being hard-coded to 3 files.
n_files = len(expr_list)
markers = [name for name, count in marker_counts.items() if count == n_files]
print(markers)
# Show all markers, most frequent first.
marker_counts.most_common()

['TIME', 'CD57', 'CD19', 'CD45RA', 'CD4', 'CD8', 'CD20', 'CD16', 'CD127', 'CD123', 'CXCR5', 'CD86', 'CD27', 'CD11C', 'CD14', 'CD56', 'CCR6', 'CD25', 'CCR7', 'CD3', 'CD38', 'CD161', 'CXCR3', 'HLADR', 'CD11B']


[('TIME', 3),
 ('CD57', 3),
 ('CD19', 3),
 ('CD45RA', 3),
 ('CD4', 3),
 ('CD8', 3),
 ('CD20', 3),
 ('CD16', 3),
 ('CD127', 3),
 ('CD123', 3),
 ('CXCR5', 3),
 ('CD86', 3),
 ('CD27', 3),
 ('CD11C', 3),
 ('CD14', 3),
 ('CD56', 3),
 ('CCR6', 3),
 ('CD25', 3),
 ('CCR7', 3),
 ('CD3', 3),
 ('CD38', 3),
 ('CD161', 3),
 ('CXCR3', 3),
 ('HLADR', 3),
 ('CD11B', 3),
 ('DNA', 2),
 ('CELL_LENGTH', 2),
 ('DEAD', 2),
 ('BEAD', 2),
 ('IGD', 2),
 ('CD85J', 2),
 ('CD94', 2),
 ('CD33', 2),
 ('CD28', 2),
 ('CD24', 2),
 ('ICOS', 2),
 ('TCRGD', 2),
 ('PD-1', 2),
 ('DNA1', 2),
 ('DNA2', 2),
 ('EVENT_LENGTH', 1),
 ('VIABILITY', 1),
 ('CD45', 1),
 ('XE131DI', 1),
 ('CS133DI', 1),
 ('BA138DI', 1),
 ('CE140DI', 1),
 ('PR141DI', 1),
 ('CE142DI', 1),
 ('CD141', 1),
 ('CD1C', 1),
 ('CD66B', 1),
 ('CCR5', 1),
 ('CHIKV', 1),
 ('CD80', 1),
 ('CCR4', 1),
 ('CD40', 1),
 ('CX3CR1', 1),
 ('CD209', 1),
 ('PD1', 1),
 ('LU176DI', 1),
 ('CD54', 1),
 ('OS189DI', 1),
 ('CD45_ACUTE', 1),
 ('PT195DI', 1),
 ('CD45_CONV', 1)]

In [None]:
# Turn every R data frame into a pandas DataFrame and keep only the common markers.
r_to_pandas = ro.default_converter + pandas2ri.converter
for idx, r_frame in enumerate(expr_list):
    with localconverter(r_to_pandas):
        pandas_frame = ro.conversion.rpy2py(r_frame)
    expr_list[idx] = pandas_frame.loc[:, markers]

In [5]:
##### transform and format into numpy array
def arcsinh(x, cofactor=5):
    """Return the inverse hyperbolic sine transform ``arcsinh(x / cofactor)``.

    Parameters
    ----------
    x : scalar, numpy array or pandas object
        Values to transform; anything that supports division and ``np.arcsinh``.
    cofactor : float, optional
        Positive scaling factor applied before the transform. The default of 5
        reproduces the behaviour of the original one-argument function.

    Returns
    -------
    Same kind of object as ``np.arcsinh`` returns for ``x / cofactor``.

    Raises
    ------
    ValueError
        If ``cofactor`` is not strictly positive.
    """
    if cofactor <= 0:
        raise ValueError("cofactor must be positive, got {!r}".format(cofactor))
    return np.arcsinh(x / cofactor)

# Marker names of the final array: every common marker except the TIME channel.
coln = expr_list[0].columns.drop("TIME")
for idx, frame in enumerate(expr_list):
    # drop TIME, arcsinh-transform each column and switch to a plain numpy array
    transformed = frame.drop(columns="TIME").apply(arcsinh).values
    # append a trailing axis of length 1 to the 2-D array
    expr_list[idx] = transformed.reshape(transformed.shape + (1,))

# Stack the per-file arrays along a new leading axis (one entry per file).
expr_list = np.stack(expr_list)
print("The dimenstion of the data is: ", expr_list.shape)

The dimenstion of the data is:  (3, 10000, 24, 1)


In [6]:
# Bundle the metadata table, the stacked expression array and the marker names,
# then serialise the bundle to allData.obj with pickle.
allData = dict(
    cytof_files=cytof_files,
    expr_list=expr_list,
    marker_names=coln,
)

with open("allData.obj", "wb") as out_file:
    pickle.dump(allData, out_file)