### Prepare fcs files for deep learning
This is a small example for formatting data from fcs files into numpy array, and save the metaData, marker names and the numpy array into allData.obj file. Use the script as a template to prepare your own fcs files for deep learning. 

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import rpy2 as rp
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import os 
import rpy2.robjects as ro
import pickle
from collections import Counter
from pathlib import Path
import csv
import multiprocessing as mp
import matplotlib.pyplot as plt
import re

In [3]:
base_dir = Path('../../aging/data')
print(base_dir.resolve())
pickle_dir = base_dir/'ResultFiles'/'pickles'
pickle_dir.mkdir(exist_ok=True)
fcs_dir = base_dir/Path('ResultFiles/Flow_cytometry_result')

mfest_path = base_dir/Path('SDY420-DR40_Subject_2_Flow_cytometry_result.txt')
mfest = pd.read_csv(mfest_path, sep='\t')

prediction_target = 'Subject Age'

/home/ubuntu/a/aging/data


In [4]:
def fcs2pkl_filename(fcs_path):
    return pickle_dir/fcs_path.with_suffix('.pkl').name

def import_fcs(fcs_path):
    fcs_path = str(fcs_path)
    r = rp.robjects.r
    r_code = ("library(flowCore);"+
          "library(MetaCyto);"+
          "fn = '"+ fcs_path+ "'; "+
          "fcs = read.FCS(fn,truncate_max_range = FALSE);"+
          "expr = fcs@exprs;"+
          "markers = markerFinder(fcs);"+
          "colnames(expr) = markers;"+
          "expr = as.data.frame(expr);"
         )
    expr =  r(r_code)
    df = pandas2ri.rpy2py(expr)
    df.columns = pd.Series([re.sub('\s*/\s*','/',_) for _ in df.columns])
    df.drop(columns=['TIME'], inplace=True)
    return df

def write_dataframe(fcs_path):
    df = import_fcs(fcs_path)
    wf = fcs2pkl_filename(fcs_path)
    print(wf)
    df.to_pickle(wf)

def load_dataframe(fcs_path):
    pkl = fcs2pkl_filename(fcs_path)
    return pd.read_pickle(pkl)

def read_df_metadata(arguments):
    mfest_index = arguments[0]
    pkl_path = arguments[1].pkl
    df = pd.read_pickle(pkl_path)
    return {
        'i': mfest_index,
        'n_rows': int(df.shape[0]),
        'n_columns': int(df.shape[1]),
        'markers': list(df.columns),
    }

In [5]:
mfest['fcs'] = mfest.apply(lambda row: (fcs_dir/row['File Name']).resolve(), axis=1)
mfest['pkl'] = mfest.apply(lambda row: fcs2pkl_filename(fcs_dir/row['fcs']).resolve(), axis=1)
mfest['pkl_exists'] = mfest.apply(lambda row: row['pkl'].exists(), axis=1)

In [6]:
pool = mp.Pool()

In [7]:
fcs_to_convert = mfest[~(mfest['pkl_exists'])]['fcs']
pool.map(write_dataframe, fcs_to_convert)
mfest['pkl_exists'] = mfest.apply(lambda row: row['pkl'].exists(), axis=1)

In [8]:
mfest['n_rows'] = pd.Series(dtype='int64')
mfest['n_columns'] = pd.Series(dtype='int64')
mfest['markers'] = pd.Series(dtype='object')

df_metadata = pool.map(read_df_metadata, mfest.iterrows())

for md in df_metadata:
    i = md['i']
    mfest.at[i,'n_rows'] = md['n_rows']
    mfest.at[i,'n_columns'] = md['n_columns']
    mfest.at[i,'markers'] = md['markers']

{'i': 0, 'n_rows': 41826, 'n_columns': 13, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'PSTAT1', 'PSTAT5', 'CD66B', 'CD33', 'CD3/CD14', 'PSTAT3', 'CD4/CD19']}
{'i': 1, 'n_rows': 41872, 'n_columns': 13, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'PSTAT1', 'PSTAT5', 'CD66B', 'CD33', 'CD3/CD14', 'PSTAT3', 'CD4/CD19']}
{'i': 2, 'n_rows': 30830, 'n_columns': 13, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'PSTAT1', 'PSTAT5', 'CD66B', 'CD33', 'CD3/CD14', 'PSTAT3', 'CD4/CD19']}
{'i': 3, 'n_rows': 49194, 'n_columns': 13, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'PSTAT1', 'PSTAT5', 'CD66B', 'CD33', 'CD3/CD14', 'PSTAT3', 'CD4/CD19']}
{'i': 4, 'n_rows': 74924, 'n_columns': 13, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SSC-H', 'SSC-W', 'PSTAT1', 'PSTAT5', 'CD66B', 'CD33', 'CD3/CD14', 'PSTAT3', 'CD4/CD19']}
{'i': 5, 'n_rows': 500000, 'n_columns': 15, 'markers': ['FSC-A', 'FSC-H', 'FSC-W', 'SSC-A', 'SS

In [9]:
pool.close()
pool.terminate()

In [10]:
mfest.to_csv('fcs_metadata.csv')
mfest

Unnamed: 0,Subject Accession,Species,Race,Race Specify,Ethnicity,Strain,Gender,Age Event,Age Event Specify,Subject Age,...,File Info ID,File Detail,File Name,Original File Name,fcs,pkl,pkl_exists,n_rows,n_columns,markers
0,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,532468,Flow cytometry result,RC4_080513_RC4_080513_11-020_IFNa_B04.532468.fcs,RC4_080513_RC4_080513_11-020_IFNa_B04.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,41826.0,13.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, PST..."
1,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,532469,Flow cytometry result,RC4_080513_RC4_080513_11-020_IL10_D04.532469.fcs,RC4_080513_RC4_080513_11-020_IL10_D04.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,41872.0,13.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, PST..."
2,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,532470,Flow cytometry result,RC4_080513_RC4_080513_11-020_IL21_E04.532470.fcs,RC4_080513_RC4_080513_11-020_IL21_E04.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,30830.0,13.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, PST..."
3,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,532471,Flow cytometry result,RC4_080513_RC4_080513_11-020_IL6_C04.532471.fcs,RC4_080513_RC4_080513_11-020_IL6_C04.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,49194.0,13.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, PST..."
4,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,532472,Flow cytometry result,RC4_080513_RC4_080513_11-020_US_A04.532472.fcs,RC4_080513_RC4_080513_11-020_US_A04.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,74924.0,13.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, PST..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1125,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,533134,Flow cytometry result,s_3-control-2.533134.fcs,s_3-control-2.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,500000.0,15.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, CD6..."
1126,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,533134,Flow cytometry result,s_3-control-2.533134.fcs,s_3-control-2.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,500000.0,15.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, CD6..."
1127,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,533134,Flow cytometry result,s_3-control-2.533134.fcs,s_3-control-2.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,500000.0,15.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, CD6..."
1128,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,533134,Flow cytometry result,s_3-control-2.533134.fcs,s_3-control-2.fcs,/home/ubuntu/a/aging/data/ResultFiles/Flow_cyt...,/home/ubuntu/a/aging/data/ResultFiles/pickles/...,True,500000.0,15.0,"[FSC-A, FSC-H, FSC-W, SSC-A, SSC-H, SSC-W, CD6..."


In [11]:
marker_counts = Counter()
for markers in mfest['markers']:
    marker_counts.update(markers)
marker_counts

Counter({'FSC-A': 1130,
         'FSC-H': 1130,
         'FSC-W': 1130,
         'SSC-A': 1130,
         'SSC-H': 1130,
         'SSC-W': 1130,
         'PSTAT1': 819,
         'PSTAT5': 819,
         'CD66B': 1114,
         'CD33': 1114,
         'CD3/CD14': 874,
         'PSTAT3': 819,
         'CD4/CD19': 1114,
         'PSTAT-1': 295,
         'PSTAT-5': 295,
         'CD14/CD3': 240,
         'PSTAT-3': 295,
         'APC-ALEXA 750-A': 560,
         'PACIFIC ORANGE-A': 560,
         'FITC-A': 16,
         'PE-A': 16,
         'PERCP-CY5-5-A': 16,
         'PE-CY7-A': 16,
         'PACIFIC BLUE-A': 16,
         'APC-A': 16,
         'ALEXA 700-A': 16})

In [23]:
def filter_by_markers(df, q_markers):
    q_markers = set(q_markers)
    return df[df.apply(
        lambda row: len(q_markers & set(row['markers'])) == len(q_markers),
    axis=1
    )]   
markers = ['FSC-A','FSC-H','FSC-W','SSC-A','SSC-H','SSC-W','PSTAT1','PSTAT5','CD66B','CD33','CD3/CD14','PSTAT3','CD4/CD19']
samples = filter_by_markers(mfest, markers)
samples = samples[samples.n_rows>1e4]
samples.shape

(787, 46)

In [18]:
len(markers)

12

In [20]:
samples.n_rows.describe()

count       787.000000
mean     257858.677255
std      195865.509198
min       10041.000000
25%       81775.000000
50%      174856.000000
75%      500000.000000
max      771997.000000
Name: n_rows, dtype: float64

In [21]:
Counter(samples['Expsample Treatement'])

Counter({'interferon alpha': 156,
         'IL-10': 161,
         'IL-21': 156,
         'IL-6': 156,
         'Unstim': 158})

In [22]:
fcs_df = pd.read_pickle(samples.iloc[0].pkl)

fcs_df

Unnamed: 0,FSC-A,FSC-H,FSC-W,SSC-A,SSC-H,SSC-W,PSTAT1,PSTAT5,CD66B,CD33,CD3/CD14,PSTAT3,CD4/CD19
1,105764.164062,74308.0,93278.781250,262143.000000,256592.0,66953.781250,506.679962,-4.240000,23.320000,1240.199951,1085.800049,1535.380005,70.070000
2,77193.359375,67022.0,75481.835938,137303.906250,95466.0,94257.109375,1093.919922,799.239929,558.619995,287.259979,1178.520020,363.440002,560.559998
3,55635.628906,51456.0,70859.312500,50348.937500,41268.0,79957.054688,407.039978,463.219971,612.679993,505.619965,85.400002,127.820000,315.699982
4,58125.570312,52400.0,72696.898438,63292.597656,48224.0,86014.093750,753.659973,481.239960,9193.379883,2867.299805,1163.270020,645.260010,5482.399902
5,69993.242188,64830.0,70755.468750,60593.835938,52974.0,74962.765625,778.039978,470.639984,7899.119629,3538.279785,1026.020020,546.700012,4466.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41822,64154.671875,57778.0,72768.882812,47112.757812,41174.0,74988.625000,587.239990,354.039978,13847.838867,5809.859863,871.080017,667.589966,3037.649902
41823,262143.000000,256280.0,67035.289062,262143.000000,256684.0,66929.781250,10040.319336,6278.379883,7396.679688,5583.019531,13742.690430,4083.309814,6661.270020
41824,262143.000000,152453.0,112689.179688,262143.000000,126577.0,135726.109375,2993.439941,2092.439941,2888.499756,1129.959961,4653.689941,1021.789978,2018.939941
41825,93842.546875,68284.0,90065.968750,59147.996094,40268.0,96263.109375,762.139954,561.799988,753.659973,220.479980,1664.080078,248.709991,519.750000


In [None]:
# expr_list = [
#     pd.read_pickle(row['pkl']).iloc[:1000,:]
#     for i, row in
#     mfest[mfest['Expsample Treatement']=='Unstim'].iterrows()
# ]

In [None]:
##### get common markers #####
markers = [_ for df in expr_list for _ in df.colnames]
marker_counts = Counter(markers)
markers = [k for k, c in marker_counts.items() if c == len(expr_list)]
print(markers)
sorted(marker_counts.items(), key=lambda _:_[1], reverse=True)

In [None]:
##### get common markers #####
markers = []
for i in range(len(expr_list)):
    markers.extend(expr_list[i].colnames)

# markers = Counter(markers)
# markers = [k for k, c in markers.items() if c == 3]
print(markers)

for i in range(0,len(expr_list)):
    t1 = expr_list[i] 
    with localconverter(ro.default_converter + pandas2ri.converter):
        t1 = ro.conversion.rpy2py(t1)
    expr_list[i] = t1.loc[:,markers]

In [None]:
##### transform and format into numpy array
def arcsinh(x):
    return(np.arcsinh(x/5))

coln = expr_list[0].columns.drop("TIME")
for i in range(len(expr_list)):
    t1 = expr_list[i].drop(columns="TIME")
    t1 = t1.apply(arcsinh)
    t1 = t1.values
    shape1 = list(t1.shape)+[1]
    t1 = t1.reshape(shape1)
    expr_list[i] = t1
    
expr_list = np.stack(expr_list)
print("The dimenstion of the data is: ", expr_list.shape)

In [None]:
allData = {"cytof_files":cytof_files, 
            "expr_list" : expr_list,
            "marker_names" : coln}

with open("allData.obj", "wb") as f:
    pickle.dump(allData, f)