### Prepare fcs files for deep learning
This is a small example for formatting data from fcs files into numpy array, and save the metaData, marker names and the numpy array into allData.obj file. Use the script as a template to prepare your own fcs files for deep learning. 

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import rpy2 as rp
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import os 
import rpy2.robjects as ro
import pickle
from collections import Counter
from pathlib import Path
import csv
import multiprocessing as mp
import matplotlib.pyplot as plt
import re
import fcsparser
import traceback

In [2]:
base_dir = Path('../../aging/data')
base_dir = base_dir.resolve()
print(base_dir)
pickle_dir = base_dir/'ResultFiles'/'cytof-pickles'
pickle_dir.mkdir(exist_ok=True)
print(pickle_dir)
fcs_dir = base_dir/Path('ResultFiles/CyTOF_result')
print(fcs_dir)

mfest_path = base_dir/Path('SDY420-DR40_Subject_2_CyTOF_result.txt')
mfest_all = pd.read_csv(mfest_path, sep='\t')

/home/ubuntu/a/aging/data
/home/ubuntu/a/aging/data/ResultFiles/cytof-pickles
/home/ubuntu/a/aging/data/ResultFiles/CyTOF_result


In [3]:
print(mfest_all.shape)
mfest_all.columns

(556, 40)


Index(['Subject Accession', 'Species', 'Race', 'Race Specify', 'Ethnicity',
       'Strain', 'Gender', 'Age Event', 'Age Event Specify', 'Subject Age',
       'Age Unit', 'Subject Phenotype', 'ARM Accession', 'ARM Name',
       'Study Accession', 'Study Brief Title', 'Biosample Accession',
       'Biosample Description', 'Biosample Name', 'Biosample Type',
       'Biosample Subtype', 'Study Time Collected',
       'Study Time Collected Unit', 'Study Time T0 Event',
       'Study Time T0 Event Specify', 'Biosample Treatment',
       'Planned Visit Accession', 'Planned Visit Name', 'Experiment Accession',
       'Expsample Accession', 'Expsample Description',
       'Expsample Result Schema', 'Expsample Treatement', 'Reagent Accession',
       'Reagent Name', 'Reagent Reporter Name', 'File Info ID', 'File Detail',
       'File Name', 'Original File Name'],
      dtype='object')

In [4]:
def fcs2pkl_filename(fcs_path):
    return pickle_dir/fcs_path.with_suffix('.pkl').name

def import_fcs(fcs_path):
    fcs_path = str(fcs_path)
    r = rp.robjects.r
    r_code = ("library(flowCore);"+
          "library(MetaCyto);"+
          "fn = '"+ fcs_path+ "'; "+
          "fcs = read.FCS(fn,truncate_max_range = FALSE);"+
          "expr = fcs@exprs;"+
          "markers = markerFinder(fcs);"+
          "colnames(expr) = markers;"+
          "expr = as.data.frame(expr);"
         )
    expr =  r(r_code)
    df = pandas2ri.rpy2py(expr)
    df.columns = pd.Series([re.sub('\s*/\s*','/',_) for _ in df.columns])
    df.drop(columns=['TIME'], inplace=True)
    return df

def write_dataframe(fcs_path):
    if fcs_path.suffix != '.fcs':
        return
    df = import_fcs(fcs_path)
    wf = fcs2pkl_filename(fcs_path)
    df.to_pickle(wf)

def load_dataframe(fcs_path):
    pkl = fcs2pkl_filename(fcs_path)
    return pd.read_pickle(pkl)

def read_df_metadata(arguments):
    mfest_index = arguments[0]
    pkl_path = arguments[1].pkl
    if not pkl_path.exists():
        return None
    df = pd.read_pickle(pkl_path)
    return {
        'i': mfest_index,
        'n_rows': int(df.shape[0]),
        'n_columns': int(df.shape[1]),
        'markers': list(df.columns),
    }

In [6]:
mfest_all['File Name'].apply(lambda fn: fn.endswith('.fcs')).value_counts()

True     284
False    272
Name: File Name, dtype: int64

In [10]:
mfest = mfest_all[mfest_all['File Name'].apply(lambda fn: fn.endswith('.fcs'))].copy()
mfest.shape

(284, 40)

In [11]:
mfest['fcs'] = mfest_all.apply(lambda row: (fcs_dir/row['File Name']).resolve(), axis=1)
mfest['pkl'] = mfest.apply(lambda row: fcs2pkl_filename(fcs_dir/row['fcs']).resolve(), axis=1)
mfest['pkl_exists'] = mfest.apply(lambda row: row['pkl'].exists(), axis=1)

In [12]:
mfest.pkl_exists.value_counts()

True    284
Name: pkl_exists, dtype: int64

In [13]:
spot_check = import_fcs(mfest['fcs'].iloc[0])
spot_check

Unnamed: 0,CELL_LENGTH,DEAD,CD19,CD4,CD8,IGD,CD85J,CD16,CD3,CD38,...,CD33,CD28,CD24,CD161,TCRGD,CD56,HLADR,CD25,DNA1,DNA2
1,26.0,-0.534810,-0.883897,1.706276,-0.389549,1.095162,49.757641,12.978461,16.902872,4687.463379,...,-0.372071,-0.027616,-0.383645,-0.906472,-0.242294,-0.225837,-0.867546,-0.280114,202.016907,431.376648
2,37.0,-0.501825,-0.987176,60.403698,-0.105791,7.882118,-0.244238,-0.173425,249.101242,38.992962,...,11.001021,-0.182922,-0.707323,-0.652739,-0.600262,-0.914205,4.501838,-0.919009,113.373032,198.430557
3,20.0,-0.937247,-0.723077,65.363335,-0.308973,1.983649,1.986390,1.593549,148.718506,67.259468,...,1.809853,3.281014,-0.941640,-0.512368,9.840481,-0.139726,-0.828036,-0.629730,162.773148,482.315369
4,22.0,-0.572791,0.669641,60.976925,3.412380,1.510986,-0.677291,0.521174,37.451569,-0.306992,...,-0.503387,19.955927,-0.111736,2.431672,-0.597246,-0.695127,-0.014068,1.166284,308.275940,470.220825
5,32.0,0.022631,-0.114596,4.590069,1.202479,0.104959,152.993073,0.917886,-0.331726,12.559181,...,142.294800,2.811678,-0.504452,-0.492080,-0.046891,1.905674,5069.940918,39.427078,168.363663,371.216034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150603,33.0,25.963648,-0.736181,-0.811943,-0.430332,-0.358188,-0.758848,0.899444,9.686301,1.237110,...,-0.356411,-0.372350,8.002898,16.131897,-0.624776,-0.286313,-0.912130,-0.679784,268.397095,434.451538
150604,25.0,-0.023051,-0.157124,78.861618,11.280008,-0.081290,-0.991805,1.676905,245.862762,463.958405,...,3.164071,14.314236,2.366328,-0.564090,-0.368061,-0.740030,-0.840149,9.426412,162.650085,315.773132
150605,10.0,15.969790,-0.945064,-0.684829,-0.750039,-0.824051,-0.132160,-0.827413,-0.809735,-0.735797,...,-0.872124,-0.455257,-0.695967,-0.560733,-0.335767,-0.446366,-0.279087,-0.120687,75.549614,63.773636
150606,41.0,-0.046112,0.961171,-0.180391,69.716980,-0.316974,-0.564795,-0.370083,123.145561,17.038406,...,0.185062,14.354417,-0.328775,184.206970,6.678673,-0.850368,-0.076979,-0.556896,203.132095,406.615112


In [14]:
spot_check.columns

Index(['CELL_LENGTH', 'DEAD', 'CD19', 'CD4', 'CD8', 'IGD', 'CD85J', 'CD16',
       'CD3', 'CD38', 'CD27', 'CD14', 'CD94', 'CCR7', 'CD45RA', 'CD20',
       'CD127', 'CD33', 'CD28', 'CD24', 'CD161', 'TCRGD', 'CD56', 'HLADR',
       'CD25', 'DNA1', 'DNA2'],
      dtype='object')

In [15]:
pool = mp.Pool()

In [16]:
fcs_to_convert = mfest[~(mfest['pkl_exists'])]['fcs']
pool.map(write_dataframe, fcs_to_convert)
mfest['pkl_exists'] = mfest.apply(lambda row: row['pkl'].exists(), axis=1)

In [17]:
mfest['n_rows'] = pd.Series(dtype='int64')
mfest['n_columns'] = pd.Series(dtype='int64')
mfest['markers'] = pd.Series(dtype='object')

df_metadata = pool.map(read_df_metadata, mfest.iterrows())

for md in df_metadata:
    if md is None:
        continue
    i = md['i']
    mfest.at[i,'n_rows'] = md['n_rows']
    mfest.at[i,'n_columns'] = md['n_columns']
    mfest.at[i,'markers'] = md['markers']

In [18]:
pool.close()
pool.terminate()

In [19]:
mfest.to_csv('fcs_metadata.csv')
mfest.to_pickle('fcs_metadata.pkl')
mfest

Unnamed: 0,Subject Accession,Species,Race,Race Specify,Ethnicity,Strain,Gender,Age Event,Age Event Specify,Subject Age,...,File Info ID,File Detail,File Name,Original File Name,fcs,pkl,pkl_exists,n_rows,n_columns,markers
0,SUB137160,Homo sapiens,Asian,,Not Hispanic or Latino,,Female,Age at enrollment,,44,...,573505,CyTOF result,RC4 11-0005_cells_found.573505.fcs,RC4 11-0005_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,150607.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
2,SUB137169,Homo sapiens,White,,Not Hispanic or Latino,,Female,Age at enrollment,,41,...,573507,CyTOF result,Mm 041513-RC4-11-0015_cells_found.573507.fcs,Mm 041513-RC4-11-0015_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,317311.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
4,SUB137172,Homo sapiens,Asian,,Not Hispanic or Latino,,Female,Age at enrollment,,45,...,573509,CyTOF result,RC4-11-0018_cells_found.573509.fcs,RC4-11-0018_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,222633.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
6,SUB137174,Homo sapiens,Black or African American,,Not Hispanic or Latino,,Female,Age at enrollment,,53,...,573511,CyTOF result,11-0020_cells_found.573511.fcs,11-0020_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,244654.0,28.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
8,SUB137182,Homo sapiens,White,,Not Hispanic or Latino,,Male,Age at enrollment,,71,...,573513,CyTOF result,053012-MeenaRC4 11-0029_cells_found.573513.fcs,053012-MeenaRC4 11-0029_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,412433.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,573807,CyTOF result,2318--RC4 1213 control_cells_found.573807.fcs,2318--RC4 1213 control_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,200180.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
549,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,573808,CyTOF result,RC4 1213controlB_cells_found.573808.fcs,RC4 1213controlB_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,100833.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
551,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,573809,CyTOF result,RC4-1213-1control_cells_found.573809.fcs,RC4-1213-1control_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,264809.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
553,SUB147387,Homo sapiens,Not Specified,,Not Specified,,Not Specified,Not Specified,,99,...,573810,CyTOF result,RC41213 control_cells_found.573810.fcs,RC41213 control_cells_found.fcs,/home/ubuntu/a/aging/data/ResultFiles/CyTOF_re...,/home/ubuntu/a/aging/data/ResultFiles/cytof-pi...,True,143618.0,27.0,"[CELL_LENGTH, DEAD, CD19, CD4, CD8, IGD, CD85J..."
