In [2]:
##### This is from https://github.com/eiriniar/CellCnn/blob/0413a9f49fe0831c8fe3280957fb341f9e028d2d/cellCnn/examples/NK_cell.ipynb
import os, sys, errno, glob
import numpy as np
import pandas as pd

import cellCnn
from cellCnn.utils import loadFCS, ftrans, mkdir_p, get_items
from cellCnn.model import CellCnn
from cellCnn.plotting import plot_results
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [3]:
# Input side: the gated NK-cell FCS files are looked up below the `examples`
# folder of the imported cellCnn package.
package_root = cellCnn.__path__[0]
WDIR = os.path.join(package_root, 'examples')
FCS_DATA_PATH = os.path.join(WDIR, 'NK_cell_dataset/NK_cell_dataset', 'gated_NK')

# Output side: everything this notebook produces goes to OUTDIR.
# NOTE(review): mkdir_p presumably creates the directory when it is missing
# (like `mkdir -p`) -- confirm in cellCnn.utils.
OUTDIR = os.path.join(WDIR, 'output_NK_test')
mkdir_p(OUTDIR)

In [4]:
# Load one FCS file of the dataset and print its channel names, so the markers
# of interest can be picked by name in the next step.
# glob returns its matches in arbitrary order, so the list is sorted to make the
# inspected file the same on every run, and an empty data directory is reported
# with an explicit message instead of a bare IndexError.
# NOTE(review): this assumes every FCS file in the directory shares the same
# channel layout -- confirm for the dataset at hand.
fcs_files = sorted(glob.glob(os.path.join(FCS_DATA_PATH, '*.fcs')))
if not fcs_files:
    raise IOError('no .fcs files found in ' + FCS_DATA_PATH)
data_fcs = loadFCS(fcs_files[0], transform=None, auto_comp=False)
print(data_fcs.channels)

  warn("text in segment does not start and end with delimiter")


['Time', 'Cell_length', 'CD3', 'Dead', '(La139)Dd', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25', 'DNA1', 'DNA2']


In [5]:
# Markers kept for the analysis, identified by their channel name in the FCS file.
markers = [
    'CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57',
    '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1',
    'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C',
    '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1',
    'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7',
    '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5',
    'CD25',
]

# Position of every selected marker within the channel list of the loaded file;
# used later to slice the marker columns out of each sample.
marker_idx = list(map(data_fcs.channels.index, markers))
nmark = len(markers)

In [6]:
# Read the sample table from a CSV file: the first column holds the sample
# names, the second column the label of each sample (0: CMV-, 1: CMV+).
# The prior CMV infection status is taken from the original study
# (Horowitz et al. 2013).
csv_file = 'NK_fcs_samples_with_labels.csv'
fcs_info = np.array(pd.read_csv(csv_file, sep=','))
sample_ids, sample_labels = fcs_info[:, 0], fcs_info[:, 1].astype(int)

In [7]:
# Here we randomly split the samples in training/validation/test sets:
# per class, `ntrain_per_class` samples are drawn at random as the training pool,
# the remaining samples of that class form the test set, and the training pool
# is finally cut into training and validation data.

# set random seed for reproducible results
np.random.seed(12345)

# cofactor for arcsinh transformation
cofactor = 5

# sample indices of the two classes (0: CMV-, 1: CMV+)
group1 = np.where(sample_labels == 0)[0]
group2 = np.where(sample_labels == 1)[0]
l1, l2 = len(group1), len(group2)
ntrain_per_class = 7
ntest_group1 = l1 - ntrain_per_class
ntest_group2 = l2 - ntrain_per_class


def _split_group(group, ntrain):
    """Randomly pick `ntrain` indices of `group` for training.

    Returns a pair of lists `(train_idx, test_idx)`; `test_idx` holds the
    indices of `group` that were not drawn, in their original order.
    Raises ValueError (from np.random.choice) if `group` has fewer than
    `ntrain` entries.
    """
    train_idx = list(np.random.choice(group, size=ntrain, replace=False))
    drawn = set(train_idx)
    test_idx = [i for i in group if i not in drawn]
    return train_idx, test_idx


def _load_samples(indices):
    """Load and transform the FCS files of the given sample indices.

    For every index, the file named `sample_ids[idx]` inside FCS_DATA_PATH is
    loaded, reduced to the columns listed in `marker_idx` and passed through
    `ftrans` with `cofactor`. Returns a list with one entry per index, in the
    order of `indices`.
    """
    samples = []
    for idx in indices:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        samples.append(ftrans(x_full[:, marker_idx], cofactor))
    return samples


# get the sample indices (group1 is drawn before group2, so that the seeded
# random stream yields the same split as drawing them one after the other inline)
train_idx1, test_idx1 = _split_group(group1, ntrain_per_class)
train_idx2, test_idx2 = _split_group(group2, ntrain_per_class)

# load the training samples
group1_list = _load_samples(train_idx1)
group2_list = _load_samples(train_idx2)

# load the test samples; group1 samples come first and carry phenotype 0,
# group2 samples follow with phenotype 1
t_group1_list = _load_samples(test_idx1)
t_group2_list = _load_samples(test_idx2)
test_phenotypes = [0] * len(t_group1_list) + [1] * len(t_group2_list)

# finally prepare training and validation data: the first 80% of each class's
# training pool is used for training, the remainder for validation
cut = int(.8 * len(group1_list))
train_samples = group1_list[:cut] + group2_list[:cut]
train_phenotypes = [0] * len(group1_list[:cut]) + [1] * len(group2_list[:cut])
valid_samples = group1_list[cut:] + group2_list[cut:]
valid_phenotypes = [0] * len(group1_list[cut:]) + [1] * len(group2_list[cut:])
test_samples = t_group1_list + t_group2_list

  warn("text in segment does not start and end with delimiter")


In [8]:
# ---------------------------------------------------
# run a CellCnn analysis: create the model object
# ---------------------------------------------------
# NOTE(review): ncell / nsubset presumably set the size and the number of the
# multi-cell inputs sampled for training -- confirm against the CellCnn docs.
cellcnn_options = {'ncell': 200, 'nsubset': 1000, 'verbose': 0}
model = CellCnn(**cellcnn_options)

In [18]:
# Fit the model: training samples/phenotypes plus the validation
# samples/phenotypes are handed to `fit`, together with OUTDIR as `outdir`.
fit_inputs = dict(
    train_samples=train_samples,
    train_phenotypes=train_phenotypes,
    valid_samples=valid_samples,
    valid_phenotypes=valid_phenotypes,
    outdir=OUTDIR,
)
model.fit(**fit_inputs)

KeyboardInterrupt: 

In [2]:
### predictions
# Apply the model to the test cohort.
# `test_pred` has one row per test sample; a row holds the predicted class
# probabilities of that sample, i.e. it is a probability distribution and
# always sums up to 1. There are only 2 classes here:
# CMV- (1st column) and CMV+ (2nd column).
test_pred = model.predict(test_samples)

# print the test set predictions, followed by the true phenotypes
# of the test samples
for heading, values in (('\nModel predictions:\n', test_pred),
                        ('\nTrue phenotypes:\n', test_phenotypes)):
    print(heading, values)
test_pred

NameError: name 'model' is not defined

In [1]:
# Area under the ROC curve for the test set, scored with the second column
# (index 1) of the predictions.
second_column_scores = test_pred[:, 1]
test_auc = roc_auc_score(test_phenotypes, second_column_scores)
print(test_auc)

KeyboardInterrupt: 

In [12]:
# Plot the results of the CellCnn analysis for the test samples; OUTDIR is
# passed to plot_results as the output directory.
plot_options = dict(
    filter_response_thres=0,
    filter_diff_thres=0.2,
    group_a='CMV-',
    group_b='CMV+',
)
_ = plot_results(model.results, test_samples, test_phenotypes,
                 markers, OUTDIR, **plot_options)
print('dond')

No handles with labels found to put in legend.


<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

In [None]:
#### 2nd plotting
# Same plotting call as before, this time with filter_response_thres=0.7.
plot_options = dict(
    filter_response_thres=0.7,
    filter_diff_thres=0.2,
    group_a='CMV-',
    group_b='CMV+',
)
_ = plot_results(model.results, test_samples, test_phenotypes,
                 markers, OUTDIR, **plot_options)
print('dond')
### at this point it only loads images