In [38]:
import importlib
import os
import cellCnn

importlib.reload(cellCnn)

import numpy as np
### Adapted from the CellCnn example notebooks NK_cell_ungated.ipynb ( https://github.com/eiriniar/CellCnn/blob/0413a9f49fe0831c8fe3280957fb341f9e028d2d/cellCnn/examples/NK_cell_ungated.ipynb ) and PBMC.ipynb ( https://github.com/eiriniar/CellCnn/blob/0413a9f49fe0831c8fe3280957fb341f9e028d2d/cellCnn/examples/PBMC.ipynb )
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

from cellCnn.ms.utils.helpers import get_chunks
from cellCnn.ms.utils.helpers import print_regression_model_stats
from cellCnn.plotting import plot_results
from cellCnn.utils import mkdir_p
from cellCnn.utils import save_results
from cellCnn.ms.utils.helpers import get_fitted_model
from cellCnn.ms.utils.helpers import split_test_valid_train
from cellCnn.ms.utils.helpers import calc_frequencies
#% pylab inline

In [4]:
##### Notebook-wide configuration

# --- input / output locations ---
indir = 'data/input'
infile = 'cohort_denoised_clustered_diagnosis.csv'
outdir = 'out_ms_default'

# --- reproducibility and data split ---
rand_seed = 123
train_perc = 0.7
test_perc = 0.3

# --- chunk sizes (rows per chunk handed to get_chunks) ---
batch_size_pheno = 840  # used for the RRMS / NINDC phenotype chunks
batch_size_cd4 = 550  # used for the CD4 / non-CD4 frequency chunks

# Marker names, in the order they are passed as labels to save_results and
# plot_results.
cytokines = [
    'CCR2', 'CCR4', 'CCR6', 'CCR7', 'CXCR4', 'CXCR5', 'CD103',
    'CD14', 'CD20', 'CD25', 'CD27', 'CD28', 'CD3', 'CD4',
    'CD45RA', 'CD45RO', 'CD56', 'CD57', 'CD69', 'CD8', 'TCRgd',
    'PD.1', 'GM.CSF', 'IFN.g', 'IL.10', 'IL.13', 'IL.17A', 'IL.2',
    'IL.21', 'IL.22', 'IL.3', 'IL.4', 'IL.6', 'IL.9', 'TNF.a',
]

# Cluster id -> cell type name (taken from the ms_data project).
# Insertion order matters: it is the order in which clusters are iterated.
cluster_to_celltype_dict = {
    0: 'b',
    1: 'cd4',
    3: 'nkt',
    8: 'cd8',
    10: 'nk',
    16: 'dg',
    11: 'my',
}

In [5]:
# Seed NumPy's global RNG and make sure the default output directory exists.
np.random.seed(rand_seed)
mkdir_p(outdir)

# Read the cohort table (first CSV column becomes the row index) and drop
# exact duplicate rows in one go. Dropping duplicates is meant to reduce
# overfitting, at the cost of having fewer rows to train on.
df = pd.read_csv(os.path.join(indir, infile), index_col=0).drop_duplicates()

# From here on `df` contains no duplicated rows.
df.shape

(16889, 37)

In [6]:
# One frame per diagnosis group, selected with boolean masks on 'diagnosis'.
is_rrms = df['diagnosis'] == 'RRMS'
is_nindc = df['diagnosis'] == 'NINDC'
rrms_df = df[is_rrms]
nindc_df = df[is_nindc]

# Show (rows, columns) of each group to compare their sizes.
print(rrms_df.shape)
print(nindc_df.shape)

(8464, 37)
(8425, 37)


In [7]:
#### Per-cluster frequencies, to compare the two diagnosis groups with each
#### other and with the whole cohort.
#### Output order: RRMS first, then NINDC, then the full cohort.

print('Frequencies: ')
rrms_freq_dict = calc_frequencies(rrms_df, cluster_to_celltype_dict)
print('\n')
nindc_freq_dict = calc_frequencies(nindc_df, cluster_to_celltype_dict)
print('\n')
# whole_freq_dict is indexed by cluster id further down (whole_freq_dict[selection]).
whole_freq_dict = calc_frequencies(df, cluster_to_celltype_dict)

Frequencies: 
For 0 we got a freq. 0.1316162570888469
For 1 we got a freq. 0.5082703213610587
For 3 we got a freq. 0.01736767485822306
For 8 we got a freq. 0.24964555765595464
For 10 we got a freq. 0.047377126654064274
For 16 we got a freq. 0.02020321361058601
For 11 we got a freq. 0.02551984877126654


For 0 we got a freq. 0.13543026706231454
For 1 we got a freq. 0.5342433234421365
For 3 we got a freq. 0.02599406528189911
For 8 we got a freq. 0.22053412462908012
For 10 we got a freq. 0.044985163204747776
For 16 we got a freq. 0.01483679525222552
For 11 we got a freq. 0.02397626112759644


For 0 we got a freq. 0.13351885842856298
For 1 we got a freq. 0.5212268340339866
For 3 we got a freq. 0.021670910059802238
For 8 we got a freq. 0.23512345313517674
For 10 we got a freq. 0.046183906684824444
For 16 we got a freq. 0.01752620048552312
For 11 we got a freq. 0.024749837172123867


In [8]:
# Balance the two diagnosis groups by truncating BOTH to the size of the
# smaller one, so neither class dominates the chunks built from them.
# Using the minimum makes this independent of which group happens to be larger
# (truncating only RRMS silently leaves the groups unbalanced if NINDC is the
# larger one) and keeps the cell safe to re-run.
# Truncation keeps the leading rows in file order; no random sub-sampling.
n_per_group = min(rrms_df.shape[0], nindc_df.shape[0])
rrms_df = rrms_df.iloc[:n_per_group, :]
nindc_df = nindc_df.iloc[:n_per_group, :]
print(str(rrms_df.shape))
print(str(nindc_df.shape))

(8425, 37)
(8425, 37)


In [9]:
# Cut the row index of each diagnosis group into chunks via get_chunks, using
# batch_size_pheno as the chunk size argument. Each chunk later becomes one
# training sample.
rrms_idx_chunks = get_chunks(rrms_df.index, batch_size_pheno)
nindc_idx_chunks = get_chunks(nindc_df.index, batch_size_pheno)
# Number of NINDC chunks (displayed as the cell output).
len(nindc_idx_chunks)

11

In [10]:
stack_idx_chunks = []
for rrms, nindc in zip(rrms_idx_chunks, nindc_idx_chunks):
    stack_idx_chunks.append((rrms, 0))
    stack_idx_chunks.append((nindc, 1))
shuffle(stack_idx_chunks)  # shuffe is 'INPLACE'

[(Int64Index([1201511,  138221, 2775103,  118967,  320432,  109480,  185395,
              2040047, 1181786,  162714,
              ...
              2842146, 2137184, 1958741, 2032005,  735632,  201583,  940126,
              2042091, 2050282, 1203227],
             dtype='int64', length=840),
  0),
 (Int64Index([2660241,  726884, 1085359, 1742565, 1715445, 2588587, 1729891,
               572947, 2689804,  810732,
              ...
              1115558,  704914, 2600556, 1089869,  390151, 2659485, 2643933,
               428000,  499008, 2614146],
             dtype='int64', length=840),
  1),
 (Int64Index([1659746,  696280, 2662093, 1849652, 1099195, 1629725, 2674068,
              2679809, 1763401, 1618940,
              ...
               589333,  556852, 1690054,  598196, 2677804, 2587309, 2704851,
               592662,  592453,  616435],
             dtype='int64', length=840),
  1),
 (Int64Index([ 429380, 1744888,  572428, 2594139, 2682886, 2598260,  522967,
               86

In [11]:
# Per-cluster containers for cell-type specific chunks. They are only
# initialised here; nothing in this cell fills them.
stack_chunks_celltypes = {cluster: [] for cluster in cluster_to_celltype_dict}

# Source frame for each phenotype label carried in stack_idx_chunks
# (0 = RRMS, 1 = NINDC). Using the label that was stored next to the index
# chunk avoids re-deriving it with a linear scan over the whole index, and
# cannot pick the wrong frame if an index label occurs in both groups.
pheno_to_df = {0: rrms_df, 1: nindc_df}

stack_chucks_trains = []  # one feature frame per chunk
stack_chucks_phenos = []  # matching phenotype label per chunk
for idx, pheno in stack_idx_chunks:
    if pheno not in pheno_to_df:
        # Fail loudly instead of printing and continuing with stale data.
        raise ValueError(f'ERROR: {idx} has unknown phenotype label {pheno!r}. This should not be possible')
    stack_chuck = pheno_to_df[pheno].loc[idx, :]
    stack_chucks_phenos.append(pheno)
    # Drop the annotation columns so only marker columns remain as features.
    stack_chuck_train = stack_chuck.drop(columns=['cluster', 'diagnosis'])
    stack_chucks_trains.append(stack_chuck_train)

# Labels and samples must stay aligned one-to-one.
assert len(stack_chucks_trains) == len(stack_chucks_phenos)

# TODO: consider k-fold cross-validation for cell types with only few rows.

In [121]:
# Partition the stacked chunks and their phenotype labels into test, train and
# validation parts. The unpacking order below follows the helper's return order.
# NOTE(review): valid_perc=0.5 is a magic number; its exact meaning is defined
# by split_test_valid_train - confirm against the helper.
pheno_split = split_test_valid_train(
    X=stack_chucks_trains, y=stack_chucks_phenos,
    train_perc=train_perc, test_perc=test_perc,
    valid_perc=0.5, seed=rand_seed,
)
X_test, X_train, X_valid, y_test, y_train, y_valid = pheno_split

In [122]:
# Fit the phenotype classifier (labels 0 = RRMS, 1 = NINDC) on the chunked data.
outdir_pheno = 'ms_pheno_class'
pheno_fit_kwargs = dict(
    nrun=15, ncell=200, nsubset=batch_size_pheno,
    nfilters=[3, 15, 37],
    coeff_l1=0, coeff_l2=1e-4,
    max_epochs=75, learning_rate=0.002,
    per_sample=False, regression=False,
    outdir=outdir_pheno,
)
model = get_fitted_model(X_train, X_valid, y_train, y_valid, **pheno_fit_kwargs)
#### Author's notes:
#### - Training reportedly performs a stratified k-fold on its own, even when
####   train/validation sets are not split manually.
#### - Downside: there is no separate test evaluation at this point.
#### Mistakes made so far:
#### 1. Passing the raw class labels (not integers in a contiguous range) as
####    y_unique caused a "list index out of range" error.

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/7

In [130]:
# Pull the fitted artefacts out of the phenotype model and write the results
# to disk with the marker names as labels.
results = model.results
scaler = results['scaler']
filters = results['w_best_net']
# NOTE(review): the model was fitted with outdir=outdir_pheno, but the results
# are saved to `outdir` - confirm that the two different directories are intended.
save_results(results, outdir, cytokines)

In [129]:
# Reference predictions from an earlier run ("to beat"):
#  [[0.06793241 0.93206759]
#  [0.95944599 0.04055399]
#  [0.05254236 0.94745767]
#  [0.02734859 0.9726514 ]]

# This is a classification task: score the held-out test chunks and compute
# the area under the ROC curve from the second prediction column (class 1).
test_pred = model.predict(X_test)
class1_scores = test_pred[:, 1]
test_auc = roc_auc_score(y_test, class1_scores)
print(f'ROC AUC : {test_auc}')
print(f'Pred: {test_pred}')
print(f'Obs: {y_test}')


ROC AUC : 1.0
Pred: [[0.2353084  0.76469159]
 [0.26144935 0.73855064]
 [0.17473738 0.82526263]
 [0.81883826 0.18116173]]
Obs: [1, 1, 1, 0]


In [131]:
# Plot the results of the CellCnn analysis for the test samples into a
# 'plots' sub-directory of the output directory.
# The path is built with os.path.join (as done when loading the input file)
# rather than by concatenating a hard-coded '/' separator.
plots_dir = os.path.join(outdir, 'plots')
_ = plot_results(model.results, X_test, y_test,
                 cytokines, plots_dir, filter_response_thres=0,
                 filter_diff_thres=0.2, group_a='RRMS', group_b='NINDC')
print('done')

done


In [3]:
#####################################################################
#####################################################################
############## First attempt: CD4 T cell frequency regression ##############
#####################################################################
#####################################################################

In [12]:
# Group the cohort by cluster id and keep, for every cluster of interest, its
# rows wrapped in a one-element list (later cells read the frame via [0]).
grouped_df = df.groupby('cluster')
celltype_dfs = {
    cluster: [grouped_df.get_group(cluster)]
    for cluster in cluster_to_celltype_dict
}

# Print the row count per cluster to cross-check against the ms_data set.
for frames in celltype_dfs.values():
    print(len(frames[0]))
# Cluster 1 (CD4 T cells) has the most rows, so it is used first.

2255
8803
366
3971
780
296
418


In [13]:
# Cluster id to model; 1 maps to 'cd4' in cluster_to_celltype_dict.
selection = 1
# celltype_dfs stores each cluster frame inside a one-element list, hence [0].
cd4_df = celltype_dfs[selection][0]

In [16]:
# Cut the row index of the selected cluster into chunks via get_chunks, using
# batch_size_cd4 as the chunk size argument.
cd4_idx_chunks = get_chunks(cd4_df.index, batch_size_cd4)

In [19]:
# Collect the frames of every cluster except the selected one; together they
# form the "counter" population for the selected cell type.
counter_vals = [
    frames[0]
    for cluster_id, frames in celltype_dfs.items()
    if cluster_id != selection
]

counter_df = pd.concat(counter_vals)
counter_idx_chunks = get_chunks(counter_df.index, batch_size_cd4)
# Number of counter chunks (displayed as the cell output).
len(counter_idx_chunks)

15

In [18]:
# Regression targets: chunks from the selected cluster get the cohort-wide
# frequency of that cluster, chunks from all other clusters get 0.
# NOTE(review): every chunk from the selected cluster consists only of cells
# of that cluster, yet is labelled with the cohort-wide frequency - confirm
# this is the intended target.
# This cell needs counter_idx_chunks from the cell above; the recorded
# NameError stems from running the two cells in the opposite order.
freq = ([whole_freq_dict[selection]] * len(cd4_idx_chunks)
        + [0] * len(counter_idx_chunks))

# Feature frames: marker columns only, one frame per chunk, in the same order
# as the targets (selected-cluster chunks first, then counter chunks).
df_reduced = df.drop(columns=['diagnosis', 'cluster'])
cd4_X = [df_reduced.loc[cd4_idx, :] for cd4_idx in cd4_idx_chunks]
chunks = [df_reduced.loc[counter_idx, :] for counter_idx in counter_idx_chunks]
cd4_X = cd4_X + chunks

# Samples and targets must line up one-to-one.
assert len(cd4_X) == len(freq), 'number of samples and targets differ'
# Show the sample count instead of dumping every DataFrame into the output.
len(cd4_X)

NameError: name 'counter_idx_chunks' is not defined

In [36]:
# Partition the CD4 / counter chunks and their frequency targets into test,
# train and validation parts. The unpacking order follows the helper's return order.
cd4_split = split_test_valid_train(
    X=cd4_X, y=freq,
    train_perc=train_perc, test_perc=test_perc,
    valid_perc=0.5, seed=rand_seed,
)
(X_test_cd4, X_train_cd4, X_valid_cd4,
 y_test_cd4, y_train_cd4, y_valid_cd4) = cd4_split

In [118]:
# From here on `outdir` points at the CD4 regression output directory; the
# following cells rely on this rebinding.
outdir = 'out_ms_cd4_2'
# Create the directory up front: mkdir_p was so far only called for the
# initial output directory, while the training runs read and write their
# nnet_run_<i>.hdf5 checkpoints below `outdir`.
# NOTE(review): in the recorded run every training run failed while opening
# its checkpoint file. If that persists with the directory in place, check
# whether a checkpoint is written at all (e.g. validation loss never
# improving) and whether coeff_l1=None is handled by get_fitted_model; the
# phenotype model passes coeff_l1=0 instead.
mkdir_p(outdir)
model_cd4 = get_fitted_model(X_train=X_train_cd4, X_valid=X_valid_cd4,
                             y_train=y_train_cd4, y_valid=y_valid_cd4,
                             nrun=15, ncell=200, nsubset=batch_size_cd4,
                             nfilters=[3, 15, 36], coeff_l2=1e-5, coeff_l1=None,
                             max_epochs=50, outdir=outdir,
                             regression=True, per_sample=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_0.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_1.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_2.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_3.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_4.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_5.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_6.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_7.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_8.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_9.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_10.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_11.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_12.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_13.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_14.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


KeyError: 14

In [119]:
# Pull the fitted artefacts out of the CD4 regression model and write the
# results to the current `outdir` with the marker names as labels.
# Requires model_cd4 from the training cell; the recorded NameError shows the
# state after that cell had failed.
results = model_cd4.results
scaler = results['scaler']
filters = results['w_best_net']
save_results(results, outdir, cytokines)

NameError: name 'model_cd4' is not defined

In [120]:
# Reference predictions to beat, noted from an earlier run configured as
#   CellCnn(nrun=15, ncell=200, nsubset=batch_size_cd4, nfilter_choice=[35],
#           coeff_l2=1e-5, max_epochs=50, per_sample=True, ...)
# (the original note is cut off after per_sample):
#  [[0.5068497 ]
#  [0.51000313]
#  [0.07073919]
#  [0.02683752]
#  [0.51068987]]
# Predict the held-out CD4 test chunks and print the regression statistics.
test_pred_cd4 = model_cd4.predict(X_test_cd4)
print_regression_model_stats(test_pred_cd4, y_test_cd4)

NameError: name 'model_cd4' is not defined

In [68]:
# Plot the results of the CellCnn analysis for the CD4 test samples into a
# 'plots' sub-directory of the current output directory.
# The path is built with os.path.join (as done when loading the input file)
# rather than by concatenating a hard-coded '/' separator.
# NOTE(review): this model is a frequency regression, but the group labels
# below are the ones used for the RRMS/NINDC classification plot, and the
# recorded run ended in a KeyError. Check whether plot_results needs a
# regression-specific argument and other group names here - confirm against
# the plot_results signature.
plots_dir = os.path.join(outdir, 'plots')
_ = plot_results(model_cd4.results, X_test_cd4, y_test_cd4,
                 cytokines, plots_dir, filter_response_thres=0,
                 filter_diff_thres=0.2, group_a='RRMS', group_b='NINDC')
print('done')

KeyError: 1

In [None]:
#####################################################################
#####################################################################
####### First attempt: multi-task learning (phenotype + CD4 T cell frequency) #######
#####################################################################
#####################################################################
# from: https://machinelearningmastery.com/neural-network-models-for-combined-classification-and-regression/#:~:text=Regression%20refers%20to%20predictive%20modeling,labels%20for%20a%20given%20input.

In [20]:
# cd4_df (rows of the selected cluster) comes from the CD4 section above.
# The former leading `cd4_df.head()` call was dropped: its result was
# discarded because it was not the last expression of the cell.
# Split the frame by diagnosis and cut each part into index chunks.
cd4_df_rrms = cd4_df[cd4_df['diagnosis'] == 'RRMS']
cd4_df_nindc = cd4_df[cd4_df['diagnosis'] == 'NINDC']
cd4_df_rrms_chunks_idx = get_chunks(cd4_df_rrms.index, batch_size_cd4)
cd4_df_nindc_chunks_idx = get_chunks(cd4_df_nindc.index, batch_size_cd4)

In [21]:
# counter_df (rows of all other selected clusters) comes from the CD4 section
# above. The former leading `counter_df.head()` call was dropped: its result
# was discarded because it was not the last expression of the cell.
# Split the frame by diagnosis and cut each part into index chunks.
counter_df_rrms = counter_df[counter_df['diagnosis'] == 'RRMS']
counter_df_nindc = counter_df[counter_df['diagnosis'] == 'NINDC']
counter_df_rrms_chunks_idx = get_chunks(counter_df_rrms.index, batch_size_cd4)
counter_df_nindc_chunks_idx = get_chunks(counter_df_nindc.index, batch_size_cd4)

In [25]:
# Multi-task targets: one (frequency, phenotype) tuple per chunk.
# Frequency is the cohort-wide frequency of the selected cluster for chunks
# from that cluster and 0 for counter chunks.
# NOTE(review): the phenotype is encoded as 1 = RRMS, 0 = NINDC here, which is
# the reverse of the phenotype-only section (0 = RRMS, 1 = NINDC) - confirm
# which encoding is intended before comparing results.
selected_freq = whole_freq_dict[selection]
freq_pheno_rrms = ([(selected_freq, 1)] * len(cd4_df_rrms_chunks_idx)
                   + [(0, 1)] * len(counter_df_rrms_chunks_idx))
freq_pheno_nindc = ([(selected_freq, 0)] * len(cd4_df_nindc_chunks_idx)
                    + [(0, 0)] * len(counter_df_nindc_chunks_idx))

# Feature frames (marker columns only), one per index chunk.
df_reduced = df.drop(columns=['diagnosis', 'cluster'])


def _frames_for(idx_chunks):
    """Return one marker-only frame per index chunk, in chunk order."""
    return [df_reduced.loc[chunk_idx, :] for chunk_idx in idx_chunks]


cd4_rrms_X = _frames_for(cd4_df_rrms_chunks_idx)
cd4_nindc_X = _frames_for(cd4_df_nindc_chunks_idx)
chunks_rrms = _frames_for(counter_df_rrms_chunks_idx)
chunks_nindc = _frames_for(counter_df_nindc_chunks_idx)

# Sample order matches the target order: all RRMS chunks, then all NINDC chunks.
cd4_mtl_X = cd4_rrms_X + chunks_rrms + cd4_nindc_X + chunks_nindc
cd4_pheno_mtl_y = freq_pheno_rrms + freq_pheno_nindc

In [28]:
# Partition the multi-task samples and their (frequency, phenotype) targets
# into test, train and validation parts. The unpacking order follows the
# helper's return order.
mtl_split = split_test_valid_train(
    X=cd4_mtl_X, y=cd4_pheno_mtl_y,
    train_perc=train_perc, test_perc=test_perc,
    valid_perc=0.5, seed=rand_seed,
)
(X_test_mtl_1, X_train_mtl_1, X_valid_mtl_1,
 y_test_mtl_1, y_train_mtl_1, y_valid_mtl_1) = mtl_split


In [33]:
# Frequency targets (first tuple element) of the multi-task test samples.
[item[0] for item in y_test_mtl_1]

[0.5212268340339866, 0, 0.5212268340339866, 0, 0.5212268340339866]

In [34]:
# Phenotype targets (second tuple element) of the multi-task test samples.
[item[1] for item in y_test_mtl_1]

[0, 1, 1, 0, 0]

In [69]:
# Pick up local edits to the helper and model modules without restarting the
# kernel, then rebind the names this notebook uses to the reloaded objects.
# Explicit names replace the former `import *`, so it stays visible where each
# name comes from and nothing else in the namespace is silently overwritten.
importlib.reload(cellCnn.ms.utils.helpers)
importlib.reload(cellCnn.model)
from cellCnn.ms.utils.helpers import (
    calc_frequencies,
    get_chunks,
    get_fitted_model,
    print_regression_model_stats,
    split_test_valid_train,
)
from cellCnn.model import CellCnn

# From here on `outdir` points at the multi-task output directory; create it
# before training writes into it.
outdir = 'out_ms_mtl_cd4_pheno'
mkdir_p(outdir)

# Unzip the (frequency, phenotype) tuples into one target list per task.
y_train_freq = [item[0] for item in y_train_mtl_1]
y_train_pheno = [item[1] for item in y_train_mtl_1]
y_valid_freq = [item[0] for item in y_valid_mtl_1]
y_valid_pheno = [item[1] for item in y_valid_mtl_1]

# NOTE(review): the recorded run failed with "Data cardinality is ambiguous".
# How a list of two target lists is consumed is decided inside
# get_fitted_model / CellCnn - confirm the expected target layout there.
# NOTE(review): this rebinds `model_cd4`, which the CD4-only section also uses.
model_cd4 = get_fitted_model(X_train=X_train_mtl_1, X_valid=X_valid_mtl_1,
                             y_train=[y_train_freq, y_train_pheno],
                             y_valid=[y_valid_freq, y_valid_pheno],
                             nrun=15, ncell=200, nsubset=batch_size_cd4,
                             nfilters=[3, 15, 36], coeff_l2=1e-5, coeff_l1=None,
                             max_epochs=50, outdir=outdir,
                             regression=True, per_sample=True)


An exception was raised during training the network.
Data cardinality is ambiguous:
  x sizes: 12650, 1090
  y sizes: 12650, 1090
Please provide data which shares the same first dimension.
An exception was raised during training the network.
Data cardinality is ambiguous:
  x sizes: 12650, 1090
  y sizes: 12650, 1090
Please provide data which shares the same first dimension.
