In [110]:
import importlib
import os
import cellCnn

importlib.reload(cellCnn.ms.utils.helpers)

import numpy as np
### (from: https://github.com/eiriniar/CellCnn/blob/0413a9f49fe0831c8fe3280957fb341f9e028d2d/cellCnn/examples/NK_cell_ungated.ipynb ) AND https://github.com/eiriniar/CellCnn/blob/0413a9f49fe0831c8fe3280957fb341f9e028d2d/cellCnn/examples/PBMC.ipynb
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

from cellCnn.ms.utils.helpers import get_chunks
from cellCnn.ms.utils.helpers import print_regression_model_stats
from cellCnn.plotting import plot_results
from cellCnn.utils import mkdir_p
from cellCnn.utils import save_results
from cellCnn.ms.utils.helpers import get_fitted_model
from cellCnn.ms.utils.helpers import split_test_valid_train
from cellCnn.ms.utils.helpers import calc_frequencies

#% pylab inline

In [92]:
##### state vars
cytokines = ['CCR2', 'CCR4', 'CCR6', 'CCR7', 'CXCR4', 'CXCR5', 'CD103', 'CD14', 'CD20', 'CD25', 'CD27', 'CD28', 'CD3',
             'CD4', 'CD45RA', 'CD45RO', 'CD56', 'CD57', 'CD69', 'CD8', 'TCRgd', 'PD.1', 'GM.CSF', 'IFN.g', 'IL.10',
             'IL.13', 'IL.17A', 'IL.2', 'IL.21', 'IL.22', 'IL.3', 'IL.4', 'IL.6', 'IL.9', 'TNF.a']
infile = 'cohort_denoised_clustered_diagnosis.csv'
indir = 'data/input'
outdir = 'out_ms_default'
rand_seed = 123
train_perc = 0.7
test_perc = 0.3
batch_size_pheno = 840  # so a size of 8425 is about equally sized in batches
batch_size_cd4 = 550  # so a size of 550 gets me 16 batches for cd4
## information from ms_data project
cluster_to_celltype_dict = {0: 'b', 1: 'cd4', 3: 'nkt', 8: 'cd8', 10: 'nk', 16: 'dg', 11: 'my'}

In [93]:
np.random.seed(rand_seed)
mkdir_p(outdir)
df = pd.read_csv(os.path.join(indir, infile), index_col=0)
df = df.drop_duplicates()  ### reduces overfitting at cost of fewer data
df.shape
##### no duplicates in

(16889, 37)

In [94]:
rrms_df = df[df['diagnosis'] == 'RRMS']
nindc_df = df[df['diagnosis'] == 'NINDC']
print(str(rrms_df.shape))
print(str(nindc_df.shape))

(8464, 37)
(8425, 37)


In [95]:
#### here we could see freq differences across the 2 groups

print('Frequencies: ')
rrms_freq_dict = calc_frequencies(rrms_df, cluster_to_celltype_dict)
print('\n')
nindc_freq_dict = calc_frequencies(nindc_df, cluster_to_celltype_dict)
print('\n')
whole_freq_dict = calc_frequencies(df, cluster_to_celltype_dict)

Frequencies: 
For 0 we got a freq. 0.1316162570888469
For 1 we got a freq. 0.5082703213610587
For 3 we got a freq. 0.01736767485822306
For 8 we got a freq. 0.24964555765595464
For 10 we got a freq. 0.047377126654064274
For 16 we got a freq. 0.02020321361058601
For 11 we got a freq. 0.02551984877126654


For 0 we got a freq. 0.13543026706231454
For 1 we got a freq. 0.5342433234421365
For 3 we got a freq. 0.02599406528189911
For 8 we got a freq. 0.22053412462908012
For 10 we got a freq. 0.044985163204747776
For 16 we got a freq. 0.01483679525222552
For 11 we got a freq. 0.02397626112759644


For 0 we got a freq. 0.13351885842856298
For 1 we got a freq. 0.5212268340339866
For 3 we got a freq. 0.021670910059802238
For 8 we got a freq. 0.23512345313517674
For 10 we got a freq. 0.046183906684824444
For 16 we got a freq. 0.01752620048552312
For 11 we got a freq. 0.024749837172123867


In [96]:
rrms_df = rrms_df.iloc[:nindc_df.shape[0], :]
print(str(rrms_df.shape))
print(str(nindc_df.shape))

(8425, 37)
(8425, 37)


In [97]:
rrms_idx_chunks = get_chunks(rrms_df.index, batch_size_pheno)
nindc_idx_chunks = get_chunks(nindc_df.index, batch_size_pheno)
len(nindc_idx_chunks)

11

In [106]:
stack_idx_chunks = []
for rrms, nindc in zip(rrms_idx_chunks, nindc_idx_chunks):
    stack_idx_chunks.append((rrms, 0))
    stack_idx_chunks.append((nindc, 1))
shuffle(stack_idx_chunks)  # shuffe is 'INPLACE'

[(Int64Index([2130721,   83635,   16125, 1341777,  253417,  339502,  100204,
              2770830, 1142431, 2005928,
              ...
               198834, 1046320,  346988,  335840, 2796814,  139906,  224397,
              1991341, 1320490,  331284],
             dtype='int64', length=840),
  0),
 (Int64Index([1678979, 1087306, 1641308,  573502, 2704174, 1661904,   45698,
              2693507,  409185, 1100736, 1127404, 1091503, 1818252, 1820329,
              2651403, 2675641, 2593433,  505701,   45551,  637483, 1859953,
              2571066, 2591174,  628134,  597698],
             dtype='int64'),
  1),
 (Int64Index([1939394, 1996579,  461308, 2801945, 2126329,  344531, 1338281,
               362024, 2802633,  191767,
              ...
              1071526, 2129413, 2787654, 1052595, 1142933, 2771911,  372269,
              1013717, 2145312, 2152065],
             dtype='int64', length=840),
  0),
 (Int64Index([ 241689, 2169586, 2845938,   16321, 2144737, 1049340,  927018,
  

In [107]:
stack_chucks_trains = []
stack_chunks_celltypes = dict()  ### there i, additionally want to get the cell type belonging junks
for cluster in cluster_to_celltype_dict.keys():
    stack_chunks_celltypes[cluster] = []

stack_chucks_phenos = []
for idx, pheno in stack_idx_chunks:
    if idx[0] in list(rrms_df.index):
        stack_chuck = rrms_df.loc[idx, :]
        stack_chucks_phenos.append(0)  # 0 for rrms
    elif idx[0] in list(nindc_df.index):
        stack_chuck = nindc_df.loc[idx, :]
        stack_chucks_phenos.append(1)  # 1 for nindc
    else:
        print(f'ERROR: {idx} is in no DataFrame. This should not be possible')
    stack_chuck_train = stack_chuck.drop(columns=['cluster', 'diagnosis'])
    stack_chucks_trains.append(stack_chuck_train)

# kFold it ? when we see cell type frequencies with only few data

In [121]:
X_test, X_train, X_valid, y_test, y_train, y_valid = split_test_valid_train(
    X=stack_chucks_trains,
    y=stack_chucks_phenos,
    test_perc=test_perc,
    train_perc=train_perc,
    valid_perc=0.5, seed=rand_seed)

In [122]:
outdir_pheno = 'ms_pheno_class'
model = get_fitted_model(X_train, X_valid, y_train, y_valid,
                         nrun=15, ncell=200, nsubset=batch_size_pheno,
                         nfilters=[3, 15, 37], coeff_l1=0,
                         coeff_l2=1e-4, max_epochs=75, learning_rate=0.002,
                         per_sample=False, regression=False,
                         outdir=outdir_pheno)
#### NOTE:
#### it performs Strat.KFold anyways (even if i dont manually divide them into train valids... )
#### bad thing is that i dont have any test yet
#### made Errors:
#### 1. I put in y_unique as the true class labels (no integer in a range) and got a list indes out of bound

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/7

In [130]:
results = model.results
scaler = results['scaler']
filters = results['w_best_net']
save_results(results, outdir, cytokines)

In [129]:
# # Model predictions: to beat
#  [[0.06793241 0.93206759]
#  [0.95944599 0.04055399]
#  [0.05254236 0.94745767]
#  [0.02734859 0.9726514 ]]
test_pred = model.predict(X_test)
#print_regression_model_stats(test_pred, y_test)
print(f'ROC AUC : {roc_auc_score(y_test, test_pred[:,1])}')
print(f'Pred: {test_pred}')
print(f'Obs: {y_test}')
# This is classification. ..
# calculate area under the ROC curve for the test set


ROC AUC : 1.0
Pred: [[0.2353084  0.76469159]
 [0.26144935 0.73855064]
 [0.17473738 0.82526263]
 [0.81883826 0.18116173]]
Obs: [1, 1, 1, 0]


In [131]:
# plot the results of the CellCnn analysis for the test samples in the output directory
_ = plot_results(model.results, X_test, y_test,
                 cytokines, outdir + '/plots', filter_response_thres=0,
                 filter_diff_thres=0.2, group_a='RRMS', group_b='NINDC')
print('done')
### plotting stuff

done


In [3]:
#####################################################################
#####################################################################
################### First try of cd4 T cell freq. ###################
#####################################################################
#####################################################################

In [18]:
celltype_dfs = dict()
grouped_df = df.groupby('cluster')
for cluster in cluster_to_celltype_dict.keys():
    celltype_dfs[cluster] = [grouped_df.get_group(cluster)]

for val in celltype_dfs.values():  ### crosscheck with my ms_data set
    print(len(val[0]))
### since my date for CD4 T cells is biggest i start with group 1

2255
8803
366
3971
780
296
418


In [19]:
selection = 1
cd4_df = celltype_dfs[selection][0]

In [20]:
cd4_idx_chunks = get_chunks(cd4_df.index, batch_size_cd4)

In [21]:
counter_vals = []
for key, val in celltype_dfs.items():
    if key != selection:
        counter_vals.append(val[0])

counter_df = pd.concat(counter_vals)
counter_idx_chunks = get_chunks(counter_df.index, batch_size_cd4)
len(counter_idx_chunks)

15

In [35]:
freq = [whole_freq_dict[selection] for i in range(len(cd4_idx_chunks))] + [0 for i in range(len(counter_idx_chunks))]
df_reduced = df.drop(columns=['diagnosis', 'cluster'])
cd4_X = [df_reduced.loc[cd4_idx, :] for cd4_idx in cd4_idx_chunks]
chunks = [df_reduced.loc[counter_idx, :] for counter_idx in counter_idx_chunks]
cd4_X = cd4_X + chunks
cd4_X

[             CD69     CXCR5      CCR2      CD57      CD28      IL.9      IL.4  \
 1870465  0.000000  0.000000  0.000000  0.017629  0.020329  0.000000  0.000000   
 2169586  0.000000  0.000000  0.000000  0.021880  0.000000  0.014239  0.000000   
 2845938  0.041198  0.010143  0.020405  0.000000  0.039559  0.014050  0.000000   
 16321    0.010868  0.000000  0.000000  0.025924  0.128853  0.008690  0.003272   
 827587   0.011048  0.000000  0.035972  0.000000  0.129387  0.024206  0.000000   
 ...           ...       ...       ...       ...       ...       ...       ...   
 2691981  0.008730  0.011940  0.002169  0.000000  0.073074  0.000000  0.000000   
 1181678  0.060062  0.011255  0.000000  0.012367  0.100911  0.022696  0.000000   
 1990803  0.008278  0.003599  0.000000  0.020286  0.108406  0.005550  0.006172   
 1775692  0.000000  0.004901  0.034441  0.030920  0.121199  0.000000  0.000000   
 949149   0.000000  0.000000  0.017712  0.017733  0.013348  0.039937  0.009938   
 
              

In [36]:
X_test_cd4, X_train_cd4, X_valid_cd4, y_test_cd4, y_train_cd4, y_valid_cd4 = split_test_valid_train(X=cd4_X, y=freq,
                                                                                                    test_perc=test_perc,
                                                                                                    train_perc=train_perc,
                                                                                                    valid_perc=0.5,
                                                                                                    seed=rand_seed)

In [118]:
outdir = 'out_ms_cd4_2'
model_cd4 = get_fitted_model(X_train=X_train_cd4, X_valid=X_valid_cd4,
                             y_train=y_train_cd4, y_valid=y_valid_cd4,
                             nrun=15, ncell=200, nsubset=batch_size_cd4,
                             nfilters=[3, 15, 36], coeff_l2=1e-5, coeff_l1=None,
                             max_epochs=50, outdir=outdir,
                             regression=True, per_sample=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_0.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_1.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_2.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_3.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_4.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_5.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_6.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_7.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_8.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_9.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_10.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_11.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_12.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_13.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


An exception was raised during training the network.
Unable to open file (unable to open file: name = 'out_ms_cd4_2/nnet_run_14.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


KeyError: 14

In [119]:
results = model_cd4.results
scaler = results['scaler']
filters = results['w_best_net']
save_results(results, outdir, cytokines)

NameError: name 'model_cd4' is not defined

In [120]:
# # Model predictions: to beat
#model = CellCnn(nrun=15, ncell=200, nsubset=batch_size_cd4, nfilter_choice=[35],
#                coeff_l2=1e-5, max_epochs=50, per_sample=Tru
#  [[0.5068497 ]
#  [0.51000313]
#  [0.07073919]
#  [0.02683752]
#  [0.51068987]]
test_pred_cd4 = model_cd4.predict(X_test_cd4)
print_regression_model_stats(test_pred_cd4, y_test_cd4)

NameError: name 'model_cd4' is not defined

In [68]:
# plot the results of the CellCnn analysis for the test samples in the output directory
_ = plot_results(model_cd4.results, X_test_cd4, y_test_cd4,
                 cytokines, outdir + '/plots', filter_response_thres=0,
                 filter_diff_thres=0.2, group_a='RRMS', group_b='NINDC')
print('done')

KeyError: 1