In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.pipeline import Pipeline
random_state = 7
from sklearn.impute import SimpleImputer, KNNImputer
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Load the feature table (X) and the phenotype table (phenodf) from disk.
# X is presumably the DNA methylation matrix (file name 'meth.pkl') — TODO confirm.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
X = joblib.load('./AML_data/meth.pkl')
phenodf = joblib.load('./AML_data/pheno.pkl')


In [3]:
# Load the saved collection of feature labels (used further down to subset the
# columns of the train/test matrices) and show how many there are.
unionindices = joblib.load('./AML_data/unionindices.pkl')
len(unionindices)

1300

In [4]:
phenodf.genotype.unique()

array(['normal', 't(11;19)', nan, 'mono 7', 'inv(16)', 't(9;11)',
       'other clon abn', 'other 11q23/MLL', 't(10;11)', '3q21q26',
       't(8;21)', 'sole+8', 't(15;17)', 'no result'], dtype=object)

In [5]:
len(phenodf.genotype.unique())

14

In [6]:
# Create a new genotype column that merges some of the original groups together

# NaNs, 'no result' and any value not listed explicitly will form one group ('No result')

In [7]:
# Genotypes that are collapsed into the single 'MLL rearranged' group.
mll = [
    'other 11q23/MLL',
    't(9;11)',
    't(10;11)',
    't(11;19)',
]

# Genotypes that keep their original label.
other = [
    'normal',
    'mono 7',
    'inv(16)',
    'other clon abn',
    '3q21q26',
    't(8;21)',
    'sole+8',
    't(15;17)',
]

In [8]:
def _merge_genotype(label):
    """Map one raw genotype label onto its merged group name."""
    # Every MLL-type rearrangement is reported under one common name.
    if label in mll:
        return 'MLL rearranged'
    # Explicitly listed genotypes keep their own label.
    if label != 'no result' and label in other:
        return label
    # 'no result', NaN and anything unlisted fall into the same bucket.
    return 'No result'


finalgenotype = [_merge_genotype(label) for label in phenodf.genotype]
        

In [9]:
finalgenotype

['normal',
 'MLL rearranged',
 'normal',
 'No result',
 'mono 7',
 'inv(16)',
 'normal',
 'normal',
 'MLL rearranged',
 'other clon abn',
 'other clon abn',
 'MLL rearranged',
 'normal',
 'MLL rearranged',
 'MLL rearranged',
 'MLL rearranged',
 'other clon abn',
 'MLL rearranged',
 'normal',
 'No result',
 'MLL rearranged',
 'normal',
 'MLL rearranged',
 '3q21q26',
 'normal',
 'other clon abn',
 'other clon abn',
 'inv(16)',
 'No result',
 't(8;21)',
 'sole+8',
 'sole+8',
 'normal',
 'normal',
 'No result',
 'other clon abn',
 'mono 7',
 't(8;21)',
 'normal',
 'inv(16)',
 'No result',
 't(8;21)',
 'MLL rearranged',
 'MLL rearranged',
 'normal',
 'No result',
 'normal',
 't(15;17)',
 'No result',
 'other clon abn',
 'normal',
 'other clon abn',
 'inv(16)',
 'inv(16)',
 't(8;21)',
 'MLL rearranged',
 't(8;21)',
 't(8;21)',
 'normal',
 'normal',
 't(8;21)',
 't(15;17)',
 'No result',
 'MLL rearranged',
 'MLL rearranged',
 'MLL rearranged',
 'inv(16)',
 'No result',
 't(8;21)',
 'No result

In [10]:
# Store the merged labels as a new column alongside the original 'genotype' column.
phenodf['finalgenotype'] = finalgenotype

In [11]:
phenodf[['genotype', 'finalgenotype']].head(20)

Unnamed: 0_level_0,genotype,finalgenotype
public_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AML_001,normal,normal
AML_002,t(11;19),MLL rearranged
AML_003,normal,normal
AML_004_r,,No result
AML_005,mono 7,mono 7
AML_006,inv(16),inv(16)
AML_007,normal,normal
AML_008,normal,normal
AML_009,t(9;11),MLL rearranged
AML_010,other clon abn,other clon abn


In [12]:
len(phenodf.finalgenotype.unique())

10

In [13]:
phenodf.finalgenotype.value_counts()

normal            30
MLL rearranged    25
other clon abn    23
No result         20
t(8;21)           19
inv(16)           12
mono 7             5
t(15;17)           4
sole+8             3
3q21q26            1
Name: finalgenotype, dtype: int64

In [14]:
# Set aside the samples labelled 'No result' or 'other clon abn'.
unknown_mask = phenodf['finalgenotype'].isin(['No result', 'other clon abn'])
unknowndf = phenodf[unknown_mask]

In [15]:
# Feature rows of the set-aside samples, matched on the second index level of X.
unknown_rows = X.index.get_level_values(1).isin(unknowndf.index)
Xun = X[unknown_rows]

In [16]:
# Drop the 'No result' / 'other clon abn' samples from the phenotype table.
excluded_mask = phenodf['finalgenotype'].isin(['No result', 'other clon abn'])
phenodf = phenodf[~excluded_mask]

In [17]:
phenodf.finalgenotype.value_counts()

normal            30
MLL rearranged    25
t(8;21)           19
inv(16)           12
mono 7             5
t(15;17)           4
sole+8             3
3q21q26            1
Name: finalgenotype, dtype: int64

In [18]:
# Keep only the feature rows whose sample id (second index level) is still in phenodf.
labelled_rows = X.index.get_level_values(1).isin(phenodf.index)
X = X[labelled_rows]

In [19]:
X.shape

(99, 406830)

In [20]:
# train_test_split pairs features and labels purely by row position, so make
# sure the labels are in the same sample order as the rows of X before splitting.
sample_ids = X.index.get_level_values(1)
aligned_labels = phenodf.finalgenotype
if not sample_ids.equals(aligned_labels.index):
    # Row order differs: realign the labels to the feature rows by sample id.
    aligned_labels = aligned_labels.reindex(sample_ids)
    if aligned_labels.isna().any():
        raise ValueError('Some rows of X have no matching label in phenodf')

# Hold out 33% of the samples; the seed is fixed so the split is reproducible.
Xtrain, Xtest, ytrain, ytest= train_test_split(X, aligned_labels, test_size = 0.33, random_state = 6)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

(66, 406830) (33, 406830) (66,) (33,)


In [21]:
# Restrict both splits to the columns listed in unionindices.
Xtrain, Xtest = (split[unionindices] for split in (Xtrain, Xtest))

In [22]:
Xtrain.shape

(66, 1300)

In [23]:
Xtest.shape

(33, 1300)

In [24]:
# Median imputation of missing values; the medians are learned from the
# training split only and then applied to both splits.
imp = SimpleImputer(missing_values=np.nan, strategy='median').fit(Xtrain)


def _impute_frame(frame):
    """Apply the fitted imputer and restore the frame's row and column labels."""
    return pd.DataFrame(imp.transform(frame), columns = frame.columns, index = frame.index)


XtrainN = _impute_frame(Xtrain)
XtestN = _impute_frame(Xtest)


In [25]:
# Turn each label set into a one-column DataFrame ('finalgenotype'), keeping its index.
ytrain, ytest = (
    pd.DataFrame(labels, columns = ['finalgenotype'], index = labels.index)
    for labels in (ytrain, ytest)
)


In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
# Encode the genotype labels as integers. The encoder is given the 1-D label
# column rather than the whole DataFrame: this avoids the column-vector
# conversion warning and keeps the cell re-runnable after extra columns
# (e.g. 'classes') have been added to ytrain / ytest.
encoder = LabelEncoder()
encoder.fit(ytrain['finalgenotype'])
encoded_ytrain = encoder.transform(ytrain['finalgenotype'])
encoded_ytest = encoder.transform(ytest['finalgenotype'])

# One-hot encode with an explicit class count so the train and test matrices
# always have the same number of columns, even if the test split happens to
# lack the highest-numbered class.
n_classes = len(encoder.classes_)
dummy_ytrain = np_utils.to_categorical(encoded_ytrain, num_classes=n_classes)
dummy_ytest = np_utils.to_categorical(encoded_ytest, num_classes=n_classes)
# Backward-compatible alias for the original (misspelled) variable name.
dummy_yest = dummy_ytest


  return f(**kwargs)


In [28]:
# Attach the integer class codes next to the text labels in both label frames.
for label_frame, class_codes in ((ytrain, encoded_ytrain), (ytest, encoded_ytest)):
    label_frame['classes'] = class_codes

In [29]:
ytrain

Unnamed: 0_level_0,finalgenotype,classes
public_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AML_110,inv(16),2
AML_051,inv(16),2
AML_100,t(8;21),7
AML_015,MLL rearranged,1
AML_123,inv(16),2
...,...,...
AML_047,t(8;21),7
AML_109,mono 7,3
AML_104,normal,4
AML_065,normal,4


### Re-define the baseline model function used during training so that the saved pipeline, which references it as its `build_fn`, can be loaded and populated with the saved pipeline hyperparameters

In [30]:
def baseline_model(input_dim = len(unionindices),
                   out = len(phenodf.finalgenotype.unique()),
                   activation = 'sigmoid',
                   dim1 = 100,
                   dim2 = 50,
                   optimizer = 'rmsprop'):
    """Build and compile a fully connected classifier with two hidden layers.

    The defaults for ``input_dim`` and ``out`` are computed once, when this
    function is defined, from ``unionindices`` and ``phenodf``.

    Parameters
    ----------
    input_dim : int
        Number of input features fed to the first layer.
    out : int
        Number of units in the softmax output layer.
    activation : str
        Activation used by both hidden layers.
    dim1, dim2 : int
        Number of units in the first and second hidden layer.
    optimizer : str
        Optimizer passed to ``model.compile``.

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy loss and
    reporting accuracy.
    """
    # NOTE(review): input_dim on the second layer looks redundant — confirm.
    network_layers = [
        Dense(dim1, input_dim=input_dim, activation=activation),
        Dense(dim2, input_dim=dim1, activation=activation),
        Dense(out, activation='softmax'),
    ]
    model = Sequential(network_layers)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model

In [31]:
from keras.models import load_model

# Restore the saved scikit-learn pipeline, then attach the Keras network
# (stored separately as an .h5 file) to its 'NeuralNetwork' step.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
bestmodel = joblib.load('./AML_data/sklearn_pipelineDLClassification.pkl')
network_step = bestmodel.named_steps['NeuralNetwork']
network_step.model = load_model('./AML_data/DLSubtype_ClassificationHyperoptAlldata.h5')



In [32]:
bestmodel.get_params()

{'memory': None,
 'steps': [('NeuralNetwork',
   <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x2254959b7f0>)],
 'verbose': False,
 'NeuralNetwork': <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier at 0x2254959b7f0>,
 'NeuralNetwork__verbose': 1,
 'NeuralNetwork__activation': 'sigmoid',
 'NeuralNetwork__batch_size': 8,
 'NeuralNetwork__dim1': 200,
 'NeuralNetwork__epochs': 100,
 'NeuralNetwork__build_fn': <function __main__.baseline_model(input_dim=1300, out=8, activation='sigmoid', dim1=100, dim2=50, optimizer='rmsprop')>}

### Permutation analysis: test whether the model performs better on the real data than on randomly generated data, to support the hypothesis that there is a real dependency between DNA methylation data and cytogenetic subtype

In [33]:
from sklearn.model_selection import permutation_test_score

In [34]:
# Note: the permutation test takes a very long time to run

In [35]:
#joblib.dump([score, perm_scores, pvalue], 'permutationscoresClassificationDL.pkl')