In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


Simple demo neural network classifier:

In [3]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense

num_classes = 2

def build_classifier(num_classes, num_dimensions=1781):

    classifier = Sequential([
        Dense(1024, activation='relu', input_shape=(num_dimensions, )),
        Dense(128, activation='relu'),
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    classifier.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy',])
    return classifier

In [4]:
import numpy as np

with open("ref_data/SRR_47normal_small.list") as normal_subjects_file:
    healthy_subjects = [line.strip() for line in normal_subjects_file]

with open("ref_data/SRR_50aml_small.list") as diseased_subjects_file:
    diseased_subjects = [line.strip() for line in diseased_subjects_file]

In [5]:
print(len(healthy_subjects), len(diseased_subjects))

47 50


In [6]:
def load_data(subjects_array, subdirectory="counts_DP8_dbsnp_GQ20_DP50_GQ90/"):
    a = []
    for person in subjects_array:
        try:
            with open(subdirectory + person + ".count") as person_file:
                vector = [line.strip().split()[-1] for line in person_file]
                a.append(np.array(vector))
        except FileNotFoundError:
            continue
    return a

X_healthy = load_data(healthy_subjects)
y_healthy = [0. for _ in range(len(X_healthy))]
X_diseased = load_data(diseased_subjects)
y_diseased = [1. for _ in range(len(X_diseased))]

In [7]:
print(len(X_healthy), len(X_diseased))
print(len(y_healthy), len(y_diseased))

30 47
30 47


In [8]:
X = np.array(X_healthy + X_diseased)
y = np.array(y_healthy + y_diseased)

In [9]:
num_people = len(X)

One-Hot Encode Output Labels:

In [10]:
from tensorflow.python.keras.utils import to_categorical

y1hot = to_categorical(y, num_classes=num_classes)

## K-Fold Cross Validation
Since we don't have many people represented in our data, we'll do leave-one-out cross validation:

In [11]:
from sklearn.model_selection import KFold
from tensorflow.python.keras.callbacks import ModelCheckpoint

kf = KFold(n_splits=num_people)
fold_number = 1
cvscores = []
for train_index, val_index in kf.split(X):
    print("Fold " + str(fold_number))
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y1hot[train_index], y1hot[val_index]
    classifier = build_classifier(num_classes=num_classes)
    filename = "fold{}.weights.hdf5".format(fold_number)
    model_checkpointer = ModelCheckpoint(filepath=filename, verbose=1, save_best_only=True)
    classifier.fit(X_train, y_train, batch_size=num_people, epochs=8, validation_data=(X_val, y_val), verbose=2, callbacks=[model_checkpointer])
    classifier.load_weights(filename)
    scores = classifier.evaluate(X_val, y_val, verbose=0)
    print(classifier.metrics_names)
    print(scores)
    cvscores.append(scores)
    del classifier
    fold_number += 1

Fold 1
Train on 76 samples, validate on 1 samples
Epoch 1/8
 - 1s - loss: 3.2544 - acc: 0.3553 - val_loss: 1.1052 - val_acc: 0.0000e+00

Epoch 00001: val_loss improved from inf to 1.10521, saving model to fold1.weights.hdf5
Epoch 2/8
 - 0s - loss: 2.7706 - acc: 0.6184 - val_loss: 1.0028 - val_acc: 0.0000e+00

Epoch 00002: val_loss improved from 1.10521 to 1.00275, saving model to fold1.weights.hdf5
Epoch 3/8
 - 0s - loss: 2.0761 - acc: 0.6316 - val_loss: 0.6116 - val_acc: 1.0000

Epoch 00003: val_loss improved from 1.00275 to 0.61163, saving model to fold1.weights.hdf5
Epoch 4/8
 - 0s - loss: 1.6724 - acc: 0.4605 - val_loss: 0.6264 - val_acc: 1.0000

Epoch 00004: val_loss did not improve
Epoch 5/8
 - 0s - loss: 1.0759 - acc: 0.5395 - val_loss: 0.6876 - val_acc: 1.0000

Epoch 00005: val_loss did not improve
Epoch 6/8
 - 0s - loss: 0.2819 - acc: 0.9211 - val_loss: 0.6953 - val_acc: 0.0000e+00

Epoch 00006: val_loss did not improve
Epoch 7/8
 - 0s - loss: 0.2481 - acc: 0.8684 - val_loss: 

In [12]:
accuracies = np.array(cvscores)[:, 1]

In [13]:
print("%.2f (+/- %.2f)" % (np.mean(accuracies), np.std(accuracies)))

0.99 (+/- 0.11)


##  Cross-validated accuracy is higher than guessing the most popular class (diseased -- 47 / (30 + 47) == 61%)