In [110]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, SimpleRNN, LSTM, GRU, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample

In [4]:
# C. elegans sequence file, read without a header row; the two columns are
# named explicitly.
# NOTE(review): the following cell rebinds `df` to the human DNA split, so on a
# top-to-bottom run this frame is replaced before anything uses it.
df = pd.read_csv(
    '../data/raw/exercise_data/C_elegans_acc_seq.csv',
    names=['labels', 'sequences'],
    header=None,
)

In [53]:
# Human DNA training split; column names come from the file's own header row.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the saved index was written into the CSV -- `index_col=0` would
# absorb it; confirm before changing.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/exercise_data/human_dna_train_split.csv',
)

In [54]:
# Peek at the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,sequences,labels
0,TTGTGTCCTACTTTTGTCCATTTGGAAAAATAATTGCATGACTACA...,-1
1,CTTTCCTTTATTTCTTCGTCAACTTAATATCCTTAGCAAAACAGGA...,-1
2,TACTTAAGAGGGGTAAGAAATATATAAACTAGTGCAACATTTTTCA...,-1
3,TAGGTTTCCAAGCAGCCCATTCCTGCCTGGCACCACAGGGATCCAT...,-1
4,GCATGAGCCACTGCGCCTGGCCTGGTTCATTGCTTCTTAGTGATGC...,-1


In [55]:
def get_debug_model(splice_length):
    '''
    Build the small 1-D convolutional network (model_id: debug).

    Layer stack, in order: two same-padded Conv1D layers (32 filters,
    kernel size 5, ReLU), one MaxPooling1D (window 5, stride 2), Flatten,
    a 64-unit ReLU Dense layer and a single sigmoid output unit.

    Parameters
    ----------
    splice_length : int
        Number of positions per input sequence; the first layer is declared
        with input shape (splice_length, 1).

    Returns
    -------
    The assembled, not yet compiled, Sequential model. The model summary is
    printed as a side effect.
    '''
    # Both convolutions share the same hyper-parameters.
    conv_kwargs = dict(filters=32, kernel_size=5, padding='same', activation='relu')

    stack = [
        Conv1D(input_shape=(splice_length, 1), **conv_kwargs),
        Conv1D(**conv_kwargs),
        MaxPooling1D(pool_size=5, strides=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.summary()
    return model

In [90]:
def encode_seq(seq):
    '''
    Translate a nucleotide string into a list of integer codes.

    Mapping: A -> 0, C -> 1, G -> 2, T -> 4. Any character outside
    these four upper-case letters raises KeyError.

    NOTE(review): T is coded as 4 rather than 3, so the codes are not evenly
    spaced; presumably this pairs with the division by 4 applied in
    preprocess_sequences -- confirm it is intended.
    '''
    codes = dict(zip('ACGT', (0, 1, 2, 4)))
    encoded = []
    for base in seq:
        encoded.append(codes[base])
    return encoded

def preprocess_sequences(df):
    '''
    Convert a frame of sequences and labels into model-ready arrays.

    The input frame is left untouched: labels are remapped on a fresh array
    instead of writing through ``df.labels.values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``sequences`` column of equal-length nucleotide strings
        (see ``encode_seq`` for the accepted alphabet) and a ``labels`` column.

    Returns
    -------
    data : numpy.ndarray, shape (n_samples, splice_length, 1)
        Integer codes from ``encode_seq`` divided by 4.
    labels : numpy.ndarray of int8
        The label column with every -1 replaced by 0.
    splice_length : int
        Length of each encoded sequence.

    Raises
    ------
    ValueError
        If the sequences do not all share the same length.
    '''
    encoded = [encode_seq(seq) for seq in df.sequences.values]

    # A ragged batch cannot be stacked into a single (n, length, 1) array;
    # report that directly instead of failing later inside reshape.
    lengths = {len(row) for row in encoded}
    if len(lengths) > 1:
        raise ValueError(
            'all sequences must have the same length, got lengths {}'.format(sorted(lengths))
        )

    data = np.array(encoded)
    # Trailing axis of size 1 = the single input channel expected by Conv1D.
    data = data.reshape((data.shape[0], data.shape[1], 1))

    # np.where allocates a new array, so the caller's DataFrame keeps its
    # original -1/1 labels (and read-only label buffers do not break this).
    raw_labels = df.labels.values
    labels = np.where(raw_labels == -1, 0, raw_labels)

    splice_length = len(data[0])
    return data / 4, np.int8(labels), splice_length

In [68]:
# Build the model inputs in this cell: `data`, `labels` and `splice_length`
# are not assigned in any other visible cell, so without this call a fresh
# kernel run stops here with a NameError.
data, labels, splice_length = preprocess_sequences(df)

# 60/40 split, stratified on the labels so both parts keep the same class
# ratio; the fixed random_state makes the split repeatable.
train, test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4, stratify=labels, random_state=42
)

In [107]:
# Per-class sample counts in the training labels, ordered by sorted class value
# (divide by len(y_train) to get class fractions instead).
np.unique(y_train, return_counts=True)[-1]

array([299117,    883])

In [104]:
# Inverse-frequency class weights, rescaled so the most frequent class gets 1.
# The numerator now uses the same split the counts come from (y_train rather
# than y_test); the constant cancels in the rescaling, so the printed values
# are unchanged.
weights = len(y_train) / np.unique(y_train, return_counts=True)[1]
weights = weights / weights.min()
print(weights)

[  1.         338.75084938]


In [108]:
def downsample(data, labels, N_per_class, seed=42):
    '''
    Downsample every class to exactly N_per_class samples.

    Samples are drawn without replacement, separately for each class, using
    the same seed for every class.
    Returned data is NOT shuffled: the output is grouped by class, in the
    sorted order of the class values.

    Parameters
    ----------
    data : array-like
        Samples; the first axis indexes samples.
    labels : array-like
        One class label per sample.
    N_per_class : int
        Number of samples to keep from each class.
    seed : int
        Passed to ``resample`` as ``random_state``.

    Returns
    -------
    data_downsampled : numpy.ndarray
        The sampled rows of every class, stacked along the first axis.
    labels_downsampled : numpy.ndarray
        Matching labels, N_per_class entries per class.

    Raises
    ------
    ValueError
        If any class has fewer than N_per_class samples.
    '''
    # Accept plain lists as well: the boolean mask below needs real arrays.
    data = np.asarray(data)
    labels = np.asarray(labels)
    classes, counts = np.unique(labels, return_counts=True)

    # Check every class before sampling anything, so the error names the
    # class that is too small instead of surfacing from inside resample.
    for c, n in zip(classes, counts):
        if n < N_per_class:
            raise ValueError(
                'class {} has only {} samples; cannot draw {} without replacement'.format(
                    c, n, N_per_class
                )
            )

    data_downsampled = []
    labels_downsampled = []
    for c in classes:
        data_sampled = resample(data[labels == c],
                                replace=False,
                                n_samples=N_per_class,
                                random_state=seed)
        data_downsampled.append(data_sampled)
        labels_downsampled.append(np.ones(N_per_class, np.int8) * c)

    data_downsampled = np.vstack(data_downsampled)
    labels_downsampled = np.hstack(labels_downsampled)
    return data_downsampled, labels_downsampled

In [105]:
# scikit-learn's 'balanced' heuristic: n_samples / (n_classes * per-class count).
w = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
w

array([  0.50147601, 169.87542469])

In [111]:
# Balance the training set by cutting every class down to the size of the
# rarest one. The target is derived from the labels instead of hard-coding
# the minority count, so the cell keeps working if the split changes.
# NOTE: this rebinds `train` / `y_train` to the balanced subset.
n_minority = int(np.unique(y_train, return_counts=True)[1].min())
train, y_train = downsample(train, y_train, n_minority)

In [125]:
# Instantiate the debug CNN for the sequence length found during preprocessing;
# the call also prints the layer summary.
model = get_debug_model(splice_length=splice_length)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_11 (Conv1D)           (None, 398, 32)           192       
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 398, 32)           5152      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 197, 32)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 6304)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                403520    
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
Total params: 408,929
Trainable params: 408,929
Non-trainable params: 0
________________________________________________

In [126]:
# Binary cross-entropy optimised with Adam; accuracy and AUC are tracked.
# NOTE(review): the model is built from `keras` layers while the AUC metric
# comes from `tf.keras` -- confirm the two are meant to be mixed.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

In [127]:
# Train on the balanced subset; validation runs on the full held-out split
# after every epoch. The returned History object is the cell's output.
model.fit(
    x=train,
    y=y_train,
    batch_size=16,
    epochs=10,
    validation_data=(test, y_test),
)

Train on 1766 samples, validate on 200000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f38d3c9ff50>

In [128]:
# `Sequential.predict_classes` is gone from newer Keras/TensorFlow releases.
# For a single sigmoid output it was a 0.5 threshold on `predict`, so this
# gives the same int32 labels of shape (n_samples, 1) on any version.
preds = (model.predict(test, verbose=1) > 0.5).astype('int32')



In [129]:
# Per-class precision / recall / F1 on the held-out split; print() keeps the
# report's line breaks instead of showing the string's repr.
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       1.00      0.73      0.85    199412
           1       0.01      0.88      0.02       588

    accuracy                           0.73    200000
   macro avg       0.50      0.81      0.43    200000
weighted avg       1.00      0.73      0.84    200000



In [130]:
# Confusion matrix normalised over the true labels: each row sums to 1, so the
# diagonal holds the per-class recall.
print(confusion_matrix(y_true=y_test, y_pred=preds, normalize='true'))

[[0.73269914 0.26730086]
 [0.11904762 0.88095238]]
