In [1]:
from Bio import SeqIO

# Input FASTA of gene sequences and the directory it is read from.
gene_file = 'hg19_gene_clean.fa'
data_dir = '/home/ubuntu/data/'

# Every kept gene is truncated to its first `cutoff` characters; records
# shorter than that are skipped. Defined once, outside the loop, because it
# never changes between records.
cutoff = 200

e = 0            # number of genes kept so far (drives the progress message)
gene_seqs = []   # truncated sequences, index-aligned with gene_ids
gene_ids = []    # FASTA record ids of the kept genes
for gene in SeqIO.parse(data_dir + gene_file, 'fasta'):
    # Materialise the sequence as a plain string once and reuse it for both
    # the length check and the truncation.
    full_seq = str(gene.seq)
    if len(full_seq) < cutoff:
        continue

    gene_ids.append(str(gene.id))
    gene_seqs.append(full_seq[:cutoff])

    e += 1
    if e % 10000 == 0:
        print('Finished ' + str(e) + ' genes')

def getKmers(sequence, size):
    """Return every overlapping window of length ``size`` from ``sequence``.

    Windows are taken at consecutive start offsets 0, 1, 2, ... and each one
    is upper-cased. When ``size`` is larger than ``len(sequence)`` there are
    no start offsets and the result is an empty list.

    Args:
        sequence: The string to slice.
        size: Window (k-mer) length.

    Returns:
        A list of upper-cased substrings, in left-to-right order.
    """
    window_count = len(sequence) - size + 1
    kmers = []
    for start in range(window_count):
        window = sequence[start:start + size]
        kmers.append(window.upper())
    return kmers

# Turn each gene into one "sentence": its overlapping k-mers joined by single
# spaces, so that a text tokenizer can treat every k-mer as a word.
kmer = 10
gene_texts = []
for seq in gene_seqs:
    gene_texts.append(' '.join(getKmers(seq, kmer)))

Finished 10000 genes
Finished 20000 genes
Finished 30000 genes


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import warnings
warnings.filterwarnings('ignore')

# Build an integer vocabulary over the k-mer "words" and encode every gene as
# a sequence of vocabulary indices.
# NOTE(review): the vocabulary is fitted on gene_texts here, yet this notebook
# later runs a model whose weights are loaded from a checkpoint. The indices
# only line up with the checkpoint's embedding rows if the tokenizer used at
# training time produced the same word -> index mapping -- confirm, or load
# the tokenizer that was saved with the model instead of re-fitting.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(gene_texts)
encoded_docs = tokenizer.texts_to_sequences(gene_texts)

# Pad at the end of each row up to the token count of the longest document.
max_length = max(len(text.split()) for text in gene_texts)
X_gene = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

print(X_gene)
print('\n')
print(X_gene.shape)

Using TensorFlow backend.


[[547859 642875 642876 ... 125875 110809  77277]
 [253351 294309 164774 ...  31487 125876 218639]
 [164781  61751 110810 ... 253371 218650 294330]
 ...
 [300782 349547 453994 ...  14457  10644  10929]
 [ 63780 142785 252925 ...  53054 543738 324349]
 [273128 360570 109937 ... 219163  82309 120347]]


(35549, 191)


In [3]:
# Number of distinct tokens the tokenizer has indexed, plus one.
# presumably the +1 leaves room for index 0, which is not assigned to any
# token and is what the padded positions contain -- confirm against Keras docs
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

891013


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD, Adam, Adadelta
from tensorflow.keras.layers import Conv1D, LSTM, Dense, MaxPooling1D, Flatten, Dropout, Embedding, Activation, Bidirectional

# The layer sizes below are dictated by the checkpoint that is loaded, not by
# the data in this notebook: load_weights only succeeds when the architecture
# has the same shapes as the network that wrote the file.
PRETRAINED_VOCAB_SIZE = 944238   # Embedding input_dim (number of rows)
EMBEDDING_DIM = 10
LSTM_UNITS = 10
HIDDEN_UNITS = 10
WEIGHTS_PATH = "LSTM.weights.best.hdf5"

# Guard: every token index fed to the Embedding layer must address an
# existing row, i.e. be strictly smaller than input_dim. Fail early with a
# clear message instead of at (or silently during) prediction.
if X_gene.size and int(X_gene.max()) >= PRETRAINED_VOCAB_SIZE:
    raise ValueError(
        'X_gene contains token index ' + str(int(X_gene.max())) +
        ' but the pretrained Embedding only has ' +
        str(PRETRAINED_VOCAB_SIZE) + ' rows'
    )

model = Sequential()
model.add(Embedding(PRETRAINED_VOCAB_SIZE, EMBEDDING_DIM))
model.add(Bidirectional(LSTM(LSTM_UNITS)))
model.add(Dense(HIDDEN_UNITS, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))   # single probability output

model.load_weights(WEIGHTS_PATH)
model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          9442380   
_________________________________________________________________
bidirectional (Bidirectional (None, 20)                1680      
_________________________________________________________________
dense (Dense)                (None, 10)                210       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 9,444,281
Trainable params: 9,444,281
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Run inference once. The network ends in a single sigmoid unit, so the raw
# output of predict() is already the probability of the positive class.
# (Sequential.predict_classes / predict_proba are deprecated and absent from
# newer TensorFlow releases, and calling both ran the forward pass twice.)
gene_predictions_prob = model.predict(X_gene)

# Hard labels: the same rule predict_classes used for a one-unit sigmoid
# output -- probability above 0.5 becomes class 1 -- stored as int32.
gene_predictions = (gene_predictions_prob > 0.5).astype('int32')

In [6]:
# Sanity check of the encoded input: (rows = genes, columns = padded tokens).
X_gene.shape

(35549, 191)

In [15]:
# Peek at the hard class labels of the first ten genes.
gene_predictions[:10]

array([[0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0]], dtype=int32)

In [14]:
# Peek at the model's output scores for the first ten genes.
gene_predictions_prob[:10]

array([[2.2742575e-01],
       [9.9997020e-01],
       [9.9474698e-01],
       [9.9999785e-01],
       [9.9999785e-01],
       [5.9222716e-09],
       [9.9987686e-01],
       [2.4735549e-04],
       [5.9581845e-07],
       [5.9581845e-07]], dtype=float32)

In [9]:
import pandas as pd

# One row per gene: its FASTA id, the hard class label and the model score,
# ordered from the highest score to the lowest.
pred_columns = {
    'Gene': gene_ids,
    'Predict': list(gene_predictions.flatten()),
    'Prob': list(gene_predictions_prob.flatten()),
}
gene_pred_df = pd.DataFrame(pred_columns).sort_values(['Prob'], ascending = False)

# Display only the positive calls whose score exceeds 0.8.
is_positive = gene_pred_df['Predict'] == 1
is_confident = gene_pred_df['Prob'] > 0.8
gene_pred_df[is_positive & is_confident]

Unnamed: 0,Gene,Predict,Prob
6191,chr11:89657232-89666229,1,1.000000
18041,chr2:190628585-190648719,1,1.000000
14403,chr19:1952526-1954548,1,1.000000
26601,chr6:35762759-35765100,1,1.000000
875,chr1:45097640-45117395,1,1.000000
...,...,...,...
7060,chr12:11090853-11091806,1,0.802003
34681,chrX:101967376-102008468,1,0.801439
34679,chrX:101967376-101972661,1,0.801439
34680,chrX:101967376-101979789,1,0.801439


In [17]:
# First ten rows of the positive calls scoring above 0.8; gene_pred_df is
# already sorted by descending 'Prob', so these are the highest-scoring ones.
gene_pred_df[(gene_pred_df['Predict'] == 1) & (gene_pred_df['Prob'] > 0.8)].head(10)

Unnamed: 0,Gene,Predict,Prob
6191,chr11:89657232-89666229,1,1.0
18041,chr2:190628585-190648719,1,1.0
14403,chr19:1952526-1954548,1,1.0
26601,chr6:35762759-35765100,1,1.0
875,chr1:45097640-45117395,1,1.0
26602,chr6:35762759-35765121,1,1.0
23619,chr4:140211071-140223705,1,1.0
33536,chr9:134305477-134375575,1,1.0
8252,chr12:109886460-109915105,1,1.0
26606,chr6:35911291-35992277,1,1.0
