In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import subprocess
import numpy as np
from Bio import SeqIO

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD, Adam, Adadelta, RMSprop
from keras.layers import Conv1D, Dense, MaxPooling1D, Flatten, Dropout
from keras.layers import Embedding, GlobalAveragePooling1D, LSTM, SimpleRNN, GRU

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools

Using TensorFlow backend.


In [2]:
intr_file = '../data/hg19_intr_clean.fa'
depl_file = '../data/hg19_depl_clean.fa'

e = 0
intr_seqs = []
depl_seqs = []
for intr, depl in zip(SeqIO.parse(intr_file, 'fasta'), SeqIO.parse(depl_file, 'fasta')):
    
    step = 200; jump = 1; a = 0; b = step; n_jumps = 5
    for j in range(n_jumps):
        s_intr = str(intr.seq)[a:b]
        s_depl = str(depl.seq)[a:b]
        intr_seqs.append(s_intr)
        depl_seqs.append(s_depl)
        a = a + jump
        b = a + step
    
    e = e + 1
    if e%10000 == 0:
        print('Finished ' + str(e) + ' entries')
        
def getKmers(sequence, size):
    return [sequence[x:x+size].upper() for x in range(len(sequence) - size + 1)]
    
kmer = 10
intr_texts = [(getKmers(i, kmer)) for i in intr_seqs]
depl_texts = [(getKmers(i, kmer)) for i in depl_seqs]

Finished 10000 entries
Finished 20000 entries
Finished 30000 entries
Finished 40000 entries
Finished 50000 entries
Finished 60000 entries
Finished 70000 entries


In [4]:
merge_texts = intr_texts + depl_texts
labels = list(np.ones(len(intr_texts))) + list(np.zeros(len(depl_texts)))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(merge_texts)
encoded_docs = tokenizer.texts_to_sequences(merge_texts)
max_length = max([len(s.split()) for s in merge_texts])
X = pad_sequences(encoded_docs, maxlen = max_length, padding = 'post')

AttributeError: 'list' object has no attribute 'split'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size = 0.20, random_state = 42)

In [None]:
X_train1 = X_train[:200000]
X_test1 = X_test[:5000]
y_train1 = y_train[:200000]
y_test1 = y_test[:5000]

In [None]:
import tensorflow.keras
from IPython.display import clear_output

prts=[10] 

for prt in prts: 
    diffs=np.zeros((200-prt+1))
    for i in range(200-prt+1): 
        print(i , "/", X_test1.shape[1])
        X_eva_test=np.array(X_test1)
        
        reverse_word_map = pd.Series(dict(map(reversed, tokenizer.word_index.items()))) # inverted tokenizer
        X_tmp0=np.array(reverse_word_map[X_eva_test.reshape(X_eva_test.shape[0]*X_eva_test.shape[1])]).reshape(
            X_eva_test.shape[0],X_eva_test.shape[1]) # coverti tutto in kmers, serve aggiustare la shape

        X_tmp0=X_tmp0[:,range(0,X_tmp0.shape[1],kmer)]  # becca solo kmer che non overlap
        X_tmp0=np.array(list(np.sum(np.sum(X_tmp0,axis=1), axis=0))).reshape(X_tmp0.shape[0], 200) 
        #qua otteniamo una matrice di Nx200 nucleotidi
        
        range_helper=[k for k in range(0,X_tmp0.shape[0])]
        np.random.shuffle(range_helper) # shuffle
        X_tmp0[:,i:i+prt]=X_tmp0[range_helper,i:i+prt] # applica shuffle
        X_tmp0=[''.join(X_tmp0[j,:]) for j in range(X_tmp0.shape[0])] # recupera sequenza nucleotidi
        X_tmp0=[' '.join(getKmers(seq,kmer)) for seq in X_tmp0] # recupera sequenza kmers
    
        #tokenizer
        X_tmp0=tokenizer.texts_to_sequences(X_tmp0)
        X_tmp0= pad_sequences(X_tmp0, maxlen = max_length, padding = 'post')
        print(X_tmp0.shape)
        
        c=model.evaluate(X_tmp0,np.array(y_test1))[1]
        diffs[i]=(1-c) # 1 - accuracy = DeltaA
    
        clear_output(wait=True)
        
        print(" iteration ", i, " : perturbated res = ", c , "saved value = " , diffs[i])
        print("Debug print")
        print(X_eva_test[0][10:20]) 
        print(X_tmp0[0][10:20])
        print(X_tmp0.shape)

    np.savetxt("important_locs_nnn"+str(prt)+"_.txt", diffs)    
    plt.scatter(range(0,len(diffs)), diffs) 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from IPython.display import clear_output

intr_texts =np.array(intr_texts)
depl_texts =np.array(depl_texts) 

freqs_intr=[]

datao = np.loadtxt("important_locs.txt")
datao=np.convolve(datao,[1,2,3,4,5,6,7,8,9,10,9,8,7,6,5,4,3,2,1], "same")

# second derivative for C prescription
datao2=(datao[2:]-2*datao[1:-1]+datao[:-2])/sum(abs(datao))
datao2=np.array(np.convolve(datao2,[1,2,3,4,5,6,5,4,3,2,1], "same"))

datao=datao-min(datao)

#### C ####
datao2[datao2>0]=0
datao2[datao2<0]=1
datao2=np.concatenate((np.array([0.0,0.0]), datao2.astype(int)))
###########

plt.plot(range(len(datao)-2), (datao[2:]), label="all")
plt.legend()
plt.show()

In [None]:
#### A ####
mydicti=Counter()
for i in range(intr_texts.shape[1]):
    counts = Counter(intr_texts[:,i])
    for k in counts.keys():
        counts[k]*=datao[i]
    mydicti+=counts

#mydictd=Counter()    
for i in range(depl_texts.shape[1]):
    clear_output(wait=True)
    print(i)
    counts = Counter(depl_texts[:,i])
    for k in counts.keys():
        counts[k]*=datao[i]
    mydicti+=counts

In [None]:
print(np.array(mydicti.most_common(50))[:,0])

In [None]:
#### C ####
datao1=datao*datao2
datao1=datao1/sum(datao1)

mydicti=Counter()
for i in range(intr_texts.shape[1]):
    counts = Counter(intr_texts[:,i])
    for k in counts.keys():
        counts[k]*=datao1[i]
    mydicti+=counts

#mydictd=Counter()   
for i in range(depl_texts.shape[1]):
    clear_output(wait=True)
    print(i)
    counts = Counter(depl_texts[:,i])
    for k in counts.keys():
        counts[k]*=datao1[i]
    mydicti+=counts

In [None]:
print(np.array(mydicti.most_common(50))[:,0])

In [None]:
#### B ####
mydicti=Counter()
for i in range(intr_texts.shape[1]):
    counts = Counter(intr_texts[:,i])
    for k in counts.keys():
        counts[k]*=1
    mydicti+=counts

#mydictd=Counter()
for i in range(depl_texts.shape[1]):
    clear_output(wait=True)
    print(i)
    counts = Counter(depl_texts[:,i])
    for k in counts.keys():
        counts[k]*=1
    mydicti+=counts

In [None]:
print(np.array(mydicti.most_common(50))[:,0])

In [None]:
#### D #### 
maxpos=[]
maxdict=Counter() 

for i in range(len(datao)-1):
    if(datao[i]>datao[i-1] and datao[i]>datao[i+1]):
        maxpos.append(i)

for i in maxpos:
    counts = Counter(intr_texts[:,i])
    for k in counts.keys():
        counts[k]*=1
    maxdict+=counts
     
for i in maxpos:
    clear_output(wait=True)
    print(i)
    counts = Counter(depl_texts[:,i])
    for k in counts.keys():
        counts[k]*=1
    maxdict+=counts

In [None]:
print(np.array(maxdict.most_common(50)))