# Motif Comparision

#### This file is used to compared motifs learned or selected by Lasso, GLM of Logit and CNN

In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [2]:
# Some helper functions
def convertMotif(motif):
    '''Convert motif into matrix'''
    #One hot encoding 
    sequence_vector = []
    for c in motif:
        if c == 'A':
            sequence_vector.append([1,0,0,0])
        elif c == 'C':
            sequence_vector.append([0,1,0,0])
        elif c == 'G':
            sequence_vector.append([0,0,1,0])
        else:
            sequence_vector.append([0,0,0,1])
    #print(np.array(sequence_vector).shape)
    return np.array(sequence_vector)
def convertMatrix(matrix):
    '''Convert matrix into motifs'''
    m = matrix.T
    motif = ''
    for i in m:
        index = np.where(i==max(i))[0]
        if index == 0:
            motif += 'A'
        elif index == 1:
            motif += 'C'
        elif index == 2:
            motif += 'G'
        else:
            motif += 'T'
    return motif

def normalize(m):
    '''Normalize Matrix'''
    new_matrix = []
    for i in range(m.shape[1]):
        col = m[:,i]
        s = sum(col)
        col = col/s
        new_matrix.append(col)
    new_matrix = np.array(new_matrix).T
    return new_matrix

## First, Let's compare GLM and LASSO

In [34]:
# Load the weights we trained From CNN neural network
CNN_weight = np.load('CNN_weight.npy')

In [35]:
# We want to take care of motifs that has a negative effect on DNA methylation. 
# Therefore we take the absolute value of each PWM
PWM = [normalize(m) for m in CNN_weight]

In [36]:
# Read in DNA Motifs
Kmers6 = [m[0] for m in pd.read_csv("../data/Kmers_K6_sequences.csv",header = None).as_matrix()]

In [37]:
motifs = [convertMotif(m).T for m in Kmers6]

In [48]:
CNN = []
for i,m in enumerate(motifs):
    product = []
    for p in PWM:
        s = np.multiply(m,p)
        product.append(sum(sum(s)))
    CNN.append( (i,max(product)) )
CNN.sort(key = lambda x: x[1], reverse = True)
CNN_order = [s[0] for s in CNN]
CNN_dict = {}
for s in CNN:
    CNN_dict[s[0]] = s[1]

In [26]:
# Read in GLM Params
GLM = np.load('GLM_Binomial.npy')
GLM = sorted([(i,p) for i,p in enumerate(GLM) if i != 2080], key = lambda x: x[1],reverse=True)

In [27]:
# Read in LASSO Params
LASSO = np.load('Lasso_params.npy')
LASSO = sorted([i for i,p in enumerate(LASSO) if i != 2080], key = lambda x: x[1], reverse = True)

In [44]:
GLM_order = [i[0] for i in GLM]
LASSO_order = [i[0] for i in LASSO]
GLM_dict = {}
LASSO_dict = {}
for i in range(len(LASSO)):
    LASSO_dict[LASSO[i][0]] = LASSO[i][1]
    GLM_dict[GLM[i][0]] = GLM[i][1]

In [32]:
# Compare how much these two lists containing the same order
def compare_two_lists(list1,list2):
    num_match = 0
    for i,motif in enumerate(list1):
        previous_1 = list1[:i]
        j = list2.index(motif)
        if i != j:
            continue
        previous_2 = list2[:j]
        if set(previous_1) == set(previous_2):
            num_match += 1
    return num_match*100/len(list2)      

In [47]:
print('Check if three lists\' order matches')
print("LASSO vs GLM: ",compare_two_lists(LASSO_order,GLM_order))
print("GLM vs CNN: ", compare_two_lists(CNN_order,GLM_order))
print("LASSO vs CNN: ", compare_two_lists(CNN_order,LASSO_order))

Check if three lists' order matches
LASSO vs GLM:  0.0
GLM vs CNN:  0.0
LASSO vs CNN:  0.0


In [50]:
def motif_importance_comparison(dict1,dict2):
    num_agree = 0
    num_total = 0
    for i in range(len(dict1)):
        for j in range(i+1,len(dict1)):
            if not (dict1[i] >= dict1[j] != dict2[i] >= dict2[j]):
                num_agree +=1
            num_total += 1
    return num_agree*100/num_total

In [54]:
print("Enumerate each pair of motifs, and check if their weights order in different classifier match")
print("LASSO vs GLM: {0}%".format(motif_importance_comparison(LASSO_dict,GLM_dict)))
print("GLM vs CNN: {0}%".format(motif_importance_comparison(GLM_dict,CNN_dict)))
print("LASSO vs CNN: {0}%".format(motif_importance_comparison(CNN_dict,LASSO_dict)))

Enumerate each pair of motifs, and check if their weights order in different classifier match
LASSO vs GLM: 62.862045362045365%
GLM vs CNN: 78.24545824545825%
LASSO vs CNN: 74.67957967957967%
