In [2]:
import os
import numpy as np
import pickle
#from tqdm.notebook import trange, tqdm

In [3]:
'''
This script builds a co-occurrence matrix from the list of COGs of an organism.
It takes a CSV file containing the list of COGs and the directory where the
COG files are stored.
It returns a numpy array.
'''
def iter_query_cog(query_orth):
    """Yield a two-element list for every data row of a ';'-separated file.

    The first line of ``query_orth`` is consumed as a header and never
    yielded. Every following line is split on ';' and the fields at
    positions 4 and 5 are yielded as ``[field_4, field_5]``; elsewhere in
    this file the pair is unpacked as ``cog_id, p_name``.

    Fields are returned exactly as they appear in the line (no stripping).
    A line with fewer than six fields raises ``IndexError``.
    """
    with open(query_orth) as handle:
        next(handle)  # discard the header row
        for row in handle:
            fields = row.split(';')
            cog_field, name_field = fields[4], fields[5]
            yield [cog_field, name_field]

def cog_2_g_list(cog_id,cog_dir):
    """Return the second column of ``<cog_dir>/COG_<cog_id>.csv`` as a list.

    The file's first line is skipped as a header. Each remaining line is
    split on ',' and the field at position 1 (used as the strain/genome
    name by the rest of this file) is collected, in file order and with
    duplicates kept.
    """
    with open(f'{cog_dir}/COG_{cog_id}.csv') as table:
        next(table)  # header line carries no strain
        return [record.split(',')[1] for record in table]

def p_dict_builder(query_orth,cog_dir):
    """Build a ``{p_name: g_list}`` dictionary for every row of the query file.

    For each ``(cog_id, p_name)`` pair produced by ``iter_query_cog`` the
    genome list returned by ``cog_2_g_list(cog_id, cog_dir)`` is stored under
    ``p_name``. When the same ``p_name`` appears more than once, the last
    occurrence wins.
    """
    return {
        p_name: cog_2_g_list(cog_id, cog_dir)
        for cog_id, p_name in iter_query_cog(query_orth)
    }

def index_setter(p_dict):
    """Derive the row and column labels of the matrix from ``p_dict``.

    Parameters
    ----------
    p_dict : dict
        Maps a protein name to an iterable of genome names (a list, or a
        dict whose keys are the genome names).

    Returns
    -------
    tuple
        ``(p_index, g_index)``: the protein names in ``p_dict`` order, and
        every distinct genome name in order of first appearance.

    The genome names used to be collected through a ``set``, whose iteration
    order for strings changes between interpreter sessions (hash
    randomisation), so the column order of the matrix was not reproducible.
    A dict is used instead because it keeps insertion order.
    """
    p_index = list(p_dict)
    first_seen = {}
    for g_names in p_dict.values():
        # update() never moves a key that is already present, so each genome
        # keeps the position of its first occurrence.
        first_seen.update(dict.fromkeys(g_names))
    g_index = list(first_seen)
    return p_index, g_index

def set_M_values(M,m_index,g_name,p_name,value):
    """Store ``value`` in ``M`` at the cell addressed by two names.

    ``m_index[0]`` is searched for ``p_name`` to get the row and
    ``m_index[1]`` for ``g_name`` to get the column; ``M`` is modified in
    place. ``list.index`` raises ``ValueError`` when a name is absent.
    """
    cell = (
        m_index[0].index(p_name),  # row: position of the protein
        m_index[1].index(g_name),  # column: position of the genome
    )
    M[cell] = value

def matrix_builder(query_orth,cog_dir,value):
    """Build, save and return the protein x genome co-occurrence matrix.

    Parameters
    ----------
    query_orth : str
        Path of the query file handed to ``p_dict_builder``.
    cog_dir : str
        Directory of the COG files handed to ``p_dict_builder``.
    value : scalar
        Written into every (protein, genome) cell where the genome appears
        in the protein's list; all other cells stay 0.

    Returns
    -------
    numpy.ndarray
        Float array with one row per entry of ``m_index[0]`` and one column
        per entry of ``m_index[1]``.

    Side effects: writes ``p_dict.pkl``, ``m_index.pkl`` and ``p_matrix.npy``
    into the current working directory, overwriting existing files.
    """
    # Dictionary of protein co-occurrences.
    p_dict = p_dict_builder(query_orth, cog_dir)
    with open('p_dict.pkl', 'wb') as file:
        pickle.dump(p_dict, file)

    # Row/column labels of the matrix.
    m_index = index_setter(p_dict)
    with open('m_index.pkl', 'wb') as file:
        pickle.dump(m_index, file)

    # Resolve names through dictionaries built once instead of running two
    # linear list.index() scans for every cell that is written.
    p_index, g_index = m_index[0], m_index[1]
    row_of = {name: pos for pos, name in enumerate(p_index)}
    col_of = {name: pos for pos, name in enumerate(g_index)}

    M = np.zeros((len(p_index), len(g_index)))
    for p_name, g_list in p_dict.items():
        row = row_of[p_name]  # constant for the whole inner loop
        for g_name in g_list:
            M[row, col_of[g_name]] = value

    with open('p_matrix.npy', 'wb') as f:
        np.save(f, M)
    return M

In [4]:
def score_p_dict_builder(score_file_dir):
    """Read every score file of a directory into a nested dictionary.

    Parameters
    ----------
    score_file_dir : str
        Directory whose entries are all read as comma-separated text files.
        A trailing path separator is accepted but not required.

    Returns
    -------
    dict
        ``{protein: {strain: score}}`` where ``protein`` is the file name
        without its ``_scores.txt`` suffix, ``strain`` is field 2 of a line
        and ``score`` is field 3 with newline characters stripped (kept as
        a string). Files are processed in sorted name order.

    Raises
    ------
    IndexError
        If a line has fewer than four comma-separated fields.
    """
    suffix = '_scores.txt'
    dico_prot = {}
    # Sorted so that the resulting protein order does not depend on the
    # arbitrary order in which os.listdir() reports directory entries.
    for file_name in sorted(os.listdir(score_file_dir)):
        # str.strip('_scores.txt') treats its argument as a set of characters
        # and eats them from BOTH ends (e.g. 'tox_scores.txt' became ''), so
        # the suffix is removed explicitly instead.
        if file_name.endswith(suffix):
            protein = file_name[:-len(suffix)]
        else:
            protein = file_name
        dico_strain = {}
        with open(os.path.join(score_file_dir, file_name), 'r') as handle:
            for l in handle:
                buffer = l.split(',')
                strain = buffer[2]
                score = buffer[3].strip('\n')
                dico_strain[strain] = score
        dico_prot[protein] = dico_strain
    return dico_prot

In [52]:
def score_matrix_builder(score_file_dir):
    """Build, save and return the protein x genome score matrix.

    The score dictionary and the matrix index are cached in the current
    working directory as ``p_dict_score.pkl`` and ``m_index_score.pkl``: when
    a cache file exists it is loaded, otherwise the value is computed (with
    ``score_p_dict_builder`` / ``index_setter``) and the cache is written.
    The matrix itself is always recomputed and written to
    ``p_matrix_score.npy``.

    Parameters
    ----------
    score_file_dir : str
        Directory passed to ``score_p_dict_builder`` when no cached
        dictionary exists.

    Returns
    -------
    numpy.ndarray
        Float array with one row per entry of ``m_index[0]`` and one column
        per entry of ``m_index[1]``; cells without a score stay 0.

    Raises
    ------
    ValueError
        If a protein or genome of the dictionary is absent from the index
        (typically a stale cache file), or if a score cannot be converted
        to a float.
    """
    p_dict = _load_or_build_pickle(
        'p_dict_score.pkl', lambda: score_p_dict_builder(score_file_dir))
    m_index = _load_or_build_pickle(
        'm_index_score.pkl', lambda: index_setter(p_dict))

    # Resolve names through dictionaries built once instead of running two
    # linear list.index() scans for every cell that is written.
    p_index, g_index = m_index[0], m_index[1]
    row_of = {name: pos for pos, name in enumerate(p_index)}
    col_of = {name: pos for pos, name in enumerate(g_index)}

    M = np.zeros((len(p_index), len(g_index)))
    try:
        for p_name, g_scores in p_dict.items():
            row = row_of[p_name]  # constant for the whole inner loop
            for g_name, score in g_scores.items():
                M[row, col_of[g_name]] = score
    except KeyError as missing:
        # Keep the exception type list.index() used to raise for this case.
        raise ValueError(
            f'{missing.args[0]!r} is not in the matrix index; '
            'the cached m_index_score.pkl may be stale'
        ) from missing

    with open('p_matrix_score.npy', 'wb') as f:
        np.save(f, M)
    return M


def _load_or_build_pickle(cache_path, build):
    """Return the object pickled at ``cache_path``, or build and cache it."""
    if os.path.isfile(cache_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, 'rb') as handle:
            return pickle.load(handle)
    built = build()
    with open(cache_path, 'wb') as handle:
        pickle.dump(built, handle)
    return built

# Binary matrix

In [None]:
%%time
# Inputs of the binary matrix: the ortholog-group file of the query organism
# and the directory holding the COG files. Both are absolute paths on one
# specific machine and must be adapted before running elsewhere.
query_orth ='/Users/mdupuy/Documents/Stage/Pseudomonas_aeruginosa_PA7_119_ortholog_groups.csv'
cog_dir='/Users/mdupuy/Documents/Stage/All_COG_groups/'
# Every (protein, genome) cell that is present receives the value 1.
# matrix_builder also writes p_dict.pkl, m_index.pkl and p_matrix.npy into the
# working directory; the returned array is not bound to a name in this cell.
matrix_builder(query_orth,cog_dir,1)

# Score Matrix

In [53]:
%%time
# Directory of the score files; an absolute path on one specific machine that
# must be adapted before running elsewhere.
score_file_dir = '/Users/mdupuy/Documents/Stage/Parser/Scores/'
# Builds the score matrix, reusing p_dict_score.pkl / m_index_score.pkl from
# the working directory when they already exist.
score_matrix = score_matrix_builder(score_file_dir)

CPU times: user 2min 58s, sys: 1.61 s, total: 2min 59s
Wall time: 3min 2s


In [23]:
def svd(M,threshold):
    '''
    Denoise a score profile matrix with a truncated SVD.

    Each row of M is divided by its own maximum (NaN produced by the
    division is replaced through np.nan_to_num), the singular values from
    position `threshold` onwards are set to zero, the matrix is rebuilt from
    the remaining components, and every column of the result is divided by
    its Euclidean norm. Progress messages are printed along the way.
    '''
    row_peak = np.amax(M, axis=1)
    row_scaled = np.nan_to_num((M.T / row_peak).T)
    print("first normalisation")
    left, sigma, right_t = np.linalg.svd(row_scaled, full_matrices=False)
    sigma[threshold:] = 0.0  # drop the trailing components
    rebuilt = np.dot(left * sigma, right_t)
    print("svd")
    col_length = np.linalg.norm(rebuilt, axis=0, keepdims=True)
    unit_columns = rebuilt / col_length
    print("second normalisation")
    print("done")
    return unit_columns

In [42]:
def npp(M):
    """Normalise a score matrix: zero fill, row scaling, reciprocal, z-score.

    Steps, applied to a private float copy of ``M``:

    1. every 0 is replaced by the smallest non-zero value of the matrix, to
       prevent artefacts in the reciprocal below;
    2. each row is divided by its own maximum;
    3. the element-wise reciprocal is taken (monotonic transformation);
    4. each column of the transformed matrix is z-scored (population
       standard deviation, ``ddof=0``).

    Parameters
    ----------
    M : array_like
        Two-dimensional score matrix. It is not modified.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``M``. A column whose transformed values
        are all equal has zero standard deviation and yields NaN.

    Raises
    ------
    ValueError
        If ``M`` contains no non-zero value (no minimum can be taken).

    Notes
    -----
    Previously the argument was overwritten by the global ``score_matrix``
    (which was also mutated in place), and the final z-score was applied to
    the raw matrix instead of the transformed one.
    """
    scores = np.array(M, dtype=float)  # copy: the caller's matrix stays intact

    # Replace zeros with the smallest non-zero score to prevent artefacts.
    minval = np.min(scores[np.nonzero(scores)])
    scores[scores == 0] = minval

    # Normalise each row by its maximum.
    M_xmax = np.amax(scores, axis=1, keepdims=True)
    M_lnorm = scores / M_xmax

    # Monotonic transformation.
    M_transf = np.reciprocal(M_lnorm)

    # Column-wise z-score of the transformed matrix.
    M_ymean = np.mean(M_transf, axis=0, keepdims=True)
    M_ystd = np.std(M_transf, ddof=0, axis=0, keepdims=True)
    return np.divide(np.subtract(M_transf, M_ymean), M_ystd)

[[0.00000000e+00 6.67190749e-01 7.16478705e-01 ... 7.03611945e-01
  7.17104513e-01 7.85293724e-01]
 [6.67190749e-01 0.00000000e+00 2.37613320e-03 ... 2.25606867e-03
  1.70018793e-03 8.49334288e-03]
 [7.16478705e-01 2.37613320e-03 0.00000000e+00 ... 1.20317801e-03
  7.47370580e-04 3.58443378e-03]
 ...
 [7.03611945e-01 2.25606867e-03 1.20317801e-03 ... 0.00000000e+00
  8.60653430e-04 4.42645866e-03]
 [7.17104513e-01 1.70018793e-03 7.47370580e-04 ... 8.60653430e-04
  0.00000000e+00 2.88912517e-03]
 [7.85293724e-01 8.49334288e-03 3.58443378e-03 ... 4.42645866e-03
  2.88912517e-03 0.00000000e+00]]


# Distance Matrix

In [None]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
def hamming(coocurencies_matrix):
    """Return the square matrix of pairwise Hamming distances between rows.

    The condensed distances computed by ``pdist(..., metric='hamming')`` are
    expanded with ``squareform``; the resulting square array is pickled to
    ``hd_matrix.pkl`` in the current working directory and then returned.
    """
    condensed = pdist(coocurencies_matrix, metric='hamming')
    square = squareform(condensed)
    with open('hd_matrix.pkl', 'wb') as sink:
        pickle.dump(square, sink)
    return square

In [None]:
# NOTE(review): `M` is not assigned at top level by any cell shown in this
# notebook — presumably it is the binary matrix returned by matrix_builder;
# confirm and bind it to a name before running this cell.
d_M = hamming(M)