In [1]:
import os
import numpy as np
import pickle

In [2]:
'''
This script builds a co-occurrence matrix from the list of COGs of an organism.
It takes a CSV file containing the list of COGs and the directory where the
COG files are stored.
It returns a numpy array.
'''
def iter_query_cog(query_orth):
    """Iterate over the data rows of a ';'-separated ortholog table.

    The first line of the file is treated as a header and skipped. For every
    following line, the fields at positions 4 and 5 (0-based) are yielded as a
    two-element list; ``p_dict_builder`` unpacks them as ``cog_id, p_name``.

    Parameters
    ----------
    query_orth : path of the ';'-separated text file to read.

    Yields
    ------
    list
        ``[field_4, field_5]`` for each line after the header. The fields are
        the raw split pieces; nothing is stripped.
    """
    with open(query_orth) as handle:
        # Discard the header line.
        next(handle)
        for row in handle:
            columns = row.split(';')
            fourth, fifth = columns[4], columns[5]
            yield [fourth, fifth]

def cog_2_g_list(cog_id,cog_dir):
    """Return the second column of every data row of one COG file.

    The file read is ``<cog_dir>/COG_<cog_id>.csv``. Its first line is skipped
    as a header; each remaining line is split on ',' and the field at position
    1 (0-based) is collected.

    Parameters
    ----------
    cog_id : identifier inserted into the file name.
    cog_dir : directory containing the ``COG_<id>.csv`` files.

    Returns
    -------
    list
        One entry per data row, in file order (duplicates are kept).
    """
    with open(f'{cog_dir}/COG_{cog_id}.csv') as handle:
        # Discard the header line.
        next(handle)
        return [row.split(',')[1] for row in handle]

def p_dict_builder(query_orth,cog_dir):
    """Map each name from the query table to the entries of its COG file.

    For every ``(cog_id, p_name)`` pair produced by ``iter_query_cog``, the
    list returned by ``cog_2_g_list(cog_id, cog_dir)`` is stored under
    ``p_name``. If the same ``p_name`` occurs more than once, the last
    occurrence wins.

    Parameters
    ----------
    query_orth : path of the query table, passed to ``iter_query_cog``.
    cog_dir : directory of the COG files, passed to ``cog_2_g_list``.

    Returns
    -------
    dict
        ``{p_name: list returned by cog_2_g_list}``.
    """
    return {
        p_name: cog_2_g_list(cog_id, cog_dir)
        for cog_id, p_name in iter_query_cog(query_orth)
    }

def index_setter(p_dict):
    """Build the row and column labels of a profile matrix.

    Parameters
    ----------
    p_dict : dict
        Maps a row label to an iterable of column labels (a list, or a dict
        whose keys are the labels).

    Returns
    -------
    tuple
        ``(p_index, g_index)``: ``p_index`` holds the keys of ``p_dict`` in
        insertion order; ``g_index`` holds every distinct column label exactly
        once.
    """
    p_index = list(p_dict)
    labels = set()
    for members in p_dict.values():
        labels.update(members)
    # Iterating a set of strings gives an order that changes from one
    # interpreter run to the next (hash randomisation), which would make the
    # column order of every derived matrix irreproducible. Sort for a stable
    # order; fall back to the raw set order only if the labels cannot be
    # compared with each other.
    try:
        g_index = sorted(labels)
    except TypeError:
        g_index = list(labels)
    return p_index,g_index

def set_M_values(M,m_index,g_name,p_name,value):
    """Write ``value`` into the cell of ``M`` addressed by two labels.

    Parameters
    ----------
    M : 2-D array modified in place.
    m_index : sequence whose item 0 is the list of row labels and whose item 1
        is the list of column labels.
    g_name : column label to locate in ``m_index[1]``.
    p_name : row label to locate in ``m_index[0]``.
    value : value stored in the addressed cell.

    Raises
    ------
    ValueError
        If either label is absent from its list (raised by ``list.index``).
    """
    row_labels, col_labels = m_index[0], m_index[1]
    M[row_labels.index(p_name), col_labels.index(g_name)] = value

def matrix_builder(query_orth,cog_dir,value):
    """Build the profile matrix for a query table and save it to disk.

    Steps:
      1. ``p_dict_builder`` maps each row label to its list of column labels;
         the dict is pickled to ``p_dict.pkl``.
      2. ``index_setter`` derives the row/column labels; they are pickled to
         ``m_index.pkl``.
      3. A zero matrix of shape ``(rows, columns)`` is created and ``value`` is
         written at every (row label, column label) pair present in the dict.
         The matrix is saved to ``p_matrix.npy``.

    All three files are written to the current working directory and are
    overwritten if they already exist.

    Parameters
    ----------
    query_orth : path of the query table (see ``iter_query_cog``).
    cog_dir : directory containing the COG files (see ``cog_2_g_list``).
    value : number stored in every cell that corresponds to a listed pair.

    Returns
    -------
    numpy.ndarray
        The filled matrix.
    """
    # Dictionary of row label -> list of column labels.
    p_dict = p_dict_builder(query_orth, cog_dir)
    with open('p_dict.pkl', 'wb') as file:
        pickle.dump(p_dict, file)

    # Row and column labels of the matrix.
    m_index = index_setter(p_dict)
    with open('m_index.pkl', 'wb') as file:
        pickle.dump(m_index, file)

    p_index, g_index = m_index[0], m_index[1]
    # Resolve each label to its position once. Calling list.index() for every
    # cell makes the fill loop scale with the size of the index lists.
    row_of = {name: pos for pos, name in enumerate(p_index)}
    col_of = {name: pos for pos, name in enumerate(g_index)}

    M = np.zeros((len(p_index), len(g_index)))
    for p_name, g_list in p_dict.items():
        i = row_of[p_name]
        for g_name in g_list:
            M[i, col_of[g_name]] = value

    with open('p_matrix.npy', 'wb') as f:
        np.save(f, M)
    return M

In [3]:
def score_p_dict_builder(score_file_dir):
    """Read every score file of a directory into a nested dictionary.

    Each entry of ``score_file_dir`` is opened as a ','-separated text file.
    For every line, the field at position 2 is used as the strain name and the
    field at position 3 (with trailing newline characters removed) as its
    score. Scores are kept as strings.

    The outer key is the file name with a trailing ``'_scores.txt'`` removed;
    file names that do not end with that suffix are used unchanged.

    Parameters
    ----------
    score_file_dir : directory containing the score files, with or without a
        trailing path separator.

    Returns
    -------
    dict
        ``{protein: {strain: score_string}}``.
    """
    suffix = '_scores.txt'
    dico_prot = {}
    for file_name in os.listdir(score_file_dir):
        # str.strip(suffix) would remove any of the suffix's *characters* from
        # both ends of the name (e.g. 'test_scores.txt' -> ''), so cut the
        # suffix off explicitly instead.
        if file_name.endswith(suffix):
            protein = file_name[:-len(suffix)]
        else:
            protein = file_name
        dico_strain = {}
        # os.path.join works whether or not the directory ends with a separator.
        with open(os.path.join(score_file_dir, file_name), 'r') as handle:
            for l in handle:
                buffer = l.split(',')
                strain = buffer[2]
                score = buffer[3].strip('\n')
                dico_strain[strain] = score
        dico_prot[protein] = dico_strain
    return dico_prot

In [4]:
def score_matrix_builder(score_file_dir,title):
    """Build the protein x strain score matrix, caching intermediate results.

    Two pickle files in the current working directory act as caches:
    ``p_dict_<title>.pkl`` (output of ``score_p_dict_builder``) and
    ``m_index_<title>.pkl`` (output of ``index_setter``). Each one is loaded if
    it exists and is created otherwise. The matrix itself is always rebuilt.

    Parameters
    ----------
    score_file_dir : directory of score files; only read when
        ``p_dict_<title>.pkl`` does not exist yet.
    title : string inserted into the cache file names.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(len(m_index[0]), len(m_index[1]))``; cells without a
        score stay at 0.

    Raises
    ------
    KeyError
        If a cached index does not contain a label present in the dictionary
        (for example, a stale ``m_index_<title>.pkl``).
    """
    p_dict_path = f'p_dict_{title}.pkl'
    m_index_path = f'm_index_{title}.pkl'

    # NOTE: pickle.load can execute arbitrary code; only reuse cache files
    # that were produced by this notebook.
    if not os.path.isfile(p_dict_path):
        p_dict = score_p_dict_builder(score_file_dir)
        with open(p_dict_path, 'wb') as file:
            pickle.dump(p_dict, file)
    else:
        with open(p_dict_path, 'rb') as file:
            p_dict = pickle.load(file)

    if not os.path.isfile(m_index_path):
        m_index = index_setter(p_dict)
        with open(m_index_path, 'wb') as file:
            pickle.dump(m_index, file)
    else:
        with open(m_index_path, 'rb') as file:
            m_index = pickle.load(file)

    p_index, g_index = m_index[0], m_index[1]
    # Resolve each label to its position once. Calling list.index() for every
    # cell made this loop scale with the length of both index lists.
    row_of = {name: pos for pos, name in enumerate(p_index)}
    col_of = {name: pos for pos, name in enumerate(g_index)}

    M = np.zeros((len(p_index), len(g_index)))
    for p_name, scores in p_dict.items():
        i = row_of[p_name]
        for g_name, score in scores.items():
            # Scores are stored as strings; numpy converts them on assignment.
            M[i, col_of[g_name]] = score
    return M

# KEGG Matrix

In [None]:
def iter_pathway(list_pathway):
    """Yield the first whitespace-separated token of each non-blank line.

    Parameters
    ----------
    list_pathway : path of the text file to read.

    Yields
    ------
    str
        The first token of every line that contains at least one
        non-whitespace character, in file order.
    """
    with open(list_pathway) as handle:
        for line in handle:
            tokens = line.split()
            # Blank or whitespace-only lines produce no tokens and are skipped.
            if not tokens:
                continue
            yield tokens[0]
            
def pathway_dict_builder(list_pathway,path):
    """Map every listed pathway to the lines of its own text file.

    The list file is ``path + list_pathway``; for each name it yields (see
    ``iter_pathway``) the file ``path + name + '.txt'`` is read and each of its
    lines, with trailing newline characters removed, is collected. ``path`` is
    concatenated as-is, so it must already end with a path separator.

    Parameters
    ----------
    list_pathway : name of the list file, relative to ``path``.
    path : directory prefix prepended to every file name.

    Returns
    -------
    dict
        ``{pathway: [line, ...]}``. A pathway listed several times has its
        lines appended once per occurrence.
    """
    members_by_pathway = {}
    listing_file = f'{path}{list_pathway}'
    for pathway_id in iter_pathway(listing_file):
        with open(f'{path}{pathway_id}.txt') as handle:
            members = [row.strip('\n') for row in handle]
        members_by_pathway.setdefault(pathway_id, []).extend(members)
    return members_by_pathway

In [None]:
# Directory prefix for the pathway list file and the per-pathway '<name>.txt'
# files. pathway_dict_builder concatenates this string directly with the file
# names, so it has to end with a path separator.
path = '/Users/mdupuy/Documents/my_project/Pathways/'
# List file (inside `path`); the first token of each non-blank line is used as
# a pathway name.
list_pathway = 'List_126_KEGG_Path_PA7.txt'
# Maps each pathway name to the lines read from its '<name>.txt' file.
pathway_dict = pathway_dict_builder(list_pathway,path)

In [None]:
# Binary membership matrix: one row per pathway (keys of pathway_dict), one
# column per distinct entry found in the pathway files.
m_index = index_setter(pathway_dict)
x = len(m_index[0])
y = len(m_index[1])
M = np.zeros((x,y))
# Resolve each label to its position once instead of calling list.index() for
# every cell.
row_of = {name: pos for pos, name in enumerate(m_index[0])}
col_of = {name: pos for pos, name in enumerate(m_index[1])}
for p_name,g_list in pathway_dict.items():
    i = row_of[p_name] # pathway (row) index
    for g_name in g_list:
        M[i, col_of[g_name]] = 1 # mark this entry as a member of the pathway

# Binary matrix

In [None]:
%%time
# ';'-separated ortholog-group table read by iter_query_cog (header line
# skipped, fields 4 and 5 used).
query_orth ='/Users/mdupuy/Documents/Stage/Pseudomonas_aeruginosa_PA7_119_ortholog_groups.csv'
# Directory containing the 'COG_<id>.csv' files read by cog_2_g_list.
cog_dir='/Users/mdupuy/Documents/Stage/All_COG_groups/'
# Build the matrix with 1 written at every listed pair. As a side effect this
# writes 'p_dict.pkl', 'm_index.pkl' and 'p_matrix.npy' to the current working
# directory.
matrix_builder(query_orth,cog_dir,1)

# Score Matrix

In [14]:
%%time
# Directory whose entries are read as score files by score_p_dict_builder.
score_file_dir = '/Users/mdupuy/Documents/Stage/Parser/Scores/'
# Build the score matrix. With title 'score', the intermediate dictionary and
# index are cached in 'p_dict_score.pkl' and 'm_index_score.pkl' and reused
# when those files already exist.
score_matrix = score_matrix_builder(score_file_dir,'score')

CPU times: user 6min 34s, sys: 4.69 s, total: 6min 39s
Wall time: 7min 29s


In [47]:
import matplotlib.pyplot as plt
def svd(M,threshold):
    '''
    Reduce the noise of a score profile matrix with a truncated SVD.

    Steps:
      1. every row is divided by its own maximum;
      2. the SVD of the result is computed, only the first ``threshold``
         singular values are kept (the others are set to 0) and the matrix is
         reconstructed;
      3. every column of the reconstruction is divided by its Euclidean norm.

    Rows whose maximum is exactly 0 and columns whose norm is exactly 0 are
    left at 0 rather than divided by zero, which would put NaN/inf into the
    matrix.

    Parameters
    ----------
    M : 2-D array-like of scores.
    threshold : int, number of leading singular values to keep.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``M``.

    Progress messages are printed after each step.
    '''
    M = np.asarray(M, dtype=float)
    M_xmax = np.amax(M, axis=1, keepdims=True)
    # `where` skips the rows that would be divided by zero; `out` supplies the
    # value (0) those rows keep.
    M_lnorm = np.divide(M, M_xmax, out=np.zeros_like(M), where=M_xmax != 0)
    print("first normalisation")
    u, s, vh = np.linalg.svd(M_lnorm, full_matrices=False)
    # Singular values are returned in descending order, so this keeps the
    # `threshold` largest components.
    s[threshold:] = 0.0
    P = np.dot(u * s, vh)
    print("svd")
    P_norm = np.linalg.norm(P, keepdims=True, axis=0)
    P_u = np.divide(P, P_norm, out=np.zeros_like(P), where=P_norm != 0)
    print("second normalisation")
    print("done")
    return P_u

In [56]:
def npp(M):
    """Normalise a score matrix: row scaling, reciprocal, column z-score.

    Steps:
      1. zero entries are replaced by the smallest non-zero entry of the
         matrix, so that the reciprocal below stays finite;
      2. every row is divided by its own maximum;
      3. the element-wise reciprocal is taken;
      4. every column of the transformed matrix is centred on its mean and
         divided by its population standard deviation (``ddof=0``).

    The computation uses the argument ``M`` (not a global variable) and works
    on a copy, so the caller's array is left unchanged.

    Parameters
    ----------
    M : 2-D array-like of scores containing at least one non-zero entry.

    Returns
    -------
    numpy.ndarray
        Z-scored matrix with the same shape as ``M``. A column whose
        transformed values are all equal has a standard deviation of 0 and
        yields NaN.

    Raises
    ------
    ValueError
        If ``M`` has no non-zero entry (``np.min`` of an empty selection).
    """
    # Float copy: keeps the input intact and makes the reciprocal a true
    # floating-point division.
    M = np.array(M, dtype=float)
    # Replace zeros to prevent artefacts.
    minval = np.min(M[np.nonzero(M)])
    M[M == 0] = minval
    # Normalise by size: divide each row by its maximum.
    M_xmax = np.amax(M, axis=1, keepdims=True)
    M_lnorm = np.divide(M, M_xmax)
    # Monotonic transformation.
    M_transf = np.reciprocal(M_lnorm)
    # Z-score per column. The mean and standard deviation come from the
    # transformed matrix, so they must be applied to that same matrix.
    M_ymean = np.mean(M_transf, axis=0, keepdims=True)
    M_ystd = np.std(M_transf, ddof=0, axis=0, keepdims=True)
    return np.divide(np.subtract(M_transf, M_ymean), M_ystd)

# Distance matrix

In [32]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
def distance_matrix_builder(coocurencies_matrix,metric,title=None):
    '''
    Compute (or reload) the square matrix of pairwise distances between rows.

    The metric used to calculate the distance can be:
    - 'hamming' for a binary matrix,
    - 'euclidean' and 'correlation' for a score matrix

    The result is cached as a pickle in the current working directory. Without
    ``title`` the cache file is ``<metric>_distance_matrix.pkl``; this name
    depends on the metric only, so a cache written for one input matrix is
    returned for any later call with the same metric. Pass ``title`` to keep a
    separate cache, ``<title>_<metric>_distance_matrix.pkl``, per input.

    Parameters
    ----------
    coocurencies_matrix : 2-D array, one observation per row (passed to
        ``scipy.spatial.distance.pdist``).
    metric : metric name passed to ``pdist``.
    title : optional string prefixed to the cache file name.

    Returns
    -------
    numpy.ndarray
        Symmetric matrix of shape ``(n_rows, n_rows)``.
    '''
    cache_name = f'{metric}_distance_matrix.pkl'
    if title is not None:
        cache_name = f'{title}_{cache_name}'

    if os.path.isfile(cache_name):
        # NOTE: pickle.load can execute arbitrary code; only reuse cache files
        # that were produced by this notebook.
        with open(cache_name, 'rb') as file:
            return pickle.load(file)

    # pdist returns the condensed form; squareform expands it to n x n.
    distance_matrix = squareform(pdist(coocurencies_matrix, metric=metric))
    with open(cache_name, 'wb') as f:
        pickle.dump(distance_matrix, f)
    return distance_matrix

# Test

In [58]:
# Normalise the score matrix with npp before computing distances.
M = npp(score_matrix)

In [60]:
# Pairwise Euclidean distances between the rows of the normalised matrix.
dist_matrix = distance_matrix_builder(M,'euclidean')
import pandas as pd
import pickle
# Row labels come from the index cached by score_matrix_builder with
# title 'score'.
with open('m_index_score.pkl','rb') as file:
    mynewlist = pickle.load(file)
    p_list = mynewlist[0]
# Label both axes of the square distance matrix with those row labels and
# save the frame.
df = pd.DataFrame(dist_matrix)
df.index = p_list
df.columns = p_list
df.to_pickle('test_npp_eu.pkl')