In [3]:
import numpy as np
import pickle
from tqdm.notebook import trange, tqdm

In [11]:
'''
This script is able to build a coocurence matrix from a list of COG of an organism.
It needs to take a csv file containing the list of the COG and the directory where the
COG files are stored.
It returns a numpy array.
'''
def iter_query_cog(query_orth):
    with open(query_orth) as csvfile:
        next(csvfile)
        for l in csvfile:
            buffer = l.split(';')
            yield([buffer[4],buffer[5]])

def cog_2_g_list(cog_id,cog_dir):
    file_path = f'{cog_dir}/COG_{cog_id}.csv'
    g_list = list()
    with open(file_path) as csvfile:
        next(csvfile)
        for l in csvfile:
            buffer = l.split(',')
            strain = buffer[1]
            g_list.append(strain)
    return g_list

def p_dict_builder(query_orth,cog_dir):
    p_dict={}
    i=0
    for cog_id,p_name in iter_query_cog(query_orth):
        g_list = cog_2_g_list(cog_id,cog_dir)
        p_dict[p_name] = g_list
        #i=i+1
        #if i>9:
            #break
    return p_dict

def index_setter(p_dict):
    p_index = list()
    g_index = set()
    for p_name,g_list in p_dict.items():
        p_index.append(p_name)
        g_index.update(g_list)
    g_index = list(g_index)
    return p_index,g_index

def set_M_values(M,m_index,g_name,p_name,value): 
    i = m_index[0].index(p_name) #protein index
    j = m_index[1].index(g_name) #genome index
    M[i,j] = value

def matrix_builder(query_orth,cog_dir,value):
    #generate a dictionnary of protein coocurences
    p_dict=p_dict_builder(query_orth,cog_dir)
    with open('p_dict.pkl', 'wb') as file:
        pickle.dump(p_dict, file)
    #generate the coocurences matrix index
    m_index=index_setter(p_dict)
    with open('m_index.pkl', 'wb') as file:
        pickle.dump(m_index, file)
    #generate the coocurences matrix
    x = len(m_index[0])
    y = len(m_index[1])
    M = np.zeros((x,y))
    for p_name,g_list in p_dict.items():
        for g_name in g_list:
            set_M_values(M,m_index,g_name,p_name,value)
    with open('p_matrix.npy', 'wb') as f:
        np.save(f, M)
    return M

In [12]:
def score_matrix_builder(score_file_dir):
    path = score_file_dir
    dico_prot={}
    for file in os.listdir(path):
        dico_strain={}
        protein = file.strip('_scores.txt')
        file_path = f'{path}{file}'
        dico_prot[protein]=dico_strain
        with open(file_path,'wb') as file:
            for l in file:
                buffer = l.split(',')
                strain = buffer[2]
                score = buffer[3]
                dico_strain[strain]=score

In [13]:
%%time
query_orth ='/Users/mdupuy/Documents/Stage/Pseudomonas_aeruginosa_PA7_119_ortholog_groups.csv'
cog_dir='/Users/mdupuy/Documents/Stage/All_COG_groups/'
matrix_builder(query_orth,cog_dir,1)

CPU times: user 3min 58s, sys: 5.03 s, total: 4min 3s
Wall time: 5min 30s


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [14]:
%%time
with open('p_dict.pkl','rb') as file:
    p_dict = pickle.load(file)
with open('m_index.pkl','rb') as file:
    m_index = pickle.load(file)
x = len(m_index[0])
y = len(m_index[1])
M = np.zeros((x,y))
for p_name,g_list in p_dict.items():
    for g_name in g_list:
        set_M_values(M,m_index,g_name,p_name,1)

CPU times: user 3min 39s, sys: 408 ms, total: 3min 40s
Wall time: 3min 40s


In [None]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
def hamming(coocurencies_matrix):
    hamming = pdist(coocurencies_matrix, metric='hamming')
    distance_matrix = squareform(hamming)
    with open('hd_matrix.pkl', 'wb') as f:
        pickle.dump(distance_matrix,f)
    return distance_matrix

In [None]:
d_M = hamming(M)