In [1]:
import os
import random

import numpy as np
import pandas as pd

# Préparation de la matrice de profil phylogénétique

La matrice de profil phylogénétique est réalisée en récupérant la liste des protéines, la
liste des espèces bactériennes et l'information de présence ou d'absence d'une protéine
chez une espèce donnée.

# Binary phylogenetic matrix

In [None]:
if not os.path.isfile('binary_phylogenetic_matrix.pkl'):
    # Build the binary presence/absence matrix only when no cached pickle exists.
    # NOTE(review): the loading cell reads 'data/binary_phylogenetic_matrix.pkl'
    # whereas this cell checks and writes the working directory - confirm which
    # location is intended.

    # Table of PA7 ortholog groups (semicolon-separated).
    file_path ='/Users/mdupuy/Documents/Stage/Pseudomonas_aeruginosa_PA7_119_ortholog_groups.csv'
    df = pd.read_csv(file_path, delimiter=';')
    # Map every locus tag to the strains listed in the COG file of its gene.
    dico = {}
    for row in df.itertuples():
        protein = row[6] #Locus tag
        gene_id = row[5] #PGD Gene ID (named gene_id so the builtin `id` is not shadowed)
        file_path = f'/Users/mdupuy/Documents/Stage/All_COG_groups/COG_{gene_id}.csv'
        dataFrame = pd.read_csv(file_path)
        strains = dataFrame['Strain'].to_list()
        dico.setdefault(protein,[]).extend(strains)
    # Invert the dictionary: strain -> locus tags found in that strain.
    dicoinv={}
    for keys, values in dico.items():
        for value in values:
            dicoinv.setdefault(value,[]).append(keys)
    # Binary profile matrix: locus tags as index, strains as columns.
    df = pd.DataFrame(data=0,index=list(dico.keys()),columns=list(dicoinv.keys()))
    # Fill one whole row per protein instead of testing every (row, column) cell;
    # dict.fromkeys drops repeated strains while keeping their order.
    for protein, strains in dico.items():
        df.loc[protein, list(dict.fromkeys(strains))] = 1
    # The original called an undefined `save_dataframe` helper (and never passed
    # it the frame); write the cache checked by the guard above directly.
    df.to_pickle('binary_phylogenetic_matrix.pkl')

In [2]:
# Load the binary presence/absence matrix from its pickle cache.
# NOTE(review): the build cell above checks and writes 'binary_phylogenetic_matrix.pkl'
# without the 'data/' prefix - confirm which location is intended.
b_phylo_matrix = pd.read_pickle('data/binary_phylogenetic_matrix.pkl')

Si besoin il est possible de "débinariser" la matrice en appliquant des poids grâce à la méthode suivante:

In [None]:
def weighted_phylogenetic_matrix(phylogenetic_matrix):
    """Return a weighted copy of a binary phylogenetic matrix.

    For every column, each entry equal to 1 is replaced by the column sum
    divided by the number of rows; every other value is left untouched.
    The input frame is not modified.

    Parameters
    ----------
    phylogenetic_matrix : pandas.DataFrame
        Matrix whose entries equal to 1 are to be re-weighted.

    Returns
    -------
    pandas.DataFrame
        The weighted matrix. It is also pickled once to
        'nonbinary_phylogenetic_matrix.pkl' in the working directory.
    """
    # Work on the argument (the original read a global `df`) and on a copy,
    # so that the caller's matrix stays binary.
    weighted = phylogenetic_matrix.copy()
    g_size = len(weighted.index)
    for column in weighted.columns:
        presence = weighted[column]
        score = np.sum(presence.to_numpy()) / g_size
        # Only cells equal to 1 receive the column score.
        weighted[column] = presence.where(presence != 1, score)
    # Write the cache once, after all columns are processed.
    weighted.to_pickle('nonbinary_phylogenetic_matrix.pkl')
    return weighted

In [None]:
# Apply the weighting function to the binary matrix loaded earlier.
w_phylo_matrix = weighted_phylogenetic_matrix(b_phylo_matrix)

# Score phylogenetic matrix

In [None]:
if not os.path.isfile('score_phylogenetic_matrix.pkl'):
    # Build the score matrix only when no cached pickle exists.
    path = '/Users/mdupuy/Documents/Stage/Parser/Scores/'
    suffix = '_scores.txt'
    # protein name -> {strain: score}
    dico_prot={}
    for file in os.listdir(path):
        dico_strain={}
        df = pd.read_csv(os.path.join(path, file), header=None)
        # str.strip('_scores.txt') removes any of those *characters* from both
        # ends of the name, not the suffix; cut the suffix explicitly instead.
        protein = file[:-len(suffix)] if file.endswith(suffix) else file
        dico_prot[protein]=dico_strain
        for row in df.itertuples():
            # itertuples() puts the row index at position 0, so these are the
            # third and fourth columns of the file.
            strain = row[3]
            score = row[4]
            dico_strain[strain]=score
    # Outer keys (proteins) become columns, inner keys (strains) become the index.
    df = pd.DataFrame(dico_prot)
    df.to_pickle('score_phylogenetic_matrix.pkl')

Une étape de prétraitement des matrices de score est nécessaire.

In [3]:
def npp(df):
    """Pre-process a score matrix and return the transformed frame.

    Steps, in order:
      1. missing values are replaced by 0;
      2. the smallest value above 0.01 in the whole frame is used as a floor:
         every value not strictly greater than it is set to that floor;
      3. each row is divided by its own maximum;
      4. the reciprocal of every value is taken;
      5. each column is standardised (mean 0, population standard deviation).
    """
    filled = df.fillna(0)
    # Floor: global minimum of the values above 0.01 (column minima first).
    floor = filled[filled > .01].min().min()
    floored = filled.where(filled > floor, floor)
    # Row-wise scaling by the row maximum.
    row_scaled = floored.divide(floored.max(axis=1), axis=0)
    inverted = 1 / row_scaled
    # Column-wise z-score.
    centred = inverted - inverted.mean(axis=0)
    return centred / inverted.std(ddof=0, axis=0)

Si la matrice utilisée est une matrice de score, il est possible d'utiliser la décomposition en valeurs singulières :

In [4]:
def svd(df,threshold):
    """Denoise a score matrix by truncated SVD and return unit-length rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix; missing values are treated as 0.
    threshold : int or float
        Percentage of singular values to keep (the largest ones); the
        remaining singular values are set to 0 before reconstruction.

    Returns
    -------
    pandas.DataFrame
        Reconstructed matrix with the same index and columns as `df`, each
        row divided by its Euclidean norm.
    """
    df = df.fillna(0)
    # Normalise each row by the maximum score of that row.
    df = df.divide(df.max(axis=1),axis=0)
    print("first normalisation")
    u, s, vh = np.linalg.svd(df, full_matrices=False)
    # Number of singular values kept. A plain int is required for slicing:
    # np.round_ no longer exists in NumPy 2.0 and returned a float (invalid as
    # a slice bound) whenever `threshold` was a float.
    threshold = int(len(s)*threshold//100)
    print(threshold)
    s[threshold:]=0
    s = np.diag(s)
    P = u.dot(s.dot(vh))
    print("svd")
    # Convert each reconstructed profile (row) to a unit vector.
    P_norm = np.linalg.norm(P,axis=1,keepdims=True)
    P_u = P/P_norm
    print("second normalisation")
    # Re-attach the original labels; no correlation is computed here.
    df = pd.DataFrame(P_u,index=df.index,columns=df.columns)
    print("done")
    return df

In [5]:
# Load the cached score matrix (same file name as the one written by the build cell above).
score_matrix = pd.read_pickle("score_phylogenetic_matrix.pkl")

In [7]:
# Pre-process the raw score matrix with npp().
npp_phylo_matrix = npp(score_matrix)

# 1 - Réalisation de calcul de distance, corrélation et similarité

À partir de notre matrice de profil phylogénétique, nous pouvons calculer la distance, la corrélation ou la similarité entre deux protéines.

In [8]:
from scipy.spatial import distance_matrix as dm
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.spatial.distance import cdist

In [13]:
def phylo_to_distance_matrix(phylogenetic_matrix, p):
    """Return the pairwise Minkowski distance matrix between the rows.

    Parameters
    ----------
    phylogenetic_matrix : pandas.DataFrame
        One profile per row.
    p : float
        Order of the Minkowski norm passed to scipy's `distance_matrix`
        (1 gives the Manhattan distance, 2 the Euclidean distance).

    Returns
    -------
    pandas.DataFrame
        Square frame indexed, on both axes, by the index of the input.
    """
    # The order `p` fully selects the metric, so no method name is needed
    # (the original computed an unused, misspelled one).
    labels = phylogenetic_matrix.index
    distances = dm(phylogenetic_matrix, phylogenetic_matrix, p)
    return pd.DataFrame(distances, index=labels, columns=labels)

In [10]:
def hamming(phylogenetic_matrix_path):
    """Compute, save and return the pairwise Hamming distances between rows.

    The matrix is read from the pickle at `phylogenetic_matrix_path`; the
    resulting square frame (indexed by the row labels on both axes) is written
    to 'hamming_distance_matrix.pkl' in the working directory and returned.
    """
    profiles = pd.read_pickle(f'{phylogenetic_matrix_path}')
    labels = profiles.index
    # pdist gives the condensed form; squareform expands it to a full matrix.
    condensed = pdist(profiles, metric='hamming')
    distance_df = pd.DataFrame(squareform(condensed), index=labels, columns=labels)
    distance_df.to_pickle(f'hamming_distance_matrix.pkl')
    return distance_df

In [14]:
# Euclidean (p=2) distances between the columns of score_matrix (hence the transpose).
# NOTE(review): the output displayed below is almost entirely NaN, presumably because
# score_matrix still contains missing values - consider filling them first; confirm.
distance_matrix = phylo_to_distance_matrix(score_matrix.T,2)

In [15]:
# Display the distance matrix computed in the previous cell.
distance_matrix

Unnamed: 0,PSPA7_3376,PSPA7_2475,PSPA7_3669,PSPA7_6369,PSPA7_1922,PSPA7_2608,PSPA7_3414,PSPA7_6114,PSPA7_0943,PSPA7_2317,...,PSPA7_5214,PSPA7_1508,PSPA7_1986,PSPA7_0714,PSPA7_4275,PSPA7_5069,PSPA7_1775,PSPA7_0569,PSPA7_5576,PSPA7_0076
PSPA7_3376,,,,,,,,,,,...,,,,,,,,,,
PSPA7_2475,,,,,,,,,,,...,,,,,,,,,,
PSPA7_3669,,,,,,,,,,,...,,,,,,,,,,
PSPA7_6369,,,,,,,,,,,...,,,,,,,,,,
PSPA7_1922,,,,,0.0,,,56505.033513,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PSPA7_5069,,,,,,,,,,,...,,,,,,,,,,
PSPA7_1775,,,,,,,,,,,...,,,,,,,,,,
PSPA7_0569,,,,,,,,,,,...,,,,,,,,,,
PSPA7_5576,,,,,,,,,,,...,,,,,,,,,,


# 2 - Définition de la matrice d'interaction à partir d'un seuil

In [None]:
def adjency(df, threshold):
    """Binarise a distance matrix into an adjacency matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric matrix (e.g. pairwise distances).
    threshold : float
        Cells with a value lower than or equal to it become 1, all other
        cells (including NaN, for which the comparison is false) become 0.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 frame with the same index and columns as `df`.
    """
    # TODO: allow choosing the comparison direction (>= for similarity or
    # correlation data, <= for distances).
    # Vectorised comparison instead of DataFrame.applymap, which is deprecated
    # in recent pandas and evaluates a Python lambda per cell.
    return (df <= threshold).astype(int)

In [None]:
# Load the Hamming distance matrix from its pickle (same file name as the one written by hamming()).
matrix = pd.read_pickle('hamming_distance_matrix.pkl')
# Binarise it with a threshold of 0.
predicted_adjency = adjency(matrix, 0)

# 3 - Comparaison de la matrice d'interaction avec KEGG

Premièrement, nous devons récupérer la liste des protéines présentes dans chaque pathway.

In [None]:
from bioservices import KEGG
# KEGG client from bioservices, created with verbose output disabled.
k = KEGG(verbose=False)

In [None]:
# For every pathway, fetch its KEGG entry and write one line per entry of its
# 'GENE' field to '<pathway>.txt' in the working directory. Pathways whose
# parsed entry has no 'GENE' key get a placeholder line and are recorded.
# NOTE(review): pathway_list is not defined in any visible cell - it must be
# created before this cell can run on a fresh kernel.
problematic_pathway = []
for path in pathway_list:
    res = k.get(f"path:{path}")
    d = k.parse(res)
    with open(f"{path}.txt","w") as file:
        if 'GENE' not in d.keys():
            file.write('gene not found')
            problematic_pathway.append(path)
        else:
            for gene in d['GENE']:
                file.write(f'{gene}\n')

# Test de précision

Obtenir la liste de toutes les protéines contenues dans tous les pathways

In [None]:
# Build a dictionary mapping each pathway (one file per pathway in `path`) to
# the lines of its file that mention a PSPA7 locus.
path = '/Users/mdupuy/Documents/my_project/Pathways'
dico = {}

for file in os.listdir(path):
    # str.strip('.txt') removes any leading/trailing '.', 't' or 'x' characters
    # rather than the extension, so cut the extension explicitly.
    pathway = file[:-len('.txt')] if file.endswith('.txt') else file
    # A separate name for the handle keeps `file` bound to the file name.
    with open(os.path.join(path, file)) as handle:
        gene = [line.strip('\n') for line in handle if 'PSPA7' in line]
    dico.setdefault(pathway,[]).extend(gene)

#observed_adjency = pd.DataFrame(data=0,index=gene,columns=gene)

In [None]:
# Invert the pathway -> genes dictionary into gene -> pathways containing it.
dicoinv = {}
for pathway_name, members in dico.items():
    for member in members:
        if member not in dicoinv:
            dicoinv[member] = []
        dicoinv[member].append(pathway_name)

Former une matrice d'adjacence observée

In [None]:
# Observed adjacency: two genes are linked (1) when they share at least one pathway.
# Sorting gives a reproducible row/column order (a set has no stable order
# across kernel restarts).
gene = sorted(dicoinv)
# Convert each pathway list to a set once instead of once per pair.
pathway_sets = [set(dicoinv[name]) for name in gene]
adjacency = np.zeros((len(gene), len(gene)), dtype=int)
for i, first in enumerate(pathway_sets):
    # The relation is symmetric: test each unordered pair once and mirror it.
    for j in range(i, len(gene)):
        if not first.isdisjoint(pathway_sets[j]):
            adjacency[i, j] = 1
            adjacency[j, i] = 1
observed_adjency = pd.DataFrame(adjacency, index=gene, columns=gene)

In [None]:
# Persist the observed adjacency matrix; df_intersect() and courbe_rc() read it back from this path.
observed_adjency.to_pickle('data/observed_adjency_matrix.pkl')

In [None]:
def df_intersect(df):
    """Restrict a square matrix to the labels shared with the observed adjacency matrix.

    The observed matrix is read from 'data/observed_adjency_matrix.pkl'. The
    input is reindexed, on both axes, to the intersection of its index with
    the observed index, and returned as a dictionary in pandas 'split'
    orientation (keys 'index', 'columns' and 'data').
    """
    print('intersecting')
    reference = pd.read_pickle('data/observed_adjency_matrix.pkl')
    shared = df.index.intersection(reference.index)
    restricted = df.reindex(index=shared, columns=shared)
    dico = restricted.to_dict('split')
    print('intersected')
    return dico

In [None]:
def get_couples(index_list):
    """Yield every unordered pair of elements of `index_list` as a tuple.

    Pairs are produced as (earlier, later) following the order of the
    sequence; an element is never paired with itself.
    """
    for position, first in enumerate(index_list):
        for second in index_list[position + 1:]:
            yield (first, second)

def couple_sorter(df):
    """Group every unordered pair of labels by its distance, in ascending order.

    Parameters
    ----------
    df : pandas.DataFrame or dict
        Square distance matrix, either as a DataFrame or as the dictionary
        produced by ``DataFrame.to_dict('split')`` (the form returned by
        df_intersect, which the original could not handle).

    Returns
    -------
    dict
        Maps each distance to the list of (label_i, label_j) pairs having that
        distance, with keys inserted in ascending order.
        NOTE(review): NaN distances are not treated specially - confirm the
        input has none.
    """
    print('sorting')
    dico = df if isinstance(df, dict) else df.to_dict('split')
    labels = list(dico['index'])
    rows = dico['data']
    # Look columns up by label so the value read is the same as df.loc[row, column].
    column_position = {label: position for position, label in enumerate(dico['columns'])}
    couple_dist = {}
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            dist = rows[i][column_position[labels[j]]]
            couple_dist.setdefault(dist, []).append((labels[i], labels[j]))
    couple_dist_ord = dict(sorted(couple_dist.items(), reverse=False))
    print('sorted')
    return couple_dist_ord

In [None]:
def courbe_rc(dico,shuffle,step=1000):
    """Return the cumulative precision measured every `step` couples.

    Parameters
    ----------
    dico : dict
        Maps a distance to a list of (label_i, label_j) couples, in the order
        in which the groups must be evaluated (as produced by couple_sorter).
    shuffle : bool
        When true, the groups of couples are visited in random order.
    step : int, optional
        Number of couples between two precision measurements (default 1000).

    Returns
    -------
    list of float
        TP / (TP + FP) after each complete window of `step` couples, where a
        couple counts as a true positive when its cell in the observed
        adjacency matrix ('data/observed_adjency_matrix.pkl') equals 1.
        Couples after the last complete window are not reported.
    """
    if step < 1:
        raise ValueError('step must be a positive integer')
    print('ploting')
    observed_adjency = pd.read_pickle('data/observed_adjency_matrix.pkl')
    P_list = []
    TP = 0
    FP = 0
    value_list = list(dico.values())
    if shuffle:
        random.shuffle(value_list)
    for value in value_list:
        for couple in value:
            if (observed_adjency.loc[couple[0],couple[1]]) == 1:
                TP = TP+1
            else:
                FP = FP+1
            # TP + FP is the number of couples seen so far. The original counter
            # started at 1, so it recorded a point every 999 couples instead of 1000.
            if (TP + FP) % step == 0:
                P_list.append(TP/(TP+FP))
    print('ploted')
    return P_list

In [None]:
def benchmark(list_path):
    """Compute a precision curve for each pickled distance matrix.

    Parameters
    ----------
    list_path : iterable of str
        Paths of pickled distance matrices.

    Returns
    -------
    dict
        Maps each path to the precision list returned by courbe_rc for the
        couples sorted by distance. When at least one path was processed, the
        extra key 'rand' holds the curve obtained with shuffled groups of
        couples from the last path. Empty when `list_path` is empty.
    """
    dico_rc = {}
    couple_dist = None
    for path in list_path:
        distance_matrix = pd.read_pickle(path)
        intersect_matrix = df_intersect(distance_matrix)
        # df_intersect returns a 'split' dictionary while couple_sorter works
        # on a DataFrame: rebuild the frame before sorting.
        if isinstance(intersect_matrix, dict):
            intersect_matrix = pd.DataFrame(
                intersect_matrix['data'],
                index=intersect_matrix['index'],
                columns=intersect_matrix['columns'],
            )
        couple_dist = couple_sorter(intersect_matrix)
        dico_rc[path] = courbe_rc(couple_dist, False)
    # Random baseline; skipped when there was nothing to process (the original
    # failed on an unbound name in that case).
    if couple_dist is not None:
        dico_rc['rand'] = courbe_rc(couple_dist, True)
    return dico_rc

Effectuer le test de comparaison entre les matrices d'adjacence prédite et observée

In [None]:
# Compare, for a range of distance thresholds, the predicted adjacency matrix
# with the observed one and collect recall/precision and FPR/TPR per threshold.
observed_adjency = pd.read_pickle('data/observed_adjency_matrix.pkl')

# Thresholds: the deciles (0 %, 10 %, ..., 100 %) of all distances.
# NOTE(review): np.quantile returns NaN if distance_matrix contains NaN - confirm
# the matrix has no missing values.
quantiles = [quantile/10 for quantile in range(0,11,1)]
quantiles = np.quantile(distance_matrix, quantiles)
print(quantiles)

dico_rc = {}   # threshold -> (Recall, Precision)
dico_ROC = {}  # threshold -> (FPR, TPR)
# The observed labels do not depend on the threshold: take the strict upper
# triangle once.
observed = observed_adjency.to_numpy()
observed = observed[np.triu_indices_from(observed,1)]
for threshold in quantiles:
    # Predict the adjacency matrix for this threshold and align it on the observed one.
    predicted_adjency = adjency(distance_matrix, threshold)
    predicted_adjency = predicted_adjency.reindex_like(observed_adjency)
    predicted = predicted_adjency.to_numpy()
    predicted = predicted[np.triu_indices_from(predicted,1)]
    # Build the confusion matrix.
    confusion_matrix = pd.crosstab(predicted,observed,rownames=['predicted'], colnames=['observed'])
    print(confusion_matrix)
    # Force a full 2x2 table: a class that never occurs gets a count of 0.
    # (In the original chain of `if`s the `else` belonged to the last test only,
    # so a missing row or column raised KeyError or reused stale counts.)
    confusion_matrix = confusion_matrix.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    TP = confusion_matrix.loc[1,1]
    FP = confusion_matrix.loc[1,0]
    TN = confusion_matrix.loc[0,0]
    FN = confusion_matrix.loc[0,1]
    print(f'TP:{TP}, FP:{FP}, TN:{TN},FN:{FN}')
    # Skip thresholds for which a rate is undefined, instead of hiding every
    # possible error behind a bare `except`.
    if TP + FN == 0 or TP + FP == 0 or FP + TN == 0:
        continue
    Recall = TP/(TP+FN)
    Precision = TP/(TP+FP)
    TPR = TP/(TP+FN)
    FPR = FP/(FP+TN)
    dico_rc[threshold] = (Recall,Precision)
    dico_ROC[threshold] = (FPR,TPR)
    print(f'threshold: {threshold} R & P: {dico_rc[threshold]}')
    print(f'threshold: {threshold} FPR & TPR: {dico_ROC[threshold]}')

In [None]:
# Maps a threshold to its (FPR, TPR) pair.
# NOTE(review): nothing in the visible cells fills this dictionary; the evaluation
# loop above stores its results in `dico_ROC` (different capitalisation).
dico_roc = {} #key threshold, value (FPR,TPR)

In [None]:
# Build the predicted adjacency matrix from the Hamming distances with a
# threshold of 0, then align its rows and columns on the observed adjacency matrix.
matrix = pd.read_pickle('hamming_distance_matrix.pkl')
predicted_adjency = adjency(matrix, 0)
predicted_adjency = predicted_adjency.reindex_like(observed_adjency)

In [None]:
# NOTE(review): predicted_adjency was already reindexed like observed_adjency in the
# previous cell, so this second reindex looks redundant - confirm it is needed.
sub_matrix = predicted_adjency.reindex_like(observed_adjency)