In [2]:
import pandas as pd
import numpy as np
import sklearn as skl 
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from scipy.stats import ttest_ind
from sklearn.decomposition import PCA
from Bio import SeqIO

# Inputs

In [3]:
# import clumps counts of eff, non-eff dataset
clumps_in_eff = pd.read_csv("../demo/phyto_VAL_MOnSTER_results/clustering_and_CLUMPs/df_CLUMPs_in_eff_val.tsv",
                           sep=",")
clumps_in_noneff = pd.read_csv("../demo/phyto_MOnSTER_results/clustering_and_CLUMPs/df_CLUMPs_in_non_eff_val.tsv",
                              sep=",")


FileNotFoundError: [Errno 2] No such file or directory: '../demo/phyto_VAL_MOnSTER_results/clustering_and_CLUMPs/df_CLUMPs_in_eff_val.tsv'

# CLUMPs EFFECTORS

In [5]:


# clumps per eff 
clumps_per_eff = {}
for p in clumps_in_eff["seq_id"]:
    clumps_per_eff[p] = [0] * len(np.unique(clumps_in_eff["CLUMP"]))
    act_df = clumps_in_eff[clumps_in_eff["seq_id"] == p]
    act_index = list(act_df.index)
    i = 0
    for c in act_df["CLUMP"]:
        clumps_per_eff[p][c] = act_df["0"].loc[act_index[i]]
        i += 1
clumps_feature_eff = pd.DataFrame(clumps_per_eff).transpose()
clumps_feature_eff.columns = [f"CLUMP{c}" for c in clumps_feature_eff]
clumps_feature_eff["name"] = clumps_feature_eff.index
clumps_feature_eff.to_csv("../demo/phyto_MOnSTER_results/clustering_and_CLUMPs/CLUMPs_as_feature_eff.tsv",
                             sep="\t",
                             index=False)
clumps_feature_eff

Unnamed: 0,CLUMP0,CLUMP1,CLUMP2,CLUMP3,CLUMP4,CLUMP5,CLUMP6,CLUMP7,CLUMP8,CLUMP9,CLUMP10,name
SAP11_A0A7M3UQ93,7,0,1,1,1,2,0,1,0,0,0,SAP11_A0A7M3UQ93
SAP11_A0A895HT12,7,0,1,1,1,2,0,1,0,0,0,SAP11_A0A895HT12
SAP11_A0A1V0PKN0,7,0,1,1,1,2,0,1,0,0,0,SAP11_A0A1V0PKN0
SAP11_A0A4Y5R0G3,7,0,1,1,1,2,0,1,0,0,0,SAP11_A0A4Y5R0G3
SAP11_A0A7H9SKF2,7,0,1,1,1,2,0,1,0,0,0,SAP11_A0A7H9SKF2
...,...,...,...,...,...,...,...,...,...,...,...,...
PHYL1_A0A0A8JCY8,0,1,1,0,0,1,0,0,1,1,0,PHYL1_A0A0A8JCY8
PHYL1_A0A0A8JCY1,1,1,1,1,0,1,0,0,1,1,0,PHYL1_A0A0A8JCY1
PHYL1_A0A0A8JCG5,1,1,1,1,0,1,0,0,1,1,0,PHYL1_A0A0A8JCG5
TENGU_Q2NJV8,1,0,0,0,0,0,0,0,1,0,0,TENGU_Q2NJV8


# CLUMPs NON-EFFECTORS

In [6]:
# clumps per non eff 
clumps_per_noneff = {}
for p in clumps_in_noneff["seq_id"]:
    clumps_per_noneff[p] = [0] * len(np.unique(clumps_in_eff["CLUMP"]))
    act_df = clumps_in_noneff[clumps_in_noneff["seq_id"] == p]
    act_index = list(act_df.index)
    i = 0
    for c in act_df["CLUMP"]:
        clumps_per_noneff[p][c] = act_df["0"].loc[act_index[i]]
        i += 1
clumps_feature_noneff = pd.DataFrame(clumps_per_noneff).transpose()
clumps_feature_noneff.columns = [f"CLUMP{c}" for c in clumps_feature_noneff]
clumps_feature_noneff["name"] = [ i.split("|")[1] for i in list(clumps_feature_noneff.index)]
clumps_feature_noneff.to_csv("../demo/phyto_MOnSTER_results/clustering_and_CLUMPs/CLUMPs_as_feature_noneff.tsv",
                             sep="\t",
                             index=False)
clumps_feature_noneff

Unnamed: 0,CLUMP0,CLUMP1,CLUMP2,CLUMP3,CLUMP4,CLUMP5,CLUMP6,CLUMP7,CLUMP8,CLUMP9,CLUMP10,name
sp|B3QZH0|RPOB_PHYMT,7,0,0,1,2,2,1,1,0,0,0,B3QZH0
sp|Q2NJZ2|SYFB_AYWBP,0,0,0,4,2,6,1,0,0,0,0,Q2NJZ2
sp|Q2NJ16|RPOC_AYWBP,5,0,0,2,1,5,0,1,2,0,0,Q2NJ16
sp|B3R0L6|DNAA_PHYMT,5,0,1,0,0,1,0,2,0,0,0,B3R0L6
sp|Q2NJ15|RPOB_AYWBP,5,0,0,0,2,2,1,0,2,0,0,Q2NJ15
...,...,...,...,...,...,...,...,...,...,...,...,...
sp|B1VA77|RRF_PHYAS,0,0,0,1,0,0,0,0,0,0,0,B1VA77
sp|Q2NIM2|RL31_AYWBP,0,0,0,1,0,0,0,0,0,0,0,Q2NIM2
sp|Q2NIW6|RL5_AYWBP,0,0,0,1,0,0,0,0,0,0,0,Q2NIW6
sp|Q2NIM3|RL28_AYWBP,0,0,0,1,0,0,0,0,0,0,0,Q2NIM3


# CLUMPs EFF + NON EFF

In [28]:
clumps_feature_eff["name"] = ["effector"] * len(clumps_feature_eff)
clumps_feature_noneff["name"] = ["non_effector"] * len(clumps_feature_noneff)
clumps_feature_eff_noneff = pd.concat([clumps_feature_eff, clumps_feature_noneff], axis=0, join="inner")
clumps_feature_eff_noneff.reset_index(drop=True, inplace=True)


# CLUMPs VALIDATION SET 

In [22]:
# training list of motifs
mot_train = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/MONSTER_PROMOCA_FEM_Giulia/MOnSTER/data/datasets/phyto_motifs_no_prof.txt",
                       header=None)
mot_val = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_dataset_validazione20230117/val_eff_motifs20230117.txt",
                       header=None)

# obtain the training motifs in the validation set
intersect_train_val = list(set(list(mot_train[0])).intersection(list(mot_val[0])))
motifs_train_to_clump = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/MONSTER_PROMOCA_FEM_Giulia/MOnSTER/demo/phyto_MOnSTER_results/clustering_and_CLUMPs/df_motifs_CLUMPs.tsv",
                                   sep=",", header=0)

# motifs of training set present in validation set and corresponding training clump
val_clump_df = motifs_train_to_clump[motifs_train_to_clump["motif"].isin(intersect_train_val)]
# print(val_clump_df)


fasta_val_eff = SeqIO.parse("/home/giulia/Workspace/PhytoPhD/effectors_analysis/dataset_validazione20221110/uniprot_eff_putative_eff_TRULY_NEVER_SEEN_RF20221115.fasta",
                           "fasta")
fasta_val_noneff = SeqIO.parse("/home/giulia/Workspace/PhytoPhD/effectors_analysis/dataset_validazione20221110/val20221109_sel_neg.fasta",
                              "fasta")
fasta_val_neg_ids = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/dataset_validazione20221110/subset100_val20221109_ids_random.txt",
                           header=None)

# select 100 non effector proteins from ids seleted before 
noneff_val_seq = {}

for record in fasta_val_noneff:
    if record.id in list(fasta_val_neg_ids[0]):
        noneff_val_seq[record.id] = record.seq
    else:
        pass
    
# parse prosite to count occurrences of each clump 
prosite_val_eff = SeqIO.parse("/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_dataset_validazione20230117/prosite_val_eff_truly_never_seen20221115.fasta",
                         "fasta")
tmp_record_id = ""
occ_val_eff = {}
val_eff_number = 0
for record in prosite_val_eff:
    val_eff_number += 1
    if tmp_record_id != str(record.id.split("/")[0]):
        occ_val_eff[tmp_record_id] = {}
    else:
        if str(record.seq) not in list(occ_val_eff[tmp_record_id].keys()):
            occ_val_eff[tmp_record_id][str(record.seq)] = 1
        else:
            occ_val_eff[tmp_record_id][str(record.seq)] += 1

    tmp_record_id = str(record.id.split("/")[0])




clumps_occ_val = dict(zip(["CLUMP0", 'CLUMP1', 'CLUMP2', 'CLUMP3', 'CLUMP4', 'CLUMP5', 'CLUMP6',
       'CLUMP7', 'CLUMP8', 'CLUMP9', 'CLUMP10'], [[0]* val_eff_number] * 11))
# print(clumps_occ_val)
    
print(occ_val_eff)

KeyError: 'tr|A0A1Q1NH89|A0A1Q1NH89_9MOLU'

In [81]:

# searching in the first df the val motifs
# and add a column for each proteins 

# training clumps and the motifs belonging to it 
# mot_train = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/MONSTER_PROMOCA_FEM_Giulia/MOnSTER/data/datasets/phyto_motifs_no_prof.txt",
#                        header=None)
motifs_train_to_clump = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/MONSTER_PROMOCA_FEM_Giulia/MOnSTER/demo/phyto_MOnSTER_results/clustering_and_CLUMPs/df_motifs_CLUMPs.tsv",
                                   sep=",", header=0)
# validation motifs 
mot_val = pd.read_csv("/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_dataset_validazione20230117/val_eff_motifs20230117.txt",
                       header=None)

# intersection between the training motifs list and the validation motifs 
intersect_train_val = list(set(list(motifs_train_to_clump["motif"])).intersection(list(mot_val[0])))
# motifs of training set present in validation set and corresponding training clump
val_clump_df = motifs_train_to_clump[motifs_train_to_clump["motif"].isin(intersect_train_val)]

print(val_clump_df)

prosite_val_eff = SeqIO.parse("/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_dataset_validazione20230117/prosite_val_eff_truly_never_seen20221115.fasta",
                         "fasta")

# a list for the motifs, a list for the validation set sequence id associated with the motif
# and a list for the belonging clump of the motif 
motifs = []
seq_ids = []
clumps = []
for record in prosite_val_eff:
    if str(record.seq) in list(val_clump_df["motif"]):
        clumps.append(val_clump_df["CLUMP"].iloc[list(val_clump_df["motif"]).index(str(record.seq))])
        motifs.append(str(record.seq))
        seq_ids.append(str(record.id))
        
# create the DF like in MOnSTER 
df_clumps_in_val = pd.DataFrame({"motif": motifs, "CLUMP": clumps, "seq_id": seq_ids})
print(df_clumps_in_val)

# count occurrences of clumps in each validation set protein
clumps_in_valset = df_clumps_in_val[["CLUMP", "seq_id"]].value_counts().reset_index()
print(clumps_in_valset)

     Unnamed: 0     motif  CLUMP
1             1       TLK      1
4             4      TLEE      3
6             6       SLK      1
9             9      NFTI      6
11           11      SDPE      8
12           12      TESE      4
14           14      NYSE      9
15           15      TLKD      1
16           16  RDDEDDKY     10
18           18      SEKE      4
21           21      NSTR      7
24           24      STRD      7
25           25      TNID     11
28           28      SITD     11
30           30       TEK      4
33           33       SSK      2
35           35    GLQGSI     12
37           37      NNSV     11
41           41      NKTL     11
43           43      SNND      7
46           46      SSSE      2
47           47      TEKE      4
48           48      NITN     11
49           49      KKGS      8
50           50      TIEE      3
54           54       TFK      9
59           59      NATK      8
62           62       STK      2
63           63    GTSENQ      8
68        

In [94]:
# clumps per validation set
clumps_per_valset = {}


for p in list(clumps_in_valset["seq_id"]):
    clumps_per_valset[p] = [0] * len(np.unique(list(motifs_train_to_clump["CLUMP"])))
    act_df = clumps_in_valset[clumps_in_valset["seq_id"] == p]
    act_index = list(act_df.index)
    i = 0
    for c in act_df["CLUMP"]:
        clumps_per_valset[p][c] = act_df[0].loc[act_index[i]]
        i += 1
clumps_feature_valset = pd.DataFrame(clumps_per_valset).transpose()
clumps_feature_valset.columns = [f"CLUMP{c}" for c in clumps_feature_valset]
clumps_feature_valset["name"] = [ i.split("|")[1] for i in list(clumps_feature_valset.index)]
# clumps_feature_valset.to_csv("../demo/phyto_MOnSTER_results/clustering_and_CLUMPs/CLUMPs_as_feature_valset.tsv",
#                              sep="\t",
#                              index=False)
clumps_feature_valset

Unnamed: 0,CLUMP0,CLUMP1,CLUMP2,CLUMP3,CLUMP4,CLUMP5,CLUMP6,CLUMP7,CLUMP8,CLUMP9,CLUMP10,CLUMP11,CLUMP12,name
tr|A0A1S9LYK0|A0A1S9LYK0_9MOLU/106-113,0,0,0,0,0,0,0,0,0,0,2,0,0,A0A1S9LYK0
tr|A0A2S8NUU7|A0A2S8NUU7_9MOLU/104-111,0,0,0,0,0,0,0,0,0,0,2,0,0,A0A2S8NUU7
tr|A0A660HNE9|A0A660HNE9_ZIZJU/105-112,0,0,0,0,0,0,0,0,0,0,2,0,0,A0A660HNE9
tr|A0A1Q1NH89|A0A1Q1NH89_9MOLU/86-88,0,1,0,0,0,0,0,0,0,0,0,0,0,A0A1Q1NH89
tr|A0A1Q1NHA6|A0A1Q1NHA6_9MOLU/52-55,0,0,0,0,0,1,0,0,0,0,0,0,0,A0A1Q1NHA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tr|A0A421NYW6|A0A421NYW6_9MOLU/202-205,0,0,0,0,1,0,0,0,0,0,0,0,0,A0A421NYW6
tr|A0A5N5XS43|A0A5N5XS43_9MOLU/48-50,0,0,0,0,1,0,0,0,0,0,0,0,0,A0A5N5XS43
tr|A0A5N5XS43|A0A5N5XS43_9MOLU/48-51,0,0,0,0,1,0,0,0,0,0,0,0,0,A0A5N5XS43
tr|A0A5N5XS43|A0A5N5XS43_9MOLU/55-57,0,0,0,0,1,0,0,0,0,0,0,0,0,A0A5N5XS43


## CLUMPs EXPLORATORY ANALYSIS

### CLUMPs distributions btw EFFECTORS and NON-EFFECTORS

In [55]:
# Barplots of means per clumps occurrences in eff and non-eff
fig = go.Figure()
clumps_occurrences_eff = []
clumps_occurrences_noneff = []
sig_clumps_diff = {}
for c in list(clumps_feature_eff.columns)[:-1]:
    clumps_occurrences_eff.append(np.mean(clumps_feature_eff[c]))
    clumps_occurrences_noneff.append(np.mean(clumps_feature_noneff[c]))
    tt = ttest_ind(clumps_feature_eff[c], clumps_feature_noneff[c])[1]
    if tt <= 0.05:
        sig_clumps_diff[c] = tt 
    else:
        pass
fig.add_trace(go.Bar(name="effectors", x=list(range(0, 11)), y=clumps_occurrences_eff))
fig.add_trace(go.Bar(name="non-effectors", x=list(range(0, 11)), y=clumps_occurrences_noneff))
print(sig_clumps_diff)
print(clumps_feature_eff['CLUMP1'].sum())
print(clumps_occurrences_eff)
print(clumps_occurrences_noneff)
fig.show()

{'CLUMP0': 3.2516019170495705e-06, 'CLUMP1': 3.01585651732876e-22, 'CLUMP2': 5.703845988809711e-52, 'CLUMP4': 1.7274299798276098e-06, 'CLUMP5': 0.0006712999282875999, 'CLUMP6': 0.00023064393861470508, 'CLUMP7': 6.131026040544991e-07, 'CLUMP8': 1.201293541368159e-27, 'CLUMP9': 1.725548199743355e-25}
48
[2.159090909090909, 0.5454545454545454, 0.9886363636363636, 0.9431818181818182, 0.4318181818181818, 1.2045454545454546, 0.2159090909090909, 0.4431818181818182, 0.625, 0.4659090909090909, 0.011363636363636364]
[1.189516129032258, 0.04838709677419355, 0.05241935483870968, 0.7943548387096774, 0.14919354838709678, 0.8225806451612904, 0.06451612903225806, 0.15725806451612903, 0.08064516129032258, 0.03225806451612903, 0.0]


In [9]:
# boxplots 
fig_box = go.Figure()
for c in list(clumps_feature_eff.columns)[:-1]:
    fig_box.add_trace(go.Box(y=clumps_feature_eff[c], name="effectors", marker={"color": "#FF4136"}))
    fig_box.add_trace(go.Box(y=clumps_feature_noneff[c], name="non-effectors", marker={"color": "#3D9970"}))
fig_box.update_layout(boxmode="group",
                     showlegend=False)
fig_box.show()

In [49]:
# PCA
pca = PCA(n_components=4)
components = pca.fit_transform(clumps_feature_eff_noneff[list(clumps_feature_eff_noneff.columns)[:-1]])
comp_to_vis = [1, 2]
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
fig_pca = px.scatter(components, x=comp_to_vis[0]-1, y=comp_to_vis[1]-1, color=clumps_feature_eff_noneff["name"])
fig_pca.update_xaxes(title=f"PC{comp_to_vis[0]} ({pca.explained_variance_ratio_[0] * 100:.1f}%)")
fig_pca.update_yaxes(title=f"PC{comp_to_vis[1]} ({pca.explained_variance_ratio_[1] * 100:.1f}%)")

for i, feature in enumerate(list(clumps_feature_eff_noneff.columns)[:-1]):
    fig_pca.add_shape(type="line", 
                     x0=0, y0=0,
                     x1=loadings[i, 0],
                     y1=loadings[i, 1])
    fig_pca.add_annotation(x=loadings[i, 0],
                          y=loadings[i, 1],
                          ax=0, ay=0,
                          xanchor="center",
                          yanchor="bottom",
                          text=feature)

fig_pca.show()