In [None]:
! pip install plotly 

In [37]:
import numpy as np
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from numpy.linalg import det
from scipy.stats import ttest_ind
from sklearn.decomposition import PCA

# Input files

In [22]:
# INPUT FILES 
common_dir = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/"
eff_dataset = pd.read_csv(f"{common_dir}prediction_recup20220824/numeric_eff_feature_prediction.tsv",
                         sep="\t")
non_eff_dataset = pd.read_csv(f"{common_dir}prediction_recup20220824/numeric_non_eff_feature_prediction.tsv",
                             sep="\t")
dataset = pd.read_csv(f"{common_dir}prediction_recup20220824/numeric_eff_noneff_feature_prediction.tsv", 
                              sep="\t")
nf_cols_to_drop = []

for f in eff_dataset:
    if f not in list(non_eff_dataset.columns):
        non_eff_dataset[f] = [0]*len(non_eff_dataset)
for nf in non_eff_dataset:
    if nf not in list(eff_dataset):
        nf_cols_to_drop.append(nf)
non_eff_dataset = non_eff_dataset.drop(columns=nf_cols_to_drop, axis=1)
non_eff_dataset = non_eff_dataset[list(eff_dataset.columns)]
       

True

# Features

In [8]:
# RECUP OF FEATURES
base_features_names = ['sequence length', 'signal peptide', 'transmembrane domain',
                     'aa in tr domain', 'first 60 aa', 'prob N-in',
                     'warning signal sequence', 'MobiDB-lite', 'ASN_GLYCOSYLATION',
                     'CAMP_PHOSPHO_SITE', 'CK2_PHOSPHO_SITE', 'PKC_PHOSPHO_SITE', 'MYRISTYL',
                     'PROKAR_LIPOPROTEIN_L=0', 'TYR_PHOSPHO_SITE_1', 'TYR_PHOSPHO_SITE_2',
                     'AMIDATION', 'EF_HAND_1', 'ASN_RICH_L=0']

# T-test

In [29]:
# STATISTICAL SIGNIFICANT DIFFERENCE BETWEEN +/- SAMPLES IN TERMS OF FEATURE DISTRIBUTION
## T-test to compare the means of the eff_feature_distribution and non_eff_feature_distribution
sig = {}
for f in base_features_names:
    tt = ttest_ind(eff_dataset[f], non_eff_dataset[f])[1]
    if tt <= 0.05:
        sig[f] = f"{tt:.4f}"
    else:
        pass

# Pearson Correlation Coefficient
## for each pair of features


In [53]:
dataset.corr()
# dataset[["transmembrane domain", "warning signal sequence"]]

Unnamed: 0,sequence length,signal peptide,transmembrane domain,aa in tr domain,first 60 aa,prob N-in,warning signal sequence,MobiDB-lite,ASN_GLYCOSYLATION,CAMP_PHOSPHO_SITE,CK2_PHOSPHO_SITE,PKC_PHOSPHO_SITE,MYRISTYL,PROKAR_LIPOPROTEIN_L=0,TYR_PHOSPHO_SITE_1,TYR_PHOSPHO_SITE_2,AMIDATION,EF_HAND_1,ASN_RICH_L=0
sequence length,1.0,-0.379859,-0.186416,-0.179865,-0.288955,-0.595215,-0.291481,-0.079571,-0.076468,-0.142503,0.096136,-0.056673,0.230737,-0.038949,0.024122,0.018597,-0.031357,-0.031743,-0.028751
signal peptide,-0.379859,1.0,0.619331,0.598837,0.845275,0.659062,0.849879,0.151167,0.305002,0.154714,0.301528,-0.09017,-0.298921,0.134649,-0.026242,0.007412,-0.055176,0.056425,0.080687
transmembrane domain,-0.186416,0.619331,1.0,0.989459,0.800898,0.412232,0.759405,0.057756,0.247425,-0.014185,0.263708,-0.186025,-0.038349,0.0798,-0.040407,-0.000634,0.00386,0.056353,0.056353
aa in tr domain,-0.179865,0.598837,0.989459,1.0,0.790504,0.398206,0.743682,0.053044,0.229334,-0.006433,0.247686,-0.173485,-0.026857,0.076575,-0.053406,-0.009295,0.011357,0.05295,0.046711
first 60 aa,-0.288955,0.845275,0.800898,0.790504,1.0,0.604435,0.992734,0.112423,0.375595,-0.005821,0.476188,-0.18365,-0.324904,0.124606,-0.044296,0.009506,-0.064495,0.086521,0.07713
prob N-in,-0.595215,0.659062,0.412232,0.398206,0.604435,1.0,0.62198,-0.04361,0.299459,0.163697,0.219733,-0.136568,-0.396043,0.099784,-0.028276,0.007585,-0.050703,0.053094,0.07167
warning signal sequence,-0.291481,0.849879,0.759405,0.743682,0.992734,0.62198,1.0,0.101689,0.392741,-0.00412,0.497977,-0.201975,-0.346241,0.127039,-0.032224,0.021023,-0.084154,0.089713,0.089713
MobiDB-lite,-0.079571,0.151167,0.057756,0.053044,0.112423,-0.04361,0.101689,1.0,-0.106468,0.024874,-0.137246,0.254514,-0.018696,-0.028139,-0.052947,-0.077008,0.156534,0.100116,-0.019871
ASN_GLYCOSYLATION,-0.076468,0.305002,0.247425,0.229334,0.375595,0.299459,0.392741,-0.106468,1.0,-0.151522,0.346008,-0.407399,-0.336834,0.077711,-0.033886,0.165057,-0.202218,-0.018866,0.043916
CAMP_PHOSPHO_SITE,-0.142503,0.154714,-0.014185,-0.006433,-0.005821,0.163697,-0.00412,0.024874,-0.151522,1.0,-0.218067,-0.095733,-0.074847,-0.046095,-0.097149,-0.037876,-0.067086,-0.032551,-0.032551


# Box-plots 

In [25]:
# BOX PLOT - FEATURE DISTRIBUTION COMPARISON BETWEEN CLASSES
for f in base_features_names:
    fig_box = go.Figure()
    
    # a boxplot for each feature comparing effectors and non-effectors
    fig_box.add_trace(go.Box(y=eff_dataset[f], name="effectors"))
    fig_box.add_trace(go.Box(y=non_eff_dataset[f], name="non_effectors"))
    fig_box.update_layout(title=f)
    fig_box.show()
    # fig_box.write_image(f"{common_dir}features_exploratory_analysis/{f}-feature_exploratory_analysis_box.svg",
    #                     width=1920, height=1080)

# Distplots 
#### rivedere per errore determinante = 0 

In [49]:
 # DISTPLOT - FEATURE DISTRIBUTION COMPARISON BETWEEN CLASSES
for f in base_features_names:
    fig_dist = ff.create_distplot([eff_dataset[f], non_eff_dataset[f]], ["effectors", "non_effectors"])
    fig_dist.update_layout(title=f)
    
    fig_dist.show()
    # fig_dist.write_image(f"{common_dir}features_exploratory_analysis/{f}-feature_exploratory_analysis_distplot.svg",
    #                     width=1920, height=1080)

LinAlgError: singular matrix

# PCA


In [56]:
## PCA

features = dataset[list(dataset.columns)[3:]]
pca = PCA(n_components=2)
components = pca.fit_transform(features)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
fig_pca = px.scatter(components, x=0, y=1, color=dataset["ID"]) # 0 = first PC, 1 = second PC
fig_pca.update_xaxes(title=f"PC1 ({pca.explained_variance_ratio_[0] * 100:.1f}%)")
fig_pca.update_yaxes(title=f"PC2 ({pca.explained_variance_ratio_[1] * 100:.1f}%)")
for i, feature in enumerate(list(features.columns)):
    fig_pca.add_shape(type="line",
                      x0=0, y0=0,
                      x1=loadings[i, 0],
                      y1=loadings[i, 1])
    fig_pca.add_annotation(x=loadings[i, 0],
                           y=loadings[i, 1],
                           ax=0, ay=0,
                           xanchor="center",
                           yanchor="bottom",
                           text=feature)
fig_pca.show()