In [1]:
import argparse
import random

import joblib
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics  # to calculate the accuracy of the model
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier  # random forest model
from sklearn.inspection import permutation_importance
from sklearn.metrics import (precision_recall_curve, PrecisionRecallDisplay, auc)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [74]:
# MODEL AND INPUT LOCATIONS
# Every path below lives under the classification directory, so it is built from `common_dir`.
common_dir = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/"

# Feature tables of the validation set: effectors never seen by the model, and random (non-effector) proteins.
# (An older single-table input, valset_clumps_feature_preditions20220831.tsv, is no longer used.)
eff_predictions = f"{common_dir}prediction_tools/pred_dataset_validazione20221110/feature_table_eff/feature_table_eff_truly_never_seen20221115.tsv"
non_eff_predictions = f"{common_dir}prediction_tools/pred_dataset_validazione20221110/feature_table_random/feat_tab_sub100_val20221109_random.tsv"

# Model saved with joblib. joblib.load unpickles the file, so only load files you trust.
loaded_model = joblib.load(
    f"{common_dir}random_forest/rf_sp_tm_mb_mod20220816/nb_random_forest_best_model.pkl"
)

In [69]:
# Columns kept for the model, in the order used downstream.
# The first entry ('name') holds the label/identifier; later cells pass the remaining columns to the model.
features = [
    "name",
    "sequence length",
    "signal peptide",
    "transmembrane domain",
    "aa in tr domain",
    "first 60 aa",
    "prob N-in",
    "warning signal sequence",
    "MobiDB-lite",
    "ASN_GLYCOSYLATION",
    "CAMP_PHOSPHO_SITE",
    "CK2_PHOSPHO_SITE",
    "PKC_PHOSPHO_SITE",
    "MYRISTYL",
    "PROKAR_LIPOPROTEIN_L=0",
    "TYR_PHOSPHO_SITE_1",
    "TYR_PHOSPHO_SITE_2",
    "AMIDATION",
    "EF_HAND_1",
    "ASN_RICH_L=0",
]

In [100]:
# SINGLE-CLASS USAGE
# Score a feature table whose rows all belong to one known class.
class_name = "effectors"
class_file = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_dataset_validazione20221110/feature_table_eff/feature_table_eff_truly_never_seen20221115.tsv"

# Untouched copy: keeps the original protein identifiers in the "name" column.
data_pred_df_original = pd.read_csv(class_file, sep="\t", header=0)

# Working copy: "name" is overwritten with the known class label, booleans become "1"/"0" strings.
data_pred_df = data_pred_df_original.copy()
data_pred_df["name"] = [class_name] * len(data_pred_df)
data_pred_df = data_pred_df.replace([True, False], ["1", "0"])

In [89]:
# PUTATIVE EFFECTORS MADE EXPLICIT
# Stack the raw effector and non-effector feature tables (only the columns they share),
# keeping the original protein identifiers in "name"; booleans become "1"/"0" strings.
eff_pred_df_original = pd.read_csv(eff_predictions, sep="\t", header=0)
non_eff_pred_df_original = pd.read_csv(non_eff_predictions, sep="\t", header=0)
data_pred_df_original = pd.concat(
    [eff_pred_df_original, non_eff_pred_df_original],
    ignore_index=True,
    join="inner",
).replace([True, False], ["1", "0"])

print(data_pred_df_original)

# Identifiers listed in the UniProt "Putative" query export, one per line.
ids = []
with open("/home/giulia/Workspace/PhytoPhD/effectors_analysis/dataset_validazione20221110/uniprot-download_true_format_list_query__28Putative_29-2022.11.15-15.57.14.63.list",
          "r") as only_putative_ids:
    ids.extend(line.replace("\n", "") for line in only_putative_ids)
    print(ids)

           name  sequence length  signal peptide  transmembrane domain  \
0    A0A1Q1NH89              112           0.000                     1   
1    A0A1Q1NH90              112           0.000                     1   
2    A0A1Q1NH91              112           0.000                     1   
3    A0A1Q1NH92              112           0.000                     1   
4    A0A1Q1NH98              112           0.000                     1   
..          ...              ...             ...                   ...   
153      Q6YR72              234           0.472                     0   
154      R4RKY6              318           0.254                     0   
155      R4RNE6              156           0.153                     0   
156      R4RXP0              486           0.196                     0   
157      R4S1N5              144           0.137                     0   

0           22.14889     22.14889    0.85374                       0   
1           22.14889     22.14889    0.

In [70]:
# BUILD THE LABELLED VALIDATION SET (EFFECTORS + NON-EFFECTORS)

# Original protein identifiers, effectors first and non-effectors second (same order as the combined frame).
original_protein_names = pd.concat(
    [pd.read_csv(table, sep="\t", header=0)[["name"]] for table in (eff_predictions, non_eff_predictions)],
    ignore_index=True,
    join="inner",
)

# Feature tables for the Random Forest: the "name" column is replaced by the known class label.
eff_pred_df = pd.read_csv(eff_predictions, sep="\t", header=0)
eff_pred_df["name"] = ["effectors"] * len(eff_pred_df)

non_eff_pred_df = pd.read_csv(non_eff_predictions, sep="\t", header=0)
non_eff_pred_df["name"] = ["non_effectors"] * len(non_eff_pred_df)

# Any column found only in the effector table is added to the non-effector table filled with 0,
# so it survives the inner join below.
effector_only_cols = [
    column for column in eff_pred_df.columns[1:]
    if column not in non_eff_pred_df.columns[1:]
]
for column in effector_only_cols:
    non_eff_pred_df[column] = [0] * len(non_eff_pred_df)

# Every expected feature must exist in both tables; absent ones are filled with 0.
for frame in (eff_pred_df, non_eff_pred_df):
    for feature in features:
        if feature not in list(frame.columns):
            frame[feature] = [0] * len(frame)

# Stack the two tables (join="inner" keeps only the common columns) and turn booleans into "1"/"0".
data_pred_df = pd.concat(
    [eff_pred_df, non_eff_pred_df],
    ignore_index=True,
    join="inner",
).replace([True, False], ["1", "0"])
print(data_pred_df)

              name  sequence length  signal peptide  transmembrane domain  \
0        effectors              112           0.000                     1   
1        effectors              112           0.000                     1   
2        effectors              112           0.000                     1   
3        effectors              112           0.000                     1   
4        effectors              112           0.000                     1   
..             ...              ...             ...                   ...   
153  non_effectors              234           0.472                     0   
154  non_effectors              318           0.254                     0   
155  non_effectors              156           0.153                     0   
156  non_effectors              486           0.196                     0   
157  non_effectors              144           0.137                     0   

0           22.14889     22.14889    0.85374                       0   
1  

In [101]:
# ALIGN THE TABLE WITH THE MODEL'S FEATURE LIST

# 1) discard every column the model does not know about
cols_to_drop = [column for column in data_pred_df.columns if column not in features]
data_pred_df = data_pred_df.drop(columns=cols_to_drop)

# 2) add the expected features that are absent from this table, filled with 0
for col in [feature for feature in features if feature not in data_pred_df.columns]:
    data_pred_df[col] = [0] * len(data_pred_df)

# 3) put the columns in the order given by `features`
data_pred_df = data_pred_df[features]
len(data_pred_df)

58

In [102]:
# PROBABILITIES ON PREDICTIONS
# The first column of data_pred_df is the known class label ("name"); the rest are the model features.
feature_cols = list(data_pred_df.columns)[1:]
y_proba = loaded_model.predict_proba(data_pred_df[feature_cols])
pred_proba_df = pd.DataFrame(y_proba, columns=loaded_model.classes_)

# Take the protein identifiers from data_pred_df_original, the same frame the classification cell uses.
# Previously they came from `original_protein_names`, which is built in a different cell and can
# describe a different (longer) set of rows; index alignment would then silently attach names
# from the wrong table.
protein_names = data_pred_df_original["name"].to_numpy()
if len(protein_names) != len(pred_proba_df):
    raise ValueError(
        f"data_pred_df_original has {len(protein_names)} rows but {len(pred_proba_df)} rows were scored; "
        "re-run the loading cells so both frames describe the same proteins"
    )

# Rows of y_proba are positional, so attach both columns positionally.
pred_proba_df["name"] = protein_names
pred_proba_df["known_clf"] = data_pred_df["name"].to_numpy()
pred_proba_df.to_csv(f"{common_dir}random_forest/rf_dataset_validazione_sp_tm_mb_mod20221116/predictions_probability_on_effTNS_20221116.tsv",
                    sep="\t",
                    index=True)

pred_proba_df

Unnamed: 0,effectors,non_effectors,name,known_clf
0,0.569355,0.430645,A0A1Q1NH89,effectors
1,0.569355,0.430645,A0A1Q1NH90,effectors
2,0.569355,0.430645,A0A1Q1NH91,effectors
3,0.569355,0.430645,A0A1Q1NH92,effectors
4,0.569355,0.430645,A0A1Q1NH98,effectors
5,0.569355,0.430645,A0A1Q1NH99,effectors
6,0.569355,0.430645,A0A1Q1NHA1,effectors
7,0.569355,0.430645,A0A1Q1NHA2,effectors
8,0.569355,0.430645,A0A1Q1NHA3,effectors
9,0.569355,0.430645,A0A1Q1NHA5,effectors


In [103]:
# APPLY TO REAL DATA

# Tag each scored protein: "putative_effectors" when its identifier is in `ids`, "effectors" otherwise.
# NOTE(review): when data_pred_df_original also contains non-effectors they are tagged "effectors" too;
# confirm whether this cell is meant for the single-class run only.
putative_ids = set(ids)  # set membership instead of scanning the list for every row
data_pred_df_putative_exp_col = [
    "putative_effectors" if protein in putative_ids else "effectors"
    for protein in data_pred_df_original["name"]
]

# The first column of data_pred_df is the known class label; the rest are the model features.
feature_cols = list(data_pred_df.columns)[1:]
clf = loaded_model.predict(data_pred_df[feature_cols])
clf_df = pd.DataFrame(list(zip(list(data_pred_df_original["name"]), data_pred_df_putative_exp_col, clf)),
                              columns=["uniprot_name", "class", "RF_model_classification"])

class_counts = clf_df["RF_model_classification"].value_counts()
print(class_counts)
clf_df.to_csv(f"{common_dir}random_forest/rf_dataset_validazione_sp_tm_mb_mod20221116/clf_on_effTNS_20221116.tsv",
             sep="\t",
             index=False)

# Performance against the known labels stored in data_pred_df["name"].
acc = metrics.accuracy_score(data_pred_df["name"], clf)
conf_matrix = metrics.confusion_matrix(data_pred_df["name"], clf)
# Binary encoding: non_effectors -> 0, anything else -> 1 (positive class).
known_hot_encode = [0 if el == "non_effectors" else 1 for el in data_pred_df["name"]]
clf_hot_encode = [0 if el == "non_effectors" else 1 for el in clf]
precision_perc = metrics.precision_score(known_hot_encode, clf_hot_encode, pos_label=1) # calculated on positive class
recall_perc = metrics.recall_score(known_hot_encode, clf_hot_encode, pos_label=1) # calculated on positive class
print(f"accuracy: {acc}")
print(f"conf_matrix:\n{conf_matrix}")
print(f"precision: {precision_perc}")
print(f"recall: {recall_perc}")

print(clf_df)

with open(f"{common_dir}random_forest/rf_dataset_validazione_sp_tm_mb_mod20221116/summary_performances_on_effTNS_20221116.tsv", "w") as out:
    # One "<class>\t<count>" line per predicted class. Iterating the counts (instead of indexing
    # positions 0 and 1) also works when the model predicts a single class, and avoids integer
    # lookups on a label-indexed Series.
    for predicted_class, n_proteins in class_counts.items():
        out.write(f"{predicted_class}\t{n_proteins}\n")
    out.write(f"accuracy\t{acc}\nconf_matrix:\n{conf_matrix}\nprecision\t{precision_perc}\nrecall\t{recall_perc}")

effectors        29
non_effectors    29
Name: RF_model_classification, dtype: int64
accuracy: 0.5
conf_matrix:
[[29 29]
 [ 0  0]]
precision: 1.0
recall: 0.5
   uniprot_name               class RF_model_classification
0    A0A1Q1NH89           effectors               effectors
1    A0A1Q1NH90           effectors               effectors
2    A0A1Q1NH91           effectors               effectors
3    A0A1Q1NH92           effectors               effectors
4    A0A1Q1NH98           effectors               effectors
5    A0A1Q1NH99           effectors               effectors
6    A0A1Q1NHA1           effectors               effectors
7    A0A1Q1NHA2           effectors               effectors
8    A0A1Q1NHA3           effectors               effectors
9    A0A1Q1NHA5           effectors               effectors
10   A0A1Q1NHA6           effectors               effectors
11   A0A1Q1NHA7           effectors               effectors
12   A0A1S2NIQ0           effectors               effectors
13 

In [8]:
########################################## OLD #################################################################
# Evaluate the model on known effectors only: every row of pred_data_eff is a true effector.
known = ["effectors"] * len(pred_data_eff)

# The first column of pred_data_eff is "name"; the remaining columns are the model features.
eff_feature_cols = list(pred_data_eff.columns)[1:]
clf = loaded_model.predict(pred_data_eff[eff_feature_cols])
clf_df = pd.DataFrame(
    list(zip(list(pred_data_eff["name"]), clf)),
    columns=["name", "RF_model_classification"],
)

print(clf_df["RF_model_classification"].value_counts())
# (the table used to be saved as rf_dataset_validazione20220805/putative_eff_eff_clf.tsv; export disabled)

# Binary encoding for precision/recall: non_effectors -> 0, anything else -> 1 (positive class).
known_hot_encode = [int(label != "non_effectors") for label in known]
clf_hot_encode = [int(label != "non_effectors") for label in clf]

acc = metrics.accuracy_score(known, clf)
conf_matrix = metrics.confusion_matrix(known, clf)
precision_perc = metrics.precision_score(known_hot_encode, clf_hot_encode, pos_label=1)
recall_perc = metrics.recall_score(known_hot_encode, clf_hot_encode, pos_label=1)

print(f"accuracy: {acc}")
print(f"conf_matrix:\n{conf_matrix}")
print(f"precision: {precision_perc}")
print(f"recall: {recall_perc}")

effectors        26
non_effectors    12
Name: RF_model_classification, dtype: int64
accuracy: 0.6842105263157895
conf_matrix:
[[26 12]
 [ 0  0]]
precision: 1.0
recall: 0.6842105263157895


In [None]:
# PERMUTATION IMPORTANCE ON THE KNOWN-EFFECTOR ROWS
# The first column of pred_data_eff is "name" (used here as the target); the rest are the features.
X_perm = pred_data_eff[list(pred_data_eff.columns)[1:]]
y_perm = pred_data_eff["name"]

# Fit an unfitted copy (same hyper-parameters) instead of calling loaded_model.fit(...):
# fitting the loaded estimator would overwrite the trained model in place, and every later
# predict / predict_proba call would silently use a model trained on these validation rows.
clf_fit = clone(loaded_model).fit(X_perm, y_perm)

# NOTE(review): importance is measured on the same rows the copy was fitted on — confirm this is intended.
perm_import = permutation_importance(clf_fit, X_perm, y_perm,
                                    n_repeats=30, random_state=42, scoring='accuracy')
perm_import

In [None]:
### HYDROPHOBIC PROFILE
def _scaled_profile(values):
    """Min-max scale `values` to [0, 1], then divide by the number of elements.

    Both arrays go through this single helper so each one is scaled with its
    own minimum and maximum. The previous inline version divided `l1` by
    `l1.max() - l.min()`, i.e. it used the other array's minimum.

    A constant array has no range to scale by; it is returned as all zeros.
    """
    span = values.max() - values.min()
    if span == 0:
        return np.zeros(len(values))
    return ((values - values.min()) / span) / len(values)


l = np.array([1, 2, 4, 10, 53, 8, 20, 88])
l1 = np.array([5, 7, 8, 1])

# Summary statistics of the raw (unscaled) values.
l_mean = l.mean()
l_std = l.std()
l1_mean = l1.mean()
l1_std = l1.std()

# Scale each profile with its own min/max and length.
l = _scaled_profile(l)
l1 = _scaled_profile(l1)

# Two bins per profile: mean of the leading part and mean of the trailing part.
bins = [[l[:5].mean(), l[5:].mean()], [l1[:3].mean(), l1[3:].mean()]]
print(l)
print(l1)
print(bins)

In [20]:
# Known effectors
# There are no direct predictions for them: pick the matching rows out of the feature table in data_pred_df.
data_eff = SeqIO.parse("/home/giulia/Workspace/PhytoPhD/effectors_analysis/dataset_validazione20220805/uniprot_effectors_never_seen_byRF20220805.fasta", "fasta")

# The identifier is the second "|"-separated field of each FASTA record id.
name_eff = [record.id.split("|")[1] for record in data_eff]

# Position of the first occurrence of every name in the feature table.
name_eff_putative_eff = list(data_pred_df["name"])
first_position = {}
for position, protein in enumerate(name_eff_putative_eff):
    first_position.setdefault(protein, position)

# Row positions of the effectors that are present in the table, in FASTA order.
index = [first_position[protein] for protein in name_eff if protein in first_position]

pred_data_eff = data_pred_df.iloc[index]

len(pred_data_eff)


          name  sequence length  signal peptide  aa in tr domain  first 60 aa  \
0   A0A0G7ZN61              302           0.405          0.08054      0.00038   
1   A0A1Q1NH89              112           0.886         22.14889     22.14889   
2   A0A1Q1NH90              112           0.886         22.14889     22.14889   
3   A0A1Q1NH91              112           0.886         22.14889     22.14889   
4   A0A1Q1NH92              112           0.886         22.14889     22.14889   
5   A0A1Q1NH98              112           0.886         22.14889     22.14889   
6   A0A1Q1NH99              112           0.886         22.14889     22.14889   
7   A0A1Q1NHA1              112           0.886         22.14889     22.14889   
8   A0A1Q1NHA2              112           0.886         22.14889     22.14889   
9   A0A1Q1NHA3              112           0.886         22.14889     22.14889   
10  A0A1Q1NHA5              112           0.886         22.14889     22.14889   
11  A0A1Q1NHA6              

38