In [1]:
! pip3 install pandas joblib scipy scikit-learn biopython imblearn




In [10]:
import pandas as pd
import numpy as np
import joblib
import argparse
from sklearn.ensemble import RandomForestClassifier  # random forest model
from sklearn.model_selection import GridSearchCV
from sklearn import metrics  # to calculate the accuracy of the model
from sklearn.metrics import (precision_recall_curve, PrecisionRecallDisplay, auc)
from sklearn.model_selection import StratifiedKFold
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
from imblearn.under_sampling import RandomUnderSampler
import random

In [43]:
# INPUTs
eff_predictions = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_eff_202202/effector_protein_feature_prediction.tsv"
non_eff_predictions = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/prediction_tools/pred_non_eff_selection202206/202206non_effector_protein_feature_prediction.tsv"
real_data = "/home/giulia/Workspace/PhytoPhD/caliag/PMs/PM7/features_prediction/CaPm7_contig01_protein_features_prediction20220727.tsv"


# OUTPUTs
model_output_path = "/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/random_forest/prove_20220727"

In [27]:
# DATASET 

## EFF PREDICTIONS DF for Random Forest
eff_pred_df = pd.read_csv(eff_predictions, sep="\t", header=0)
eff_pred_df["name"] = ["effectors"] * len(eff_pred_df)
eff_pred_df = eff_pred_df.drop(columns=["ID", "organism"], axis=1)

## NON-EFF PREDICTIONS DF for Random Forest
non_eff_pred_df = pd.read_csv(non_eff_predictions, sep="\t", header=0)
non_eff_pred_df["name"] = ["non_effectors"] * len(non_eff_pred_df)

## COMBINE DFs (taking only effector motifs)
for col in eff_pred_df.columns[1:]:
    if col not in non_eff_pred_df.columns[1:]:
        # if an effector motifs is not present in non effectors, a new column will be added to the non_effector_df
        # having value False or 0 in this case
        non_eff_pred_df[col] = [0] * len(non_eff_pred_df)
    else:
        pass

# CONCAT AND CLEAN UP THE DATAFRAME FOR THE TRAINING
# "inner" will take only common cols
dataset = pd.concat([eff_pred_df, non_eff_pred_df], ignore_index=True, join="inner")
dataset.replace([True, False], [1, 0], inplace=True)

# drop the column/s having the same value for all samples == ZERO VARIANCE / ALMOST ZERO VARIANCE
datset = dataset.drop(columns="EF_HAND_1_L=(-1)", axis=1)

dataset 

Unnamed: 0,name,sequence length,signal peptide,transmembrane domain,Phob signal peptide,Phob transmembrane domain,disordered regions,ASN_GLYCOSYLATION,CAMP_PHOSPHO_SITE,CK2_PHOSPHO_SITE,PKC_PHOSPHO_SITE,MYRISTYL,TYR_PHOSPHO_SITE_1,AMIDATION,EF_HAND_1_L=(-1)
0,effectors,125,1,1,0,1,0,1,1,1,0,0,0,0,0
1,effectors,125,1,1,0,1,0,1,1,1,0,0,0,0,0
2,effectors,125,1,1,1,0,0,1,1,1,0,0,0,0,0
3,effectors,125,1,1,1,0,0,1,1,1,1,0,0,0,0
4,effectors,125,1,1,1,0,0,1,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,non_effectors,54,0,0,0,0,0,0,0,1,0,0,0,0,0
336,non_effectors,86,0,0,0,0,0,0,0,0,1,1,0,0,0
337,non_effectors,124,0,0,0,0,0,1,0,0,1,1,0,0,0
338,non_effectors,88,0,0,0,0,0,1,0,0,1,0,0,0,0


In [35]:
## CREATE THE RANDOM FOREST CLASSIFIER
### STRATIFIED SHUFFLE SPLIT K-fold CROSS-VALIDATION
#### ITERATIVELY ESTIMATE THE BEST NUMBER OF DECISION TREES IN THE FOREST FOR EACH FOLD

# shuffling is useful because positive and negative elements are ordered in the dataframe and this will help to
# randomly pick the elements for each fold

skf5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
i = 1

# take trace of all evaluation parameters for each k-fold
kf_acc = []
kf_auc = []
kf_prec = []
kf_rec = []
kf_best_params = []

skf5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
i = 1

# SPLITTING
for train_index, test_index in skf5.split(common_df, list(common_df["name"])):
    
    x_train = common_df.iloc[train_index].loc[:, list(common_df.columns)[1:]]  # all cols except the name_col
    x_test = common_df.iloc[test_index][list(common_df.columns)[1:]]
    y_train = common_df["name"].iloc[train_index]  # only the name_col
    y_test = common_df["name"].iloc[test_index]

    # kth-fold RANDOM FOREST
    clf = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimators = number of trees in the forest
    
    ## SELECT BEST n_estimators on the TRAINING SET
    print("BEGIN best parameters-n_trees selection for Random Forest")
    params_to_test = {"n_estimators": [50, 75, 100, 1000, 5000]}
    grid_search = GridSearchCV(clf, params_to_test, n_jobs=4)
    grid_search.fit(x_train, y_train)
    print("FINISH - best parameters-n_trees selection")
    best_params = grid_search.best_params_
    clf = RandomForestClassifier(**best_params, random_state=42)
    kf_best_params.append(best_params["n_estimators"])
    ## TRAIN THE MODEL USING kth-FOLD TRAINING SETS
    clf.fit(x_train, y_train)
    
    ## APPLY THE MODEL ON TEST SET
    y_pred = clf.predict(x_test)

    ## ACCURACY
    kth_accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Training phase accuracy for the fold no. {i}: {kth_accuracy}")
    kf_acc.append(kth_accuracy)

    ## F-measure
    f_score = metrics.f1_score(y_test, y_pred, average="macro")
    print(f"\nF-Measure:\t{f_score}")
    
    ### predict probability
    y_pred_prob = clf.predict_proba(x_test)

    ## PRECISION AND RECALL
    y_test_hot_encode = [0 if el == "non_effectors" else 1 for el in y_test]
    y_pred_hot_encode = [0 if el == "non_effectors" else 1 for el in y_pred]
    precision_perc = metrics.precision_score(y_test_hot_encode, y_pred_hot_encode, pos_label=1)
    recall_perc = metrics.recall_score(y_test_hot_encode, y_pred_hot_encode, pos_label=1)
    print(f"Precision (TP/TP+FP) on training phase: {precision_perc}")
    print(f"Recall (TP/TP+FN) on training phase: {recall_perc}")
    precision, recall, _ = precision_recall_curve(y_test_hot_encode, y_pred_hot_encode, pos_label=1)
    print(f"Precision coordinates: {precision}\nRecall(Sensitivity) coordinates: {recall}")
    kf_prec.append(precision_perc)
    kf_rec.append(recall_perc)
    ## Area Under the Curve
    pr_auc = auc(recall, precision)
    kf_auc.append(pr_auc)
    # disp = PrecisionRecallDisplay(precision=precision, recall=recall)
    # disp.plot()
    print(f"Area under the curve is: {pr_auc}")

    ## CONFUSION MATRIX
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(f"\nconfusion matrix on k-fold test set\n{conf_matrix}")

    ## FEATURE IMPORTANCE
    feat_imp = pd.Series(clf.feature_importances_, index=list(common_df.columns)[1:]).sort_values(ascending=False)
    print(f"Feature importance:\n{feat_imp}")
    i += 1

print(f"Averaged accuracy for {i-1}-fold cross validation: {np.mean(kf_acc)}")
print(f"Averaged precision for {i-1}-fold cross validation: {np.mean(kf_prec)}")
print(f"Averaged recall for {i-1}-fold cross validation: {np.mean(kf_rec)}")
print(f"Averaged best number of tree in the forest: {np.mean(kf_best_params)}")


BEGIN best parameters-n_trees selection for Random Forest
FINISH - best parameters-n_trees selection
Training phase accuracy for the fold no. 1: 0.9852941176470589

F-Measure:	0.9807637906647807
Precision (TP/TP+FP) on training phase: 1.0
Recall (TP/TP+FN) on training phase: 0.9444444444444444
Precision coordinates: [0.26470588 1.         1.        ]
Recall(Sensitivity) coordinates: [1.         0.94444444 0.        ]
Area under the curve is: 0.9795751633986928

confusion matrix on k-fold test set
[[17  1]
 [ 0 50]]
Feature importance:
signal peptide               0.296711
Phob signal peptide          0.244075
transmembrane domain         0.124910
sequence length              0.108948
ASN_GLYCOSYLATION            0.050240
CK2_PHOSPHO_SITE             0.042070
disordered regions           0.031995
Phob transmembrane domain    0.031890
MYRISTYL                     0.030327
PKC_PHOSPHO_SITE             0.022845
CAMP_PHOSPHO_SITE            0.008583
TYR_PHOSPHO_SITE_1           0.004042
AMI

In [42]:
rf_classifier = RandomForestClassifier(n_estimators=int(np.mean(kf_best_params)), random_state=42)
rf_classifier.fit(common_df[list(common_df.columns)[1:]], common_df["name"])
joblib.dump(rf_classifier, f"{model_output_path}/nb_random_forest_best_model.pkl")

['/home/giulia/Workspace/PhytoPhD/effectors_analysis/classification/random_forest/prove_20220727/nb_random_forest_best_model.pkl']

In [56]:
# PARSE THE REAL DATA
real_data_df = pd.read_csv(real_data, sep="\t", header=0)
real_data_df.replace([True, False], [1, 0], inplace=True)
cols_to_drop = []
for col in list(real_data_df.columns):
    if col not in list(eff_pred_df.columns):
        cols_to_drop.append(col)
    else:
        pass
    
real_data_df = real_data_df.drop(columns=cols_to_drop, axis=1)

for col in list(eff_pred_df.columns)[1:]:
    if col not in real_data_df:
        real_data_df[col] = [0] * len(real_data_df)
    else:
        pass

real_data_df = real_data_df.drop(columns="EF_HAND_1_L=(-1)", axis=1)

In [61]:
# APPLY THE MODEL TO REAL DATA
clf = rf_classifier.predict(real_data_df[list(real_data_df.columns)[1:]])
clf_df = pd.DataFrame(list(zip(list(real_data_df["name"]), clf)),
                              columns=["name", "RF_model_classification"])

print(clf_df["RF_model_classification"].value_counts())
clf_df

non_effectors    431
effectors         31
Name: RF_model_classification, dtype: int64


Feature names must be in the same order as they were in fit.



Unnamed: 0,name,RF_model_classification
0,CaPm7_contig01_0001,effectors
1,CaPm7_contig01_0002,non_effectors
2,CaPm7_contig01_0003,non_effectors
3,CaPm7_contig01_0004,non_effectors
4,CaPm7_contig01_0005,non_effectors
...,...,...
457,CaPm7_contig01_0458,effectors
458,CaPm7_contig01_0459,non_effectors
459,CaPm7_contig01_0460,effectors
460,CaPm7_contig01_0461,non_effectors
