In [1]:

import os
import time
import math
import json
import pickle

import statistics

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from os import listdir
from os.path import join
from os.path import isfile

from tqdm import tqdm

from itertools import combinations

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


from libsvm.svmutil import svm_problem
from libsvm.svmutil import svm_parameter
from libsvm.svmutil import svm_train
from libsvm.svmutil import svm_predict
from libsvm.svmutil import evaluations

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

from Function import svm_function

In [2]:
def lnc_and_m(lnc_pred_y, lnc_pred_pro, m_pred_y, m_pred_pro):
    """Combine the two models with a logical AND.

    A sample is predicted positive only when both ``lnc_pred_y`` and
    ``m_pred_y`` are truthy. The accompanying score is the element-wise
    maximum of the two score vectors where the combined prediction is
    positive, and the element-wise minimum everywhere else.

    Returns
    -------
    tuple
        ``(pred_y, decision_values)`` as numpy arrays.
    """
    both_positive = np.logical_and(lnc_pred_y, m_pred_y)

    # Stack once so the per-sample max and min come from the same 2 x n array.
    stacked_scores = np.array([lnc_pred_pro, m_pred_pro])
    highest = stacked_scores.max(axis=0)
    lowest = stacked_scores.min(axis=0)

    return both_positive, np.where(both_positive, highest, lowest)

def lnc_or_m(lnc_pred_y, lnc_pred_pro, m_pred_y, m_pred_pro):
    """Combine the two models with a logical OR.

    A sample is predicted positive when either ``lnc_pred_y`` or
    ``m_pred_y`` is truthy. The accompanying score is the element-wise
    maximum of the two score vectors where the combined prediction is
    positive, and the element-wise minimum everywhere else.

    Returns
    -------
    tuple
        ``(pred_y, decision_values)`` as numpy arrays.
    """
    any_positive = np.logical_or(lnc_pred_y, m_pred_y)

    # Stack once so the per-sample max and min come from the same 2 x n array.
    stacked_scores = np.array([lnc_pred_pro, m_pred_pro])
    highest = stacked_scores.max(axis=0)
    lowest = stacked_scores.min(axis=0)

    return any_positive, np.where(any_positive, highest, lowest)

def lnc_high_m(lnc_pred_y, lnc_pred_pro, m_pred_y, m_pred_pro):
    """Per sample, keep the prediction of whichever model is more confident.

    Confidence is the distance of a score from 0.5. The lncRNA prediction
    and score are used where the lncRNA score is strictly farther from 0.5
    than the mRNA score; otherwise (ties included) the mRNA prediction and
    score are used.

    Returns
    -------
    tuple
        ``(pred_y, decision_values)`` as numpy arrays.
    """
    lnc_confidence = abs(np.array(lnc_pred_pro) - 0.5)
    m_confidence = abs(np.array(m_pred_pro) - 0.5)
    use_lnc = lnc_confidence > m_confidence

    chosen_labels = np.where(use_lnc, lnc_pred_y, m_pred_y)
    chosen_scores = np.where(use_lnc, lnc_pred_pro, m_pred_pro)
    return chosen_labels, chosen_scores

def lnc_n_lnc_high_m(lnc_pred_y, lnc_pred_pro, m_pred_y, m_pred_pro):
    """Prefer the lncRNA model when it predicts positive or is more confident.

    The lncRNA prediction and score are kept for a sample when either
    ``lnc_pred_y`` is truthy, or the lncRNA score is strictly farther from
    0.5 than the mRNA score. In all remaining cases the mRNA prediction and
    score are used.

    Returns
    -------
    tuple
        ``(pred_y, decision_values)`` as numpy arrays.
    """
    lnc_more_confident = abs(np.array(lnc_pred_pro) - 0.5) > abs(np.array(m_pred_pro) - 0.5)
    lnc_says_positive = np.array(lnc_pred_y, dtype=bool)
    use_lnc = lnc_more_confident | lnc_says_positive

    chosen_labels = np.where(use_lnc, lnc_pred_y, m_pred_y)
    chosen_scores = np.where(use_lnc, lnc_pred_pro, m_pred_pro)
    return chosen_labels, chosen_scores

In [3]:
def _perf_list(name, y, pred_y, dv):
    """Build one row of binary-classification metrics for a results table.

    Parameters
    ----------
    name : str
        Label stored as the first element of the returned row.
    y : array-like
        True binary labels, encoded as 0 (negative) / 1 (positive).
    pred_y : array-like
        Predicted binary labels with the same encoding.
    dv : array-like
        Continuous scores handed to ``metrics.roc_auc_score``.

    Returns
    -------
    list
        ``[name, AUROC, accuracy, recall, precision, specificity, NPV, F1]``.
        A metric whose denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # Make "undefined" explicit instead of relying on numpy's
        # divide-by-zero warning to produce the NaN.
        return numerator / denominator if denominator else float("nan")

    test_roc_score = metrics.roc_auc_score(y, dv)

    # Pinning the label set keeps the matrix 2x2, so the four-way unpack
    # below cannot fail when a class is missing from both y and pred_y.
    tn, fp, fn, tp = (
        int(count) for count in confusion_matrix(y, pred_y, labels=[0, 1]).ravel()
    )

    acc = _ratio(tn + tp, tn + fp + fn + tp)
    recall = _ratio(tp, fn + tp)
    prec = _ratio(tp, fp + tp)
    spec = _ratio(tn, tn + fp)
    npv = _ratio(tn, tn + fn)
    # Harmonic mean of recall and precision, reusing the values above.
    f1sc = _ratio(2 * recall * prec, recall + prec)
    return [name, test_roc_score, acc, recall, prec, spec, npv, f1sc]

In [None]:
# Train/test feature matrices and labels for the lncRNA data set.
lnc_train_x, lnc_train_y = (
    np.load("data/merge_data/seq_rbp/pse_in_one2__rbp_10-3_-log10__train__x.npy"),
    np.load("data/merge_data/seq_rbp/pse_in_one2__rbp_10-3_-log10__train__y.npy"),
)
lnc_test_x, lnc_test_y = (
    np.load("data/merge_data/seq_rbp/pse_in_one2__rbp_10-3_-log10__test__x.npy"),
    np.load("data/merge_data/seq_rbp/pse_in_one2__rbp_10-3_-log10__test__y.npy"),
)

# Train/test feature matrices and labels for the mRNA data set.
m_train_x, m_train_y = (
    np.load("data/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA_train.npy"),
    np.load("data/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA_train_y.npy"),
)
m_test_x, m_test_y = (
    np.load("data/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA_test.npy"),
    np.load("data/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA_test_y.npy"),
)

In [None]:
# Test split of the nuc_lncRNA / cyto_lncRNA classifier data.
data_path = "data/merge_data/0916chiu/classifier/"
file_str = "nuc_lncRNA__cyto_lncRNA"
test_prefix = data_path + file_str
test_x = np.load(test_prefix + "_test.npy")
test_y = np.load(test_prefix + "_test_y.npy")

# NOTE: pickle.load can execute arbitrary code; only open model files from a trusted source.
lncrna_model_path = "data/model/merge_data/0916chiu/classifier/nuc_lncRNA__cyto_lncRNA__esvm_s10_p10_e512_f10__v230106.pickle"
with open(lncrna_model_path, "rb") as f:
    lncrna_clf = pickle.load(f)

mrna_model_path = "data/model/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA__svm.pickle"
with open(mrna_model_path, "rb") as f:
    mrna_clf = pickle.load(f)

# lncrna_clf.predict is unpacked into (labels, scores); mrna_clf.predict yields a single result.
# NOTE(review): the evaluation cell further down feeds mrna_clf only the first 196 columns of
# test_x, while the full matrix is passed here — confirm this data set needs no slicing.
lnc_pred_y, lnc_pred_pro = lncrna_clf.predict(test_x)
m_pred_y = mrna_clf.predict(test_x)

In [4]:
# lncRNA model for the "rbp_10-3" configuration.
# NOTE: pickle.load can execute arbitrary code; only open model files from a trusted source.
lncrna_model_path = "data/model/merge_data/pse_in_one2__rbp_10-3_-log10__esvm_e512_p10_f10_s10_t1000_hasNu.pickle"
with open(lncrna_model_path, "rb") as f:
    lncrna_clf = pickle.load(f)

# Test features and labels that share the same file prefix.
data_path = "data/merge_data/seq_rbp/"
file_str = "pse_in_one2__rbp_10-3_-log10"
test_prefix = data_path + file_str
test_x = np.load(test_prefix + "__test__x.npy")
test_y = np.load(test_prefix + "__test__y.npy")

In [11]:
# "filter" variant: swaps in the lncRNA model and test data whose file names end in "_filter".
# NOTE: pickle.load can execute arbitrary code; only open model files from a trusted source.
lncrna_model_path = "data/model/merge_data/pse_in_one2__rbp_10-3_-log10_filter__esvm_e512_p10_f10_s10_t1000_hasNu.pickle"
with open(lncrna_model_path, "rb") as f:
    lncrna_clf = pickle.load(f)

# Test features and labels that share the same file prefix.
data_path = "data/merge_data/seq_rbp/"
file_str = "pse_in_one2__rbp_10-3_-log10_filter"
test_prefix = data_path + file_str
test_x = np.load(test_prefix + "__test__x.npy")
test_y = np.load(test_prefix + "__test__y.npy")

In [17]:
# lncRNA model for the "rbp_10-4" configuration.
# NOTE: pickle.load can execute arbitrary code; only open model files from a trusted source.
lncrna_model_path = "data/model/merge_data/pse_in_one2__rbp_10-4_-log10__esvm_e1024_p10_f10_s10_t10000_hasNu.pickle"
with open(lncrna_model_path, "rb") as f:
    lncrna_clf = pickle.load(f)

data_path = "data/merge_data/seq_rbp/"
file_str = "pse_in_one2__rbp_10-4_-log10"
test_x = np.load(data_path + file_str + "__test__x.npy")
# NOTE(review): the labels come from the 10-3 file, not from file_str (10-4).
# Presumably both configurations share the same test samples — confirm, or
# switch to file_str if a matching 10-4 label file exists.
test_y = np.load(data_path + "pse_in_one2__rbp_10-3_-log10__test__y.npy")

In [5]:
# mRNA classifier used alongside the lncRNA model in the evaluation cell.
# NOTE: pickle.load can execute arbitrary code; only open model files from a trusted source.
mrna_model_path = "data/model/merge_data/0916chiu/classifier/nuc_mRNA__cyto_mRNA__svm.pickle"
with open(mrna_model_path, "rb") as f:
    mrna_clf = pickle.load(f)

In [18]:
# Fresh result accumulators; only the lncRNA list is filled in this cell.
lncrna_perf_list = []
mrna_perf_list = []

# Predictions of both models on the currently loaded test set.
# The mRNA classifier only receives the first 196 feature columns.
lnc_pred_y, lnc_pred_pro = lncrna_clf.predict(test_x)
mrna_features = test_x[:, :196]
m_pred_y = mrna_clf.predict(mrna_features)
m_pred_pro = mrna_clf.predict_proba(mrna_features)[:, 1]

# Baseline row: the lncRNA model on its own.
model_str = "lncRNA"
lncrna_perf_list.append(_perf_list(model_str, test_y, lnc_pred_y, lnc_pred_pro))

# One row per rule for combining the lncRNA and mRNA predictions, in table order.
for rule_name, combine in (
    ("lncRNA and mRNA", lnc_and_m),
    ("lncRNA or mRNA", lnc_or_m),
    ("lncRNA high mRNA", lnc_high_m),
    ("lncRNA nuc, lncRNA high mRNA", lnc_n_lnc_high_m),
):
    pred_y, decision_values = combine(lnc_pred_y, lnc_pred_pro, m_pred_y, m_pred_pro)
    lncrna_perf_list.append(_perf_list(rule_name, test_y, pred_y, decision_values))


In [19]:
# One column per element of the rows produced by _perf_list (label + seven metrics).
lncrna_perf_columns = [
    "name",
    "lncRNA AUROC",
    "lncRNA acc",
    "lncRNA recall",
    "lncRNA prec",
    "lncRNA spec",
    "lncRNA npv",
    "lncRNA f1sc",
]
lncrna_perf_df = pd.DataFrame(lncrna_perf_list, columns=lncrna_perf_columns)

In [20]:
# Display the metrics table built from lncrna_perf_list (one row per prediction rule).
lncrna_perf_df

Unnamed: 0,name,lncRNA AUROC,lncRNA acc,lncRNA recall,lncRNA prec,lncRNA spec,lncRNA npv,lncRNA f1sc
0,lncRNA,0.632387,0.659612,0.672549,0.929539,0.54386,0.156566,0.780432
1,lncRNA and mRNA,0.604575,0.363316,0.319608,0.920904,0.754386,0.110256,0.474527
2,lncRNA or mRNA,0.591348,0.72134,0.758824,0.917062,0.385965,0.151724,0.830472
3,lncRNA high mRNA,0.613416,0.643739,0.654902,0.927778,0.54386,0.149758,0.767816
4,"lncRNA nuc, lncRNA high mRNA",0.621723,0.698413,0.721569,0.926952,0.491228,0.164706,0.811466


In [15]:
# Metrics table for the "filter" configuration.
# NOTE(review): presumably this output was produced after loading the "_filter" model and
# test files and re-running the evaluation cell — confirm; the expression itself is the
# same as in the previous display cell.
lncrna_perf_df

Unnamed: 0,name,lncRNA AUROC,lncRNA acc,lncRNA recall,lncRNA prec,lncRNA spec,lncRNA npv,lncRNA f1sc
0,lncRNA,0.672599,0.597518,0.600406,0.907975,0.577465,0.172269,0.722833
1,lncRNA and mRNA,0.647673,0.35461,0.286004,0.921569,0.830986,0.143552,0.436533
2,lncRNA or mRNA,0.663857,0.698582,0.744422,0.892944,0.380282,0.176471,0.811947
3,lncRNA high mRNA,0.683327,0.632979,0.643002,0.91092,0.56338,0.185185,0.753864
4,"lncRNA nuc, lncRNA high mRNA",0.678513,0.656028,0.675456,0.907357,0.521127,0.187817,0.774419


In [21]:
# Persist the current metrics table without the index column.
# NOTE(review): the target name is hard-coded to the 10-4 run; re-running this cell
# after evaluating another model would overwrite it with different results.
perf_csv_path = "data/model/output/PseInOne2_RBP_10-4__lncRNA_mRNA__230410.csv"
lncrna_perf_df.to_csv(perf_csv_path, index=False)

In [None]:
# Column layouts: a row label followed by seven metrics, prefixed per model.
lncrna_perf_columns = [
    "name",
    "lncRNA AUROC",
    "lncRNA acc",
    "lncRNA recall",
    "lncRNA prec",
    "lncRNA spec",
    "lncRNA npv",
    "lncRNA f1sc",
]
mrna_perf_columns = [
    "name",
    "mRNA AUROC",
    "mRNA acc",
    "mRNA recall",
    "mRNA prec",
    "mRNA spec",
    "mRNA npv",
    "mRNA f1sc",
]

lncrna_perf_df = pd.DataFrame(lncrna_perf_list, columns=lncrna_perf_columns)
# NOTE(review): in the visible cells mrna_perf_list is initialised to [] and never
# appended to, so this frame would have no rows — confirm it is filled elsewhere.
mrna_perf_df = pd.DataFrame(mrna_perf_list, columns=mrna_perf_columns)

In [None]:
# Inner-join the two metric tables on the rule name and write the result to disk.
# NOTE(review): with an inner join, an empty mrna_perf_df yields a file with no data rows.
merged_perf_df = lncrna_perf_df.merge(mrna_perf_df, on="name", how="inner")
merged_perf_df.to_csv("data/model/output/test_lncRNA_mRNA__230306.csv", index=False)