# Benchmark
Adopt the RNAlight code, mostly.

Input files: one for cytoplasmic lncRNA, the other for nuclear lncRNA.   
Input file format: tab-delimited lines of 3 fields: transcipt ID, gene name, RNA sequence.    
Header line: ensembl_transcript_id name cdna 
Data lines: ENST00000371086	DLEU2L	GAAAGTTTTCACTGCATCT... 
Each lncRNA is placed in either file, depending on mean CNRCI over 14 cell lines from lncATLAS.   
The threshold is zero; positive CNRCI values are cytoplasmic and others are nuclear.    
Use the Ensembl transcript ID (prefix ENST) without any version number suffix.
Use the canonical RNA sequence from GenCode to represent each gene in lncATLAS.
Evaluate the model by cross-validation on the entire dataset (no test subset withheld).

In [1]:
import time
from datetime import datetime
print(datetime.now())

2024-03-24 19:36:23.929517


In [2]:
import os
import copy
import random
import collections
import itertools
import numpy as np
import pandas as pd
import warnings
from sklearn import svm
import joblib
from sklearn.model_selection import train_test_split,RandomizedSearchCV
import lightgbm
print('Done')

Done


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
print('Done')

Done


In [4]:
DATA_DIR = './'    
MODEL_DIR = './'    
output_dir = './'
print('DATA DIR', DATA_DIR)

DATA DIR ./


In [5]:
SEED = 100
random.seed(SEED)
np.random.seed(SEED)
print(SEED)

100


In [6]:
def build_model():
    new_model = lightgbm.LGBMClassifier()
    return new_model

In [7]:
test = build_model()
print(test)
test = None

LGBMClassifier()


In [8]:
class stats_collector:
    def __init__(self):
        self.reset_statistics()
    def reset_statistics(self):
        self.cv_accuracy=[]
        self.cv_precision=[]
        self.cv_recall=[]
        self.cv_f1=[]
        self.cv_mcc=[]
        self.cv_auprc=[]
        self.cv_auroc=[]
    def _append_statistics(self,accuracy,precision,recall,f1,mcc,auprc,auroc):
        self.cv_accuracy.append(accuracy)
        self.cv_precision.append(precision)
        self.cv_recall.append(recall)
        self.cv_f1.append(f1)
        self.cv_mcc.append(mcc)
        self.cv_auprc.append(auprc)
        self.cv_auroc.append(auroc)
    def compute_performance(self,y_test,yhat_pred,yhat_classes,verbose=False):
        accuracy = accuracy_score(y_test, yhat_classes)*100.
        precision = precision_score(y_test, yhat_classes)*100.
        recall = recall_score(y_test, yhat_classes)*100.
        f1 = f1_score(y_test, yhat_classes)*100.
        mcc = matthews_corrcoef(y_test, yhat_classes)
        prc_Y, prc_X, prc_bins = precision_recall_curve(y_test, yhat_pred)
        auprc = auc(prc_X,prc_Y)*100.
        auroc = roc_auc_score(y_test, yhat_pred)*100.
        self._append_statistics(accuracy,precision,recall,f1,mcc,auprc,auroc)
        if verbose:
            self._show_confusion(y_test,yhat_pred,yhat_classes)
            self._show_statistics(accuracy,precision,recall,f1,mcc,auprc,auroc)
    def _show_confusion(self,y_test,yhat_pred,yhat_classes):
            print('Distrib of scores:',np.mean(yhat_pred),'mean',np.std(yhat_pred),'std')
            print('Range of scores:',np.min(yhat_pred),'to',np.max(yhat_pred))
            cm1 = confusion_matrix(y_test,yhat_classes)
            print('Confusion matrix\n',cm1)
            cm2 = confusion_matrix(y_test,yhat_classes,normalize='all')
            print('Normalized matrix\n',cm2)
    def _show_statistics(self,accuracy,precision,recall,f1,mcc,auprc,auroc):
            print('accuracy:',accuracy,'precision:',precision,'recall:',recall,\
                  'F1:',f1,'MCC:',mcc,'AUPRC:',auprc,'AUROC:',auroc)
    def _show_variance(self, name, stats_list):
        if name=='MCC':
            print('%10s %5.3f mean, %6.3f stdev' % (name,np.mean(stats_list),np.std(stats_list) ) )
        else:
            print('%10s %5.2f mean, %6.3f stdev' % (name,np.mean(stats_list),np.std(stats_list) ) )
        print(stats_list)
    def dump_all(self):
        self._show_variance('accuracy', self.cv_accuracy)
        self._show_variance('precision',self.cv_precision)
        self._show_variance('recall',   self.cv_recall)
        self._show_variance('F1',       self.cv_f1)
        self._show_variance('MCC',      self.cv_mcc)
        self._show_variance('AUPRC',    self.cv_auprc)
        self._show_variance('AUROC',    self.cv_auroc)

In [9]:
# https://stackoverflow.com/questions/2970520/string-count-with-overlapping-occurrences
# Count matching substrings including overlapping ones
def occurrences(string, sub):
    count = start = 0
    while True:
        start = string.find(sub, start) + 1
        if start > 0:
            count+=1
        else:
            return count

In [10]:
rna='AAAAA'
mer='AAA'
print(rna.count(mer))
print(occurrences('AAAAA','AAA'))

1
3


In [11]:
# From RNAlight notebook
def _count_kmer(Dataset,k): # k = 3,4,5

    # copy dataset
    dataset = copy.deepcopy(Dataset)
    # alphbet of nucleotide
    nucleotide = ['A','C','G','T']

    # generate k-mers
    #  k == 5:
    five = list(itertools.product(nucleotide,repeat=5))
    pentamer = []
    for n in five:
        pentamer.append("".join(n))

    #  k == 4:
    four = list(itertools.product(nucleotide,repeat=4))
    tetramer = []
    for n in four:
        tetramer.append("".join(n))

    # k == 3:
    three = list(itertools.product(nucleotide,repeat=3))
    threemer = []
    for n in three:
        threemer.append("".join(n))

    # input features can be combinations of diffrent k values
    if k == 34:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
    if k == 45:
        table_kmer = dict.fromkeys(tetramer,0)
        table_kmer.update(dict.fromkeys(pentamer,0))
    if k == 345:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
        table_kmer.update(dict.fromkeys(pentamer,0))

    # count k-mer for each sequence
    for mer in table_kmer.keys():
        #table_kmer[mer] = dataset["cdna"].apply(lambda x : x.count(mer))
        table_kmer[mer] = dataset["cdna"].apply(lambda x : occurrences(x,mer))

    # for k-mer raw count without normalization, index: nuc:1 or cyto:0
    rawcount_kmer_df = pd.DataFrame(table_kmer)
    df1_rawcount = pd.concat([rawcount_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    df1_rawcount.index = dataset["tag"]

    # for k-mer frequency with normalization , index: nuc:1 or cyto:0
    freq_kmer_df = rawcount_kmer_df.apply(lambda x: x/x.sum(),axis=1)
    df1 = pd.concat([freq_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    df1.index = dataset["tag"]

    return df1  # ,df1_rawcount

In [12]:
# From RNAlight notebook
def load_dataframe(cyto_f,nuc_f):
    print('load dataframe')
    dataset_cyto = pd.read_csv(cyto_f,sep='\t',index_col = False)    #1806
    dataset_nuc = pd.read_csv(nuc_f,sep='\t',index_col = False)    #1986
    print( len(dataset_cyto), 'cytoplasmic samples')
    print( len(dataset_nuc),  'nuclear samples')
    return dataset_cyto,dataset_nuc

In [13]:
# Added
def rebalance(dataset_cyto,dataset_nuc):
    print('sample down to balance classes')
    min_size = min(len(dataset_cyto),len(dataset_nuc))
    # random sampling without replacement
    dataset_cyto = dataset_cyto.sample(min_size, random_state=SEED)
    dataset_nuc  = dataset_nuc.sample(min_size,  random_state=SEED)
    print( len(dataset_cyto), 'cytoplasmic samples')
    print( len(dataset_nuc),  'nuclear samples')
    return dataset_cyto,dataset_nuc

In [15]:
# From RNAlight notebook
def extract_features_and_split(dataset_cyto,dataset_nuc):
    print('add labels, concatenate')
    # Set the tag of RCI(log2FC): nuclear 1 / cytosol 0
    dataset_nuc['tag'] = 1;dataset_cyto['tag'] = 0
    # merge the nuc and cyto dataset
    dataset = pd.concat([dataset_nuc,dataset_cyto]) # 3792

    print('dedupe (probably not necessary)')
    # remove duplications(actually,each lncRNA is unique in its class)
    dataset.drop_duplicates(keep="first",subset=["ensembl_transcript_id","name","cdna"],inplace=True) # 3792

    print('count kmers')
    # k = 3,4,5 count the normalized and raw count of kmer
    df_kmer_345 = _count_kmer(dataset,345)   # df_kmer_345,df_kmer_345_rawcount =

    # We commented this out. No need to save the tsv.
    # df_kmer_345.to_csv(os.path.join(output_dir,"df_kmer345_freq.tsv"),sep='\t')
    # This was commented out in the original. Seems they reran using saved kmers. Should test if file exists.
    # load kmer file
    # df_kmer_345 = pd.read_csv(os.path.join(output_dir,"df_kmer345_freq.tsv"),sep='\t',index_col= 0)

    # convert to x:kmer-freq , y:label
    del df_kmer_345['ensembl_transcript_id']
    x_kmer = df_kmer_345.values
    y_kmer = y_kmer = np.array(df_kmer_345.index)

    # split into training and test sets (9:1)
    #print('train/test split')
    #x_train, x_test, y_train, y_test = train_test_split(x_kmer, y_kmer, test_size = 0.1, random_state = SEED)

    # Use all the data
    print('Apply cross-validation to all the data (no test set withheld)')
    x_train, x_test, y_train, y_test = train_test_split(x_kmer, y_kmer, test_size = None, random_state = SEED)

    print('train set shape',x_train.shape)
    # Added
    labels,counts = np.unique(y_train,return_counts=True)
    print('train set labels', labels, 'counts',counts)
    labels,counts = np.unique(y_test,return_counts=True)
    print('test set labels', labels, 'counts',counts)
    return x_train, x_test, y_train, y_test

In [16]:
def do_cv(x_train, y_train):
    stats = stats_collector()
    for round in range(1,3):
        fold=0
        splitter = KFold(n_splits=5)
        for train_index, valid_index in splitter.split(x_train):
            fold += 1
            print('Round', round, 'Fold', fold)
            print('Num samples in train and valid sets:', len(train_index), len(valid_index))
            print('Train')
            lgb = build_model()
            history = lgb.fit(x_train[train_index], y_train[train_index])
            print('Validate')
            x_valid = x_train[valid_index]
            y_valid = y_train[valid_index]
            yhat_classes= lgb.predict(x_valid)  # get 0 or 1
            yhat_pairs=   lgb.predict_proba(x_valid)  # get [ prob of 0, prob of 1 ]
            yhat_pred=    [pair[1] for pair in yhat_pairs]
            stats.compute_performance(y_valid,yhat_pred,yhat_classes,verbose=False)
    return stats

# Cross-Validation

In [17]:
print(datetime.now())
print('Use our lncATLAS training set')
cyt_file  = DATA_DIR+'mean_RCI_positive.canonical.tsv'   
nuc_file  = DATA_DIR+'mean_RCI_negative.canonical.tsv'
print(cyt_file,'\n',nuc_file)
dataset_cyto,dataset_nuc = load_dataframe(cyt_file,nuc_file)
print(datetime.now())

2024-03-24 19:46:23.819294
Use our lncATLAS training set
./mean_RCI_positive.canonical.tsv 
 ./mean_RCI_negative.canonical.tsv
load dataframe
1701 cytoplasmic samples
2835 nuclear samples
2024-03-24 19:46:23.996776


In [18]:
if True:
    print(datetime.now())
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    print(datetime.now())
    stats = do_cv(x_train, y_train)
    print(datetime.now())
    print('\nCross validation results')
    stats.dump_all()

2024-03-24 19:46:24.612879
add labels, concatenate
dedupe (probably not necessary)
count kmers
Apply cross-validation to all the data (no test set withheld)
train set shape (3402, 1344)
train set labels [0 1] counts [1272 2130]
test set labels [0 1] counts [429 705]
2024-03-24 19:47:14.651765
Round 1 Fold 1
Num samples in train and valid sets: 2721 681
Train
[LightGBM] [Info] Number of positive: 1677, number of negative: 1044
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339187
[LightGBM] [Info] Number of data points in the train set: 2721, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.616318 -> initscore=0.473947
[LightGBM] [Info] Start training from score 0.473947
Validate
Round 1 Fold 2
Num samples in train and valid sets: 2721 681
Train
[LightGBM] [Info] Number of positive: 1715, number of negative: 1006
[

In [19]:
if True:
    print(datetime.now())
    print('Rebalance (sample down the majority class) and repeat')
    dataset_cyto,dataset_nuc = rebalance(dataset_cyto,dataset_nuc)
    print(datetime.now())
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    print(datetime.now())
    stats = do_cv(x_train, y_train)
    print(datetime.now())
    print('\nCross validation results')
    stats.dump_all()

2024-03-24 19:49:34.635057
Rebalance (sample down the majority class) and repeat
sample down to balance classes
1701 cytoplasmic samples
1701 nuclear samples
2024-03-24 19:49:34.668199
add labels, concatenate
dedupe (probably not necessary)
count kmers
Apply cross-validation to all the data (no test set withheld)
train set shape (2551, 1344)
train set labels [0 1] counts [1294 1257]
test set labels [0 1] counts [407 444]
2024-03-24 19:50:11.739582
Round 1 Fold 1
Num samples in train and valid sets: 2040 511
Train
[LightGBM] [Info] Number of positive: 1000, number of negative: 1040
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 332604
[LightGBM] [Info] Number of data points in the train set: 2040, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490196 -> initscore=-0.039221
[LightGBM] [Info] Start training from sc

In [20]:
print(datetime.now())
print('Done')

2024-03-24 19:52:09.986689
Done
