# LightGBM with RNAlight hyperparameters
lncRNA.   
CoLab L4.   
Model = LG2 (optimized parameters)    
In RNAlight, 1=nuclear.    

In [1]:
import time
from datetime import datetime
print(datetime.now())

2024-06-04 14:52:20.778399


In [2]:
import os
import copy
import random
import collections
import itertools
import numpy as np
import pandas as pd
#import lightgbm as lgb
import warnings
from sklearn import svm
#from sklearn.externals import joblib
import joblib
#from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
#import sklearn.metrics as metrics
# from sklearn.linear_model import LogisticRegression
import lightgbm
print('Done')

Done


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
print('Done')

Done


In [4]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/RNAlight/'  # must end in "/"
    MODEL_DIR=PATH+'My Drive/data/Localization/RNAlight/'  # must end in "/"
    output_dir=PATH+'My Drive/data/Localization/RNAlight/'
except:
    IN_COLAB = False
    DATA_DIR = './'    # Mac
    MODEL_DIR = './'    # Mac
    output_dir = './'
print('DATA DIR', DATA_DIR)

Running on CoLab
Mounted at /content/drive/
DATA DIR /content/drive/My Drive/data/Localization/RNAlight/


In [5]:
SEED = 100
random.seed(SEED)
np.random.seed(SEED)
print(SEED)

100


In [6]:
def build_model():
    new_model = lightgbm.LGBMClassifier(
        boosting_type='gbdt',
        class_weight=None,
        colsample_bytree=0.5,
        importance_type='split',
        learning_rate=0.01,
        max_depth=40,
        min_child_samples=9,
        min_child_weight=0.001,
        min_split_gain=0.0,
        n_estimators=2200,
        n_jobs=1,
        num_leaves=35,
        objective='binary',
        random_state=100,
        reg_alpha=0.005,
        reg_lambda=0,
        subsample=0.6,
        subsample_for_bin=200000,
        subsample_freq=1,
        metric='binary_logloss'
    )
    return new_model

In [7]:
test = build_model()
print(test)
test = None

LGBMClassifier(colsample_bytree=0.5, learning_rate=0.01, max_depth=40,
               metric='binary_logloss', min_child_samples=9, n_estimators=2200,
               n_jobs=1, num_leaves=35, objective='binary', random_state=100,
               reg_alpha=0.005, reg_lambda=0, subsample=0.6, subsample_freq=1)


In [8]:
class stats_collector:
    def __init__(self):
        self.reset_statistics()
    def reset_statistics(self):
        self.cv_accuracy=[]
        self.cv_precision=[]
        self.cv_recall=[]
        self.cv_f1=[]
        self.cv_mcc=[]
        self.cv_auprc=[]
        self.cv_auroc=[]
    def _append_statistics(self,accuracy,precision,recall,f1,mcc,auprc,auroc):
        self.cv_accuracy.append(accuracy)
        self.cv_precision.append(precision)
        self.cv_recall.append(recall)
        self.cv_f1.append(f1)
        self.cv_mcc.append(mcc)
        self.cv_auprc.append(auprc)
        self.cv_auroc.append(auroc)
    def compute_performance(self,y_test,yhat_pred,yhat_classes,verbose=False):
        accuracy = accuracy_score(y_test, yhat_classes)*100.
        precision = precision_score(y_test, yhat_classes)*100.
        recall = recall_score(y_test, yhat_classes)*100.
        f1 = f1_score(y_test, yhat_classes)*100.
        mcc = matthews_corrcoef(y_test, yhat_classes)
        prc_Y, prc_X, prc_bins = precision_recall_curve(y_test, yhat_pred)
        auprc = auc(prc_X,prc_Y)*100.
        auroc = roc_auc_score(y_test, yhat_pred)*100.
        self._append_statistics(accuracy,precision,recall,f1,mcc,auprc,auroc)
        if verbose:
            self._show_confusion(y_test,yhat_pred,yhat_classes)
            self._show_statistics(accuracy,precision,recall,f1,mcc,auprc,auroc)
    def _show_confusion(self,y_test,yhat_pred,yhat_classes):
            print('Distrib of scores:',np.mean(yhat_pred),'mean',np.std(yhat_pred),'std')
            print('Range of scores:',np.min(yhat_pred),'to',np.max(yhat_pred))
            cm1 = confusion_matrix(y_test,yhat_classes)
            print('Confusion matrix\n',cm1)
            cm2 = confusion_matrix(y_test,yhat_classes,normalize='all')
            print('Normalized matrix\n',cm2)
    def _show_statistics(self,accuracy,precision,recall,f1,mcc,auprc,auroc):
            print('accuracy:',accuracy,'precision:',precision,'recall:',recall,\
                  'F1:',f1,'MCC:',mcc,'AUPRC:',auprc,'AUROC:',auroc)
    def _show_variance(self, name, stats_list):
        if name=='MCC':
            print('%10s %5.3f mean, %6.3f stdev' % (name,np.mean(stats_list),np.std(stats_list) ) )
        else:
            print('%10s %5.2f mean, %6.3f stdev' % (name,np.mean(stats_list),np.std(stats_list) ) )
        print(stats_list)
    def dump_all(self):
        self._show_variance('accuracy', self.cv_accuracy)
        self._show_variance('precision',self.cv_precision)
        self._show_variance('recall',   self.cv_recall)
        self._show_variance('F1',       self.cv_f1)
        self._show_variance('MCC',      self.cv_mcc)
        self._show_variance('AUPRC',    self.cv_auprc)
        self._show_variance('AUROC',    self.cv_auroc)

In [9]:
# From RNAlight notebook
def _count_kmer(Dataset,k): # k = 3,4,5

    # copy dataset
    dataset = copy.deepcopy(Dataset)
    # alphbet of nucleotide
    nucleotide = ['A','C','G','T']

    # generate k-mers
    #  k == 5:
    five = list(itertools.product(nucleotide,repeat=5))
    pentamer = []
    for n in five:
        pentamer.append("".join(n))

    #  k == 4:
    four = list(itertools.product(nucleotide,repeat=4))
    tetramer = []
    for n in four:
        tetramer.append("".join(n))

    # k == 3:
    three = list(itertools.product(nucleotide,repeat=3))
    threemer = []
    for n in three:
        threemer.append("".join(n))

    # input features can be combinations of diffrent k values
    if k == 34:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
    if k == 45:
        table_kmer = dict.fromkeys(tetramer,0)
        table_kmer.update(dict.fromkeys(pentamer,0))
    if k == 345:
        table_kmer = dict.fromkeys(threemer,0)
        table_kmer.update(dict.fromkeys(tetramer,0))
        table_kmer.update(dict.fromkeys(pentamer,0))

    # count k-mer for each sequence
    for mer in table_kmer.keys():
        table_kmer[mer] = dataset["cdna"].apply(lambda x : x.count(mer))

    # for k-mer raw count without normalization, index: nuc:1 or cyto:0
    rawcount_kmer_df = pd.DataFrame(table_kmer)
    df1_rawcount = pd.concat([rawcount_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    df1_rawcount.index = dataset["tag"]

    # for k-mer frequency with normalization , index: nuc:1 or cyto:0
    freq_kmer_df = rawcount_kmer_df.apply(lambda x: x/x.sum(),axis=1)
    df1 = pd.concat([freq_kmer_df,dataset["ensembl_transcript_id"]],axis = 1)
    df1.index = dataset["tag"]

    return df1  # ,df1_rawcount

In [10]:
# From RNAlight notebook
def load_dataframe(cyto_f,nuc_f):
    print('load dataframe')
    dataset_cyto = pd.read_csv(cyto_f,sep='\t',index_col = False)    #1806
    dataset_nuc = pd.read_csv(nuc_f,sep='\t',index_col = False)    #1986
    print( len(dataset_cyto), 'cytoplasmic samples')
    print( len(dataset_nuc),  'nuclear samples')
    return dataset_cyto,dataset_nuc

In [11]:
# Added
def rebalance(dataset_cyto,dataset_nuc):
    print('sample down to balance classes')
    min_size = min(len(dataset_cyto),len(dataset_nuc))
    # random sampling without replacement
    dataset_cyto = dataset_cyto.sample(min_size, random_state=SEED)
    dataset_nuc  = dataset_nuc.sample(min_size,  random_state=SEED)
    print( len(dataset_cyto), 'cytoplasmic samples')
    print( len(dataset_nuc),  'nuclear samples')
    return dataset_cyto,dataset_nuc

In [12]:
# From RNAlight notebook
def extract_features_and_split(dataset_cyto,dataset_nuc):
    print('add labels, concatenate')
    # Set the tag of RCI(log2FC): nuclear 1 / cytosol 0
    dataset_nuc['tag'] = 1;dataset_cyto['tag'] = 0
    # merge the nuc and cyto dataset
    dataset = pd.concat([dataset_nuc,dataset_cyto]) # 3792

    print('dedupe (probably not necessary)')
    # remove duplications(actually,each lncRNA is unique in its class)
    dataset.drop_duplicates(keep="first",subset=["ensembl_transcript_id","name","cdna"],inplace=True) # 3792

    print('count kmers')
    # k = 3,4,5 count the normalized and raw count of kmer
    df_kmer_345 = _count_kmer(dataset,345)   # df_kmer_345,df_kmer_345_rawcount =

    # We commented this out. No need to save the tsv.
    # df_kmer_345.to_csv(os.path.join(output_dir,"df_kmer345_freq.tsv"),sep='\t')
    # This was commented out in the original. Seems they reran using saved kmers. Should test if file exists.
    # load kmer file
    # df_kmer_345 = pd.read_csv(os.path.join(output_dir,"df_kmer345_freq.tsv"),sep='\t',index_col= 0)

    # convert to x:kmer-freq , y:label
    del df_kmer_345['ensembl_transcript_id']
    x_kmer = df_kmer_345.values
    y_kmer = y_kmer = np.array(df_kmer_345.index)

    # split into training and test sets (9:1)
    print('train/test split')
    x_train, x_test, y_train, y_test = train_test_split(x_kmer, y_kmer, test_size = 0.1, random_state = SEED)

    #print('Apply cross-validation to all the data (no test set withheld)')
    #x_train, x_test, y_train, y_test = train_test_split(x_kmer, y_kmer, test_size = None, random_state = SEED)
    print('train set shape',x_train.shape)
    # Added
    labels,counts = np.unique(y_train,return_counts=True)
    print('train set labels', labels, 'counts',counts)
    labels,counts = np.unique(y_test,return_counts=True)
    print('test set labels', labels, 'counts',counts)
    return x_train, x_test, y_train, y_test

In [13]:
def do_cv(x_train, y_train):
    stats = stats_collector()
    for round in range(1,3):
        fold=0
        splitter = KFold(n_splits=5)
        for train_index, valid_index in splitter.split(x_train):
            fold += 1
            print('Round', round, 'Fold', fold)
            print('Num samples in train and valid sets:', len(train_index), len(valid_index))
            print('Train')
            lgb = build_model()
            history = lgb.fit(x_train[train_index], y_train[train_index])
            print('Validate')
            x_valid = x_train[valid_index]
            y_valid = y_train[valid_index]
            yhat_classes= lgb.predict(x_valid)  # get 0 or 1
            yhat_pairs=   lgb.predict_proba(x_valid)  # get [ prob of 0, prob of 1 ]
            yhat_pred=    [pair[1] for pair in yhat_pairs]
            stats.compute_performance(y_valid,yhat_pred,yhat_classes,verbose=False)
    return stats

In [14]:
if True:
    print('Use the RNAlight training set (already has middle excluded)')
    cyto_f = DATA_DIR+'02_lncRNA_info_cyto_transcript.tsv'
    nuc_f  = DATA_DIR+'02_lncRNA_info_nuc_transcript.tsv'
    print('Data files:')
    print(cyto_f,'\n',nuc_f)
    dataset_cyto,dataset_nuc = load_dataframe(cyto_f,nuc_f)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

Use the RNAlight training set (already has middle excluded)
Data files:
/content/drive/My Drive/data/Localization/RNAlight/02_lncRNA_info_cyto_transcript.tsv 
 /content/drive/My Drive/data/Localization/RNAlight/02_lncRNA_info_nuc_transcript.tsv
load dataframe
1806 cytoplasmic samples
1986 nuclear samples
add labels, concatenate
dedupe (probably not necessary)
count kmers
train/test split
train set shape (3412, 1344)
train set labels [0 1] counts [1622 1790]
test set labels [0 1] counts [184 196]
Round 1 Fold 1
Num samples in train and valid sets: 2729 683
Train
[LightGBM] [Info] Number of positive: 1419, number of negative: 1310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.086401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335916
[LightGBM] [Info] Number of data points in the train set: 2729, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.519971 -> initscore=0

In [15]:
if True:
    print('Rebalance (sample down the majority class) and repeat')
    dataset_cyto,dataset_nuc = rebalance(dataset_cyto,dataset_nuc)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

Rebalance (sample down the majority class) and repeat
sample down to balance classes
1806 cytoplasmic samples
1806 nuclear samples
add labels, concatenate
dedupe (probably not necessary)
count kmers
train/test split
train set shape (3250, 1344)
train set labels [0 1] counts [1632 1618]
test set labels [0 1] counts [174 188]
Round 1 Fold 1
Num samples in train and valid sets: 2600 650
Train
[LightGBM] [Info] Number of positive: 1286, number of negative: 1314
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334586
[LightGBM] [Info] Number of data points in the train set: 2600, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494615 -> initscore=-0.021539
[LightGBM] [Info] Start training from score -0.021539
Validate
Round 1 Fold 2
Num samples in train and valid sets: 2600 650
Train
[LightGBM] [Info] Number of positive

In [16]:
if True:
    print('Use our lncATLAS training set "middle exclusion" (CN-RCI < -2 or CN-RCI > 0)')
    cyto_f = DATA_DIR+'ForRNAlight.lncRNA_RCIgt0.canonical.tsv'
    nuc_f  = DATA_DIR+'ForRNAlight.lncRNA_RCIlt-2.canonical.tsv'
    print('Data files:')
    print(cyto_f,'\n',nuc_f)
    dataset_cyto,dataset_nuc = load_dataframe(cyto_f,nuc_f)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

Use our lncATLAS training set "middle exclusion" (CN-RCI < -2 or CN-RCI > 0)
Data files:
/content/drive/My Drive/data/Localization/RNAlight/ForRNAlight.lncRNA_RCIgt0.canonical.tsv 
 /content/drive/My Drive/data/Localization/RNAlight/ForRNAlight.lncRNA_RCIlt-2.canonical.tsv
load dataframe
1703 cytoplasmic samples
706 nuclear samples
add labels, concatenate
dedupe (probably not necessary)
count kmers
train/test split
train set shape (2168, 1344)
train set labels [0 1] counts [1524  644]
test set labels [0 1] counts [179  62]
Round 1 Fold 1
Num samples in train and valid sets: 1734 434
Train
[LightGBM] [Info] Number of positive: 491, number of negative: 1243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326857
[LightGBM] [Info] Number of data points in the train set: 1734, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: p

In [17]:
if True:   # just to be sure
    print('Rebalance (sample down the majority class) and repeat')
    dataset_cyto,dataset_nuc = rebalance(dataset_cyto,dataset_nuc)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Validate
Round 2 Fold 1
Num samples in train and valid sets: 1016 254
Train
[LightGBM] [Info] Number of positive: 500, number of negative: 516
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 281969
[LightGBM] [Info] Number of data points in the train set: 1016, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492126 -> initscore=-0.031499
[LightGBM] [Info] Start training from score -0.031499
Validate
Round 2 Fold 2
Num samples in train and valid sets: 1016 254
Train
[LightGBM] [Info] Number of positive: 508, number of negative: 508
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 284020
[LightGBM] [Info] Number 

In [18]:
if True:
    print('Use our lncATLAS training set "all" (-inf <= CN-RCI <= +inf)')
    cyto_f = DATA_DIR+'ForRNAlight.lncRNA_RCIgt-1.canonical.tsv'
    nuc_f  = DATA_DIR+'ForRNAlight.lncRNA_RCIlt-1.canonical.tsv'
    print(cyto_f,'\n',nuc_f)
    dataset_cyto,dataset_nuc = load_dataframe(cyto_f,nuc_f)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

Use our lncATLAS training set "all" (-inf <= CN-RCI <= +inf)
/content/drive/My Drive/data/Localization/RNAlight/ForRNAlight.lncRNA_RCIgt-1.canonical.tsv 
 /content/drive/My Drive/data/Localization/RNAlight/ForRNAlight.lncRNA_RCIlt-1.canonical.tsv
load dataframe
2887 cytoplasmic samples
1548 nuclear samples
add labels, concatenate
dedupe (probably not necessary)
count kmers
train/test split
train set shape (3991, 1344)
train set labels [0 1] counts [2606 1385]
test set labels [0 1] counts [281 163]
Round 1 Fold 1
Num samples in train and valid sets: 3192 799
Train
[LightGBM] [Info] Number of positive: 1102, number of negative: 2090
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 340977
[LightGBM] [Info] Number of data points in the train set: 3192, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.345238 -> initscore

In [19]:
if True:
    print('Rebalance (sample down the majority class) and repeat')
    dataset_cyto,dataset_nuc = rebalance(dataset_cyto,dataset_nuc)
    x_train, x_test, y_train, y_test = extract_features_and_split(dataset_cyto,dataset_nuc)
    stats = do_cv(x_train, y_train)
    print('\nCross validation results')
    stats.dump_all()

Rebalance (sample down the majority class) and repeat
sample down to balance classes
1548 cytoplasmic samples
1548 nuclear samples
add labels, concatenate
dedupe (probably not necessary)
count kmers
train/test split
train set shape (2786, 1344)
train set labels [0 1] counts [1397 1389]
test set labels [0 1] counts [151 159]
Round 1 Fold 1
Num samples in train and valid sets: 2228 558
Train
[LightGBM] [Info] Number of positive: 1096, number of negative: 1132
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335379
[LightGBM] [Info] Number of data points in the train set: 2228, number of used features: 1344
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491921 -> initscore=-0.032319
[LightGBM] [Info] Start training from score -0.032319
Validate
Round 1 Fold 2
Num samples in train and valid sets: 2229 557
Train
[LightGBM] [Info] Number of positive

In [20]:
print(datetime.now())
print('Done')

2024-06-04 19:00:25.074157
Done
