# Analyze lncATLAS CNRCI values
Count genes per cell line.   
Count genes given several thresholds.   
Meausure correlations between cell lines.  

In [1]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2024-03-15 12:52:24.895045
Python 3.11.6
sklearn 1.3.2


In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/MiddleExclusion/'  # must end in "/"
except:
    print('Running on desktop')
    IN_COLAB = False
    DATA_DIR = './'  
print(DATA_DIR)

Running on desktop
./


In [3]:
RCI_FILE = 'lncATLAS_all_data_RCI.csv'
FILEPATH = DATA_DIR + RCI_FILE
RATIO_TYPE = 'CNRCI'  # ignore other compartmental ratios, keep cyto-to-nuc
CELL_LINES  = 15
EXCLUDE = []

## CNRCI values per gene
We can only measure correlations on genes with CNRCI measured in multiple cell lines.   
To explore the data, we build a table of number of genes with n or more CNRCI values, for n = 0, 1, 2, 3, ...

In [4]:
def values_per_gene(coding=True):
    '''
    Parse the data file downloaded from lncATLAS.
    Each line specifies gene, gene type, cell line, and RCI.
    Each RCI is a ratio of abundances between compartments.
    Filter for just one RCI type, typically CNRCI i.e. cytoplasmic-to-nuclear RCI.
    For each gene, count the CNRCI values, excluding "NA" (not available).
    Assume:
    - FILEPATH names the csv file obtained from lncATLAS.
    - RATIO_TYPE is set to 'CNRCI'.
    - Each line for same gene represents a different cell line.
    - Each line includes type 'coding' or 'nc' to indicate the gene type.
    Return a mapping of gene ID to count.
    '''
    all_counts=dict()
    if coding:
        coding_type = 'coding'
    else:
        coding_type = 'nc'
    with open (FILEPATH,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
                continue
            count_rci = 0
            fields = row.strip().split(',')
            if fields[5]!=coding_type or fields[2]!=RATIO_TYPE or fields[3]=='NA':
                continue
            gene_id = fields[0]
            cell_line = fields[1]
            cnrci = float(fields[3])
            if gene_id not in all_counts:
                all_counts[gene_id] = 0
            all_counts[gene_id] += 1
    return all_counts

In [5]:
nc_counts = values_per_gene(coding=False)
print('Long non-coding genes with a CNRCI value in at least one cell line:',len(nc_counts.keys()))

Long non-coding genes with a CNRCI value in at least one cell line: 6768


In [6]:
def analyze_values_per_gene(all_counts):
    '''
    Input: a mapping of gene ID to count.
    Prints a table of genes with at least n values, for n = 0, 1, 2, 3, ...
    '''
    all_values = list(all_counts.values())
    all_values = np.array(all_values)
    print(len(all_values), 'genes examined')
    print(np.mean(all_values), 'average number of CNRCI for one gene')
    print(min(all_values), max(all_values), 'min and max CNRCI for any gene')
    print()
    print('Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI')
    for threshold in range(CELL_LINES+1):
        equal = np.count_nonzero(all_values==threshold)
        more  = np.count_nonzero(all_values>threshold)
        less  = np.count_nonzero(all_values<threshold)
        print("%2d %6d %6d %6d" % (threshold, equal, more, less))

In [7]:
print('Long non-coding genes per number of cell lines.')
analyze_values_per_gene(nc_counts)

Long non-coding genes per number of cell lines.
6768 genes examined
4.169178486997636 average number of CNRCI for one gene
1 15 min and max CNRCI for any gene

Threshold T, Genes with T CNRCI, Genes with more CNRCI, Genes with fewer CNRCI
 0      0   6768      0
 1   2113   4655      0
 2   1037   3618   2113
 3    707   2911   3150
 4    523   2388   3857
 5    447   1941   4380
 6    357   1584   4827
 7    311   1273   5184
 8    274    999   5495
 9    216    783   5769
10    233    550   5985
11    172    378   6218
12    152    226   6390
13    114    112   6542
14     81     31   6656
15     31      0   6737


## Values per cell line
We would not trust correlations if each cell line had just a few gene CNRCI values.     
To explore the data, count gene CNRCI values per cell line.

In [8]:
def values_per_cell_line(coding=True):
    '''
    Parse the data file downloaded from lncATLAS.
    Each line specifies gene, gene type, cell line, and RCI.
    Each RCI is a ratio of abundances between compartments.
    Filter for just one RCI type, typically CNRCI i.e. cytoplasmic-to-nuclear RCI.
    For each cell line, count the CNRCI values, excluding "NA" (not available).
    Assume:
    - FILEPATH names the csv file obtained from lncATLAS.
    - RATIO_TYPE is set to 'CNRCI'.
    - Each line for same cell line represents a different gene.
    - Each line includes type 'coding' or 'nc' to indicate the gene type.
    Return a mapping of cell line name to lists of CNRCI values.    
    '''
    all_counts=dict()
    if coding:
        coding_type = 'coding'
    else:
        coding_type = 'nc'
    with open (FILEPATH,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
                continue
            count_rci = 0
            fields = row.strip().split(',')
            if fields[5]!=coding_type or fields[2]!=RATIO_TYPE or fields[3]=='NA':
                continue
            gene_id = fields[0]
            cell_line = fields[1]
            cnrci = float(fields[3])
            if cell_line not in all_counts:
                all_counts[cell_line] = []
            all_counts[cell_line].append(cnrci)
    return all_counts

In [9]:
nc_lines = values_per_cell_line(coding=False)
print('Cell lines with at least one long non-coding CNRCI value:', len(nc_lines))

Cell lines with at least one long non-coding CNRCI value: 15


In [10]:
def analyze_values_per_cell_line(all_lines):
    '''
    Input: a mapping of cell line to count.
    Prints a table of cell lines and their CNRCI statistics.
    '''
    print('Cell line, count, mean, stdev, min, max')
    for cell_line in all_lines.keys():
        list_of_cnrci = all_lines[cell_line]
        ary = np.array(list_of_cnrci)
        print ("{0:10s} {1:5d} {2:5.2f} {3:5.2f} {4:5.2f} {5:5.2f}".format( 
            cell_line, len(ary), np.mean(ary), np.std(ary), np.min(ary), np.max(ary)))

In [11]:
print('Long non-coding CNRCI values per cell line')
analyze_values_per_cell_line(nc_lines)

Long non-coding CNRCI values per cell line
Cell line, count, mean, stdev, min, max
MCF.7       3114 -1.60  2.18 -9.18  4.17
A549        2129 -0.56  1.64 -5.60  4.49
GM12878     2511 -1.19  1.62 -8.32  3.36
H1.hESC     4923 -0.46  1.56 -6.85  5.58
HeLa.S3     1317 -1.57  1.88 -7.57  4.15
HepG2       2014 -1.45  1.98 -7.57  3.65
HT1080      1361 -0.61  1.86 -7.74  4.25
HUVEC       2214 -1.43  2.06 -8.23  3.64
IMR.90       582 -0.84  2.17 -7.89  3.69
K562        1402 -1.01  1.68 -6.67  4.36
NCI.H460     930 -1.53  1.89 -7.61  4.58
NHEK        1588 -1.24  1.99 -8.40  3.47
SK.MEL.5     808 -1.80  2.21 -10.26  3.97
SK.N.DZ      907 -0.62  1.57 -8.33  3.70
SK.N.SH     2417 -1.40  2.14 -9.54  4.05


## Genes between thresholds
Prior studies applied a middle excludion filter to lncATLAS data.   
That is, they excluded genes whose CNRCI was borderline.  
To explore the filter's effects, count genes per cell line after filtering.    
Explore the lncLocator2 filter (excludes CNRCI between -1 and +1) applied to each gene in each cell line.      
Also explore the RNAlight filter (excludes CNRCI betwween -2 and 0) applied to each gene's mean across cell lines.   

In [12]:
# LncLocator2 style threshold
def count_genes_between_thresholds(all_lines,low,high):
    '''
    Input: a mapping of cell line name to a list of CNRCI values.
    Filter: count genes with CNRCI value between low and high parameter values.
    Result: Print a table of gene count statistics per cell line.  
    '''
    print("Cell line, total, low, middle, high, mid%")
    for cell_line in all_lines.keys():
        list_of_cnrci = all_lines[cell_line]
        ary = np.array(list_of_cnrci)
        total = len(ary)
        above_P1 = np.count_nonzero(ary > high)
        below_N1 = np.count_nonzero(ary < low)
        middle = total - (above_P1+below_N1)
        portion = middle/total
        print ("{0:10s} {1:5d} {2:5d} {3:5d} {4:5d} {5:5.2f}".format( 
            cell_line, total, above_P1, middle, below_N1, portion))

In [13]:
count_genes_between_thresholds(nc_lines,-1,1)

Cell line, total, low, middle, high, mid%
MCF.7       3114   365   931  1818  0.30
A549        2129   360   997   772  0.47
GM12878     2511   175  1042  1294  0.41
H1.hESC     4923   750  2476  1697  0.50
HeLa.S3     1317   126   385   806  0.29
HepG2       2014   199   702  1113  0.35
HT1080      1361   266   583   512  0.43
HUVEC       2214   264   748  1202  0.34
IMR.90       582   118   222   242  0.38
K562        1402   145   610   647  0.44
NCI.H460     930    65   301   564  0.32
NHEK        1588   185   620   783  0.39
SK.MEL.5     808    63   250   495  0.31
SK.N.DZ      907   118   473   316  0.52
SK.N.SH     2417   319   793  1305  0.33


In [14]:
# RNAlight style threshold
def RNAlight_1(exclude):
    '''
    This function is similar to values_per_gene() above,
    but instead of returning one count per gene,
    return a list of values per gene.
    '''
    values_per_gene = dict()
    coding_type = 'nc'
    with open (FILEPATH,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
                continue
            count_rci = 0
            fields = row.strip().split(',')
            if fields[5]!=coding_type or fields[2]!=RATIO_TYPE or fields[3]=='NA' or fields[1] in exclude:
                continue
            gene_id = fields[0]
            cell_line = fields[1]
            cnrci = float(fields[3])
            if gene_id not in values_per_gene:
                values_per_gene[gene_id] = []
            values_per_gene[gene_id].append(cnrci)
    return values_per_gene 
def RNAlight_2(values_per_gene):
    '''
    Given a map of gene ID to CNRCI values,
    return a map of gene ID to its mean CNRCI.
    '''
    mean_per_gene = dict()
    for gene in values_per_gene.keys():
        values = values_per_gene[gene]
        mean_per_gene[gene] = np.mean(values)
    return mean_per_gene
def RNAlight_3(mean_per_gene):
    '''
    Given a map of gene ID to its mean CNRCI,
    return counts of genes above, below, and in the middle range.
    '''
    means = np.array(list(mean_per_gene.values()))
    total = len(means)
    above = np.count_nonzero(means > 0)
    below = np.count_nonzero(means < -2)
    middle = total - (above+below)
    return above, middle, below

In [15]:
print("RNAlight treatment of long non-coding RNA genes")
excluded = 'H1.hESC'
print("Excluding this cell line:", excluded)
values_per_gene = RNAlight_1(excluded)
mean_per_gene = RNAlight_2(values_per_gene)
above, middle, below = RNAlight_3(mean_per_gene)
total=above+middle+below
print("Cytoplasmic genes", above)
print("Nuclear genes", below)
print("Excluded middle", middle)
print("Middle portion {0:5.2f}%".format(100.0*middle/total))

RNAlight treatment of long non-coding RNA genes
Excluding this cell line: H1.hESC
Cytoplasmic genes 1525
Nuclear genes 1983
Excluded middle 2252
Middle portion 39.10%


## Correlations
Define functions to compute correlations.   
Finally, compute the correlations.    

In [21]:
def get_all_data(coding=True,filtered=False):
    '''
    This function is similar to values_per_gene() above,
    but instead of returning one count per gene,
    return a map of maps: 
    for each cell line, map each gene to one CNRCI value.
    '''
    cell_lines=dict()
    if coding:
        coding_type = 'coding'
    else:
        coding_type = 'nc'
    with open (FILEPATH,'r') as handle:
        header = None
        for row in handle:
            if header is None:
                header = row 
                continue
            count_rci = 0
            fields = row.strip().split(',')
            if fields[5]!=coding_type or fields[2]!=RATIO_TYPE or fields[3]=='NA':
                continue
            gene_id = fields[0]
            cell_line = fields[1]
            cnrci = float(fields[3])
            if filtered and cnrci >= -2 and cnrci <= 0:
                continue
            if cell_line not in cell_lines:
                cell_lines[cell_line] = dict()
            cell_lines[cell_line][gene_id]=cnrci
    return cell_lines

In [22]:
all_data = get_all_data(coding=False)

In [35]:
def correlation_matrix(cell_lines):
    '''
    Compute and print the matrix of all-vs-all pairwise correlations.
    Show the Pearson correlation coefficients.
    Compute correlations only on genes in common between each cell line pair.
    '''
    sorted_lines = sorted(list(cell_lines.keys()))
    print('Line',end=',')
    for key in sorted_lines:
        print(key,end=',')
    print()
    for row_key in sorted_lines:
        print('%10s' % row_key, end=',')
        row_gene_dict = cell_lines[row_key]
        row_genes = set(row_gene_dict.keys())
        row_avg = []
        for col_key in sorted_lines:
            col_gene_dict = cell_lines[col_key]
            col_genes = set(col_gene_dict.keys())
            common_genes = row_genes.intersection(col_genes)
            row_values = np.array([row_gene_dict[gene] for gene in common_genes])
            col_values = np.array([col_gene_dict[gene] for gene in common_genes])
            r,p = ss.pearsonr(row_values,col_values)
            print('%.2f' % r, end=',')
            if row_key != col_key:
                row_avg.append(r)
        print(' avg=%.2f'%np.mean(row_avg))

### All-vs-all correlation of non-coding genes
Finally, run the code above to create tables of all pairwise correlations.    
Each cell line has CNRCI values for a subset of genes.
Correlation is measured only on the genes in common between two cell lines.       
High correlation means the cell lines that share genes have agreeing CNRCI for those genes.   
Look for any cell lines that have low correlation with the others.    
First, show a table for the raw data.     
Second, show it again but filter CNRCI values by middle exclusion with RNAlight thresholds.

In [38]:
print('CNRCI correlations between cell lines, no filtering.')
sparse_matrix = get_all_data()
correlation_matrix(sparse_matrix)

CNRCI correlations between cell lines, no filtering.
Line,A549,GM12878,H1.hESC,HT1080,HUVEC,HeLa.S3,HepG2,IMR.90,K562,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,
      A549,1.00,0.71,0.73,0.74,0.72,0.82,0.78,0.78,0.69,0.76,0.56,0.79,0.56,0.66,0.86, avg=0.73
   GM12878,0.71,1.00,0.58,0.70,0.80,0.83,0.84,0.78,0.89,0.77,0.55,0.68,0.70,0.76,0.72, avg=0.74
   H1.hESC,0.73,0.58,1.00,0.51,0.55,0.71,0.68,0.63,0.65,0.58,0.38,0.59,0.38,0.55,0.72, avg=0.59
    HT1080,0.74,0.70,0.51,1.00,0.81,0.76,0.73,0.78,0.75,0.76,0.59,0.83,0.74,0.66,0.73, avg=0.72
     HUVEC,0.72,0.80,0.55,0.81,1.00,0.80,0.83,0.91,0.85,0.86,0.58,0.78,0.81,0.73,0.78, avg=0.77
   HeLa.S3,0.82,0.83,0.71,0.76,0.80,1.00,0.87,0.80,0.85,0.81,0.59,0.79,0.72,0.74,0.84, avg=0.78
     HepG2,0.78,0.84,0.68,0.73,0.83,0.87,1.00,0.83,0.86,0.83,0.55,0.74,0.72,0.77,0.84, avg=0.78
    IMR.90,0.78,0.78,0.63,0.78,0.91,0.80,0.83,1.00,0.82,0.86,0.53,0.74,0.82,0.74,0.87, avg=0.78
      K562,0.69,0.89,0.65,0.75,0.85,0.85,0.86,0.82,1.00,0.81,0.56,0.

In [39]:
print('CNRCI correlations between cell lines, filtered similar to RNAlight.')
filtered_matrix = get_all_data(filtered=True)
correlation_matrix(filtered_matrix)

CNRCI correlations between cell lines, filtered similar to RNAlight.
Line,A549,GM12878,H1.hESC,HT1080,HUVEC,HeLa.S3,HepG2,IMR.90,K562,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,
      A549,1.00,0.79,0.79,0.81,0.81,0.87,0.84,0.86,0.77,0.84,0.74,0.86,0.67,0.74,0.91, avg=0.81
   GM12878,0.79,1.00,0.66,0.79,0.88,0.89,0.91,0.85,0.93,0.86,0.73,0.80,0.80,0.84,0.83, avg=0.82
   H1.hESC,0.79,0.66,1.00,0.57,0.62,0.75,0.73,0.70,0.72,0.65,0.53,0.65,0.43,0.61,0.79, avg=0.66
    HT1080,0.81,0.79,0.57,1.00,0.88,0.84,0.81,0.85,0.82,0.83,0.75,0.90,0.81,0.71,0.84, avg=0.80
     HUVEC,0.81,0.88,0.62,0.88,1.00,0.89,0.90,0.95,0.90,0.91,0.76,0.89,0.89,0.80,0.89, avg=0.86
   HeLa.S3,0.87,0.89,0.75,0.84,0.89,1.00,0.93,0.87,0.91,0.89,0.73,0.87,0.82,0.80,0.91, avg=0.86
     HepG2,0.84,0.91,0.73,0.81,0.90,0.93,1.00,0.90,0.92,0.90,0.71,0.85,0.82,0.83,0.91, avg=0.85
    IMR.90,0.86,0.85,0.70,0.85,0.95,0.87,0.90,1.00,0.88,0.91,0.72,0.87,0.89,0.80,0.93, avg=0.85
      K562,0.77,0.93,0.72,0.82,0.90,0.91,0.92,0.88,1