# Analyze RCI correlations between cell lines

This notebook is derived from one called AnalyzeData_105.
Use lncATLAS data for all cell lines and all genes (without partitioning into separate train/test sets).
Use antilog for computing the mean per gene.

In [1]:
# Print a timestamp and the interpreter version so the saved outputs record
# when and under which Python this notebook was run.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
# Numerical, statistics and plotting libraries.
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib import pyplot as plt 
# Imported here so that its version can be printed alongside the Python version.
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2024-06-04 13:15:03.773105
Python 3.11.6
sklearn 1.3.2


In [2]:
# Decide where the data lives.  Only a failed import of google.colab means
# "not running on Colab"; any other failure (for example Drive refusing to
# mount) is allowed to surface instead of silently falling back to './'.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    DATA_DIR = './'    # Mac
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/'  # must end in "/"
print(DATA_DIR)

./


In [3]:
# Name of the lncATLAS RCI table and its full location under DATA_DIR
# (DATA_DIR already ends in "/", so the two parts are simply joined).
RCI_FILE = 'lncATLAS_all_data_RCI.csv'
filepath = f'{DATA_DIR}{RCI_FILE}'

## Cell Lines

In [4]:
def load_cell_lines(filepath):
    """Return the sorted, unique cell line names found in an lncATLAS CSV file.

    The first line of the file is treated as a header and skipped.  On every
    other non-blank line the second comma-separated field is taken as the
    cell line name.

    Parameters
    ----------
    filepath : str
        Path of the comma-separated file to scan.

    Returns
    -------
    list of str
        The distinct cell line names in ascending order; empty when the file
        has no data rows.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than two comma-separated fields.
    """
    names = set()  # set membership keeps the scan linear in the number of rows
    with open(filepath, 'r') as handle:
        next(handle, None)  # skip the header line (no-op for an empty file)
        for row in handle:
            line = row.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            fields = line.split(',')
            names.add(fields[1])
    return sorted(names)

In [5]:
# Read from `filepath` (DATA_DIR + RCI_FILE) rather than the bare file name,
# so the lookup also works when the data is not in the current directory
# (for example on Colab, where DATA_DIR points into Google Drive).
CELL_LINES = load_cell_lines(filepath)
num_cell_lines = len(CELL_LINES)
print(CELL_LINES, num_cell_lines, "total")

['A549', 'GM12878', 'H1.hESC', 'HT1080', 'HUVEC', 'HeLa.S3', 'HepG2', 'IMR.90', 'K562', 'MCF.7', 'NCI.H460', 'NHEK', 'SK.MEL.5', 'SK.N.DZ', 'SK.N.SH'] 15 total


## Correlations

The lncATLAS file is expected to be comma-separated with a header row, like this (rows whose Value is `NA` are skipped):    
ENSEMBL ID,Data Source,Data Type,Value,Gene Name,Coding Type,Biotype     
ENSG00000000003,A549,CNRCI,1.08068,TSPAN6,coding,coding     
ENSG00000283125,SK.N.SH,CNRCI,NA,RP11-299P2.2,nc,nc     

In [6]:
def load_RCI_data(filepath,coding_choice,cell_lines=None):
    """Load CNRCI values from an lncATLAS CSV file, one dict per cell line.

    The first line of the file is treated as a header.  A data row is kept
    only if its Coding Type (6th field) equals `coding_choice`, its Data Type
    (3rd field) is 'CNRCI', and its Value (4th field) is not 'NA'.

    Parameters
    ----------
    filepath : str
        Path of the comma-separated lncATLAS file.
    coding_choice : str
        Either 'coding' or 'nc'.
    cell_lines : list of str, optional
        Cell line names that define the order of the returned list.
        Defaults to the notebook-level CELL_LINES.

    Returns
    -------
    list of dict
        One dict per entry of `cell_lines`, in the same order, mapping a
        gene id (1st field) to its RCI value as a float.

    Raises
    ------
    ValueError
        If `coding_choice` is not recognized, if a kept row names a cell line
        that is not in `cell_lines`, if a Value is not a number, or if a gene
        id occurs twice for the same cell line.
    IndexError
        If a non-blank data line has fewer than six fields.
    """
    if coding_choice not in ['coding','nc']:
        raise ValueError('Unrecognized choice')
    if cell_lines is None:
        cell_lines = CELL_LINES
    # Position lookup by name; setdefault keeps the first position of a
    # repeated name, which is what list.index() would return.
    index_of = dict()
    for position, name in enumerate(cell_lines):
        index_of.setdefault(name, position)
    cell_line_maps = [dict() for _ in cell_lines]
    with open(filepath,'r') as handle:
        next(handle, None)  # skip the header line (no-op for an empty file)
        for row in handle:
            line = row.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            fields = line.split(',')
            gene_id = fields[0]
            cell_line = fields[1]
            rci_type = fields[2]
            rci_value = fields[3]
            coding_type = fields[5]
            if coding_type != coding_choice or rci_type != 'CNRCI' or rci_value == 'NA':
                continue
            real_value = float(rci_value)
            if cell_line not in index_of:
                raise ValueError('Unrecognized cell line %r in row: %s' % (cell_line, line))
            cell_map = cell_line_maps[index_of[cell_line]]
            if gene_id in cell_map:
                raise ValueError(
                    'Unexpected second value for this gene in this cell line: %s' % line)
            cell_map[gene_id] = real_value
    return cell_line_maps

In [7]:
def all_vs_all_correlation(maps,cell_lines=None):
    """Print the Pearson correlation between every pair of cell lines.

    For each pair, the correlation is computed over the genes present in
    both maps.  Output is a comma-separated header of cell line names, one
    comma-separated row of correlations per cell line, and finally each cell
    line's mean correlation to the *other* cell lines.

    Parameters
    ----------
    maps : list of dict
        One dict per cell line, mapping a gene key to a numeric value.
    cell_lines : list of str, optional
        Names used as labels, in the same order as `maps`.  Defaults to the
        notebook-level CELL_LINES.

    Returns
    -------
    None
        Results are printed only.
    """
    if cell_lines is None:
        cell_lines = CELL_LINES
    NUM = len(maps)
    gene_sets = [set(one_map) for one_map in maps]  # built once, reused for every pair
    for i in range(NUM):
        print(cell_lines[i],end=',') # header line
    print()
    averages_per_line = list()
    for i in range(NUM):
        values_for_other_lines = []
        for j in range(NUM):
            if i==j:
                # Shown as 1.00 on the diagonal but kept out of the average,
                # which is reported as correlation to *other* cell lines.
                r = 1.0
            else:
                common_genes = list(gene_sets[i] & gene_sets[j])
                if len(common_genes) < 2:
                    r = float('nan')  # pearsonr needs at least two paired values
                else:
                    i_values = np.array([maps[i][k] for k in common_genes])
                    j_values = np.array([maps[j][k] for k in common_genes])
                    r,_ = ss.pearsonr(i_values,j_values)
                values_for_other_lines.append(r)
            print('%.2f' % r, end=',')
        print()
        if values_for_other_lines:
            average = np.mean(values_for_other_lines)
        else:
            average = float('nan')  # a single cell line has nothing to compare against
        averages_per_line.append(average)
    print('Average correlation to other cell lines:')
    for i in range(NUM):
        print('%10s %.2f' % (cell_lines[i],averages_per_line[i]))
    

## Coding

In [8]:
# Protein-coding genes: build the per-cell-line RCI maps, then print the
# pairwise correlation table and the per-line averages.
print('All vs all correlation')
cl_maps = load_RCI_data(filepath, coding_choice='coding')
all_vs_all_correlation(maps=cl_maps)

All vs all correlation
A549,GM12878,H1.hESC,HT1080,HUVEC,HeLa.S3,HepG2,IMR.90,K562,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,
1.00,0.71,0.73,0.74,0.72,0.82,0.78,0.78,0.69,0.76,0.56,0.79,0.56,0.66,0.86,
0.71,1.00,0.58,0.70,0.80,0.83,0.84,0.78,0.89,0.77,0.55,0.68,0.70,0.76,0.72,
0.73,0.58,1.00,0.51,0.55,0.71,0.68,0.63,0.65,0.58,0.38,0.59,0.38,0.55,0.72,
0.74,0.70,0.51,1.00,0.81,0.76,0.73,0.78,0.75,0.76,0.59,0.83,0.74,0.66,0.73,
0.72,0.80,0.55,0.81,1.00,0.80,0.83,0.91,0.85,0.86,0.58,0.78,0.81,0.73,0.78,
0.82,0.83,0.71,0.76,0.80,1.00,0.87,0.80,0.85,0.81,0.59,0.79,0.72,0.74,0.84,
0.78,0.84,0.68,0.73,0.83,0.87,1.00,0.83,0.86,0.83,0.55,0.74,0.72,0.77,0.84,
0.78,0.78,0.63,0.78,0.91,0.80,0.83,1.00,0.82,0.86,0.53,0.74,0.82,0.74,0.87,
0.69,0.89,0.65,0.75,0.85,0.85,0.86,0.82,1.00,0.81,0.56,0.72,0.77,0.79,0.75,
0.76,0.77,0.58,0.76,0.86,0.81,0.83,0.86,0.81,1.00,0.60,0.75,0.81,0.74,0.82,
0.56,0.55,0.38,0.59,0.58,0.59,0.55,0.53,0.56,0.60,1.00,0.53,0.63,0.62,0.59,
0.79,0.68,0.59,0.83,0.78,0.79,0.74,

## Non-coding

In [9]:
# Non-coding genes: same analysis as for the coding genes.
# Note that this rebinds cl_maps to the non-coding maps.
print('All vs all correlation')
cl_maps = load_RCI_data(filepath, coding_choice='nc')
all_vs_all_correlation(maps=cl_maps)

All vs all correlation
A549,GM12878,H1.hESC,HT1080,HUVEC,HeLa.S3,HepG2,IMR.90,K562,MCF.7,NCI.H460,NHEK,SK.MEL.5,SK.N.DZ,SK.N.SH,
1.00,0.74,0.57,0.76,0.81,0.72,0.81,0.83,0.71,0.80,0.59,0.80,0.65,0.73,0.82,
0.74,1.00,0.46,0.68,0.79,0.72,0.81,0.72,0.84,0.71,0.57,0.70,0.66,0.74,0.67,
0.57,0.46,1.00,0.39,0.49,0.55,0.53,0.61,0.58,0.39,0.31,0.42,0.25,0.46,0.53,
0.76,0.68,0.39,1.00,0.84,0.76,0.80,0.85,0.73,0.83,0.67,0.81,0.82,0.73,0.82,
0.81,0.79,0.49,0.84,1.00,0.75,0.86,0.90,0.81,0.86,0.66,0.82,0.77,0.74,0.84,
0.72,0.72,0.55,0.76,0.75,1.00,0.78,0.76,0.77,0.73,0.66,0.79,0.77,0.71,0.80,
0.81,0.81,0.53,0.80,0.86,0.78,1.00,0.84,0.81,0.82,0.62,0.77,0.72,0.75,0.81,
0.83,0.72,0.61,0.85,0.90,0.76,0.84,1.00,0.75,0.86,0.61,0.82,0.79,0.75,0.88,
0.71,0.84,0.58,0.73,0.81,0.77,0.81,0.75,1.00,0.75,0.55,0.67,0.72,0.78,0.68,
0.80,0.71,0.39,0.83,0.86,0.73,0.82,0.86,0.75,1.00,0.63,0.80,0.79,0.73,0.82,
0.59,0.57,0.31,0.67,0.66,0.66,0.62,0.61,0.55,0.63,1.00,0.61,0.72,0.67,0.65,
0.80,0.70,0.42,0.81,0.82,0.79,0.77,