In [41]:
import pandas, numpy, scipy
from scipy.stats import hypergeom

In [42]:
file_directory = "/Users/kja11/OneDrive - Menntaský/PostDoc_Hypothermia/in_silico/Python/"

# Define Paths

In [43]:
# proteomic data
proteomics_path = file_directory+'3) output/Proteomics/'

# RNAseq data
RNAseq_path = file_directory+'3) output/RNAseq/'
RNAseq_path2 = file_directory+'1) input/RNAseq/'

# GeCKO data
GeCKO_path = file_directory+'1) input/GeCKO/'

# Open the files

In [44]:
#open .txt
## Proteomic
## Linear regression Slope
with open(proteomics_path+'signif_linregress_proteins.txt') as f:
    proteomics_pval = f.read().splitlines()

with open(proteomics_path+'adjPvalue_linregress_proteins.txt') as f:
    proteomics_adjpval = f.read().splitlines() 
    
with open(proteomics_path+'high_Rvalue_linregress_proteins.txt') as f:
    proteomics_high_rval = f.read().splitlines() 
    
## Early response Coefficiant of Variation    
with open(proteomics_path+'Early_res_CoeffVar_proteins_DOWN.txt') as f:
    Early_res_CoeffVar_proteins_DOWN = f.read().splitlines()
    
with open(proteomics_path+'Early_res_CoeffVar_proteins_UP.txt') as f:
    Early_res_CoeffVar_proteins_UP = f.read().splitlines()

## RNAseq
with open(RNAseq_path+'allsignif_genes_HEK293.txt') as f:
    RNAseq_HEK293 = f.read().splitlines()
    
print("Early_res_CoeffVar_proteins_DOWN:", len(Early_res_CoeffVar_proteins_DOWN))
print("Early_res_CoeffVar_proteins_UP:", len(Early_res_CoeffVar_proteins_UP))
print("proteomics_pval:", len(proteomics_pval))
print("proteomics_adjpval:", len(proteomics_adjpval))
print("proteomics_high_rval:", len(proteomics_high_rval))
print("RNAseq_HEK293:", len(RNAseq_HEK293))

Early_res_CoeffVar_proteins_DOWN: 95
Early_res_CoeffVar_proteins_UP: 84
proteomics_pval: 532
proteomics_adjpval: 22
proteomics_high_rval: 302
RNAseq_HEK293: 139


In [45]:
# Open the .csv file
# GeCKO screens data
GeCKO_sp1rep = pandas.read_csv(GeCKO_path+'GeCKO_SP1_repressors.csv', sep = ',')['Gene Symbol'].str.upper()
GeCKO_sp1activ = pandas.read_csv(GeCKO_path+'GeCKO_SP1_activators.csv', sep = ',')['Gene Symbol'].str.upper()
print("GeCKO SP1 repressors:", len(GeCKO_sp1rep), "\nGeCKO SP1 activators:", len(GeCKO_sp1activ))

GeCKO_rmb3rep = pandas.read_csv(GeCKO_path+'GeCKO_RBM3_repressors.csv', sep = ',')['Gene Symbol'].str.upper()
print("GeCKO RMB3 repressor:", len(GeCKO_rmb3rep), '\n')

# RNAseq Mice
## Cortex
RNAseq_cortex_down = pandas.read_csv(RNAseq_path2+'DESeq_in_vivo_cortex_temp_downregulated.csv', sep = ',')['gene'].str.upper()
RNAseq_cortex_up = pandas.read_csv(RNAseq_path2+'DESeq_in_vivo_cortex_temp_upregulated.csv', sep = ',')['gene'].str.upper()
print("RNAseq_cortex_down:", len(RNAseq_cortex_down), "\nRNAseq_cortex_up:", len(RNAseq_cortex_up))

## Hippocampus
RNAseq_hippoc_down = pandas.read_csv(RNAseq_path2+'DESeq_in_vivo_hippocampus_temp_downregulated.csv', sep = ',')['gene'].str.upper()
RNAseq_hippoc_up = pandas.read_csv(RNAseq_path2+'DESeq_in_vivo_hippocampus_temp_upregulated.csv', sep = ',')['gene'].str.upper()
print("RNAseq_hippoc_down:", len(RNAseq_hippoc_down), "\nRNAseq_hippoc_up:", len(RNAseq_hippoc_up))

GeCKO SP1 repressors: 495 
GeCKO SP1 activators: 61
GeCKO RMB3 repressor: 621 

RNAseq_cortex_down: 2446 
RNAseq_cortex_up: 2399
RNAseq_hippoc_down: 1726 
RNAseq_hippoc_up: 1990


# Overlap

In [46]:
def hypergeometric_test(total_genes, list1, list2):
    n_total = total_genes  # Total number of genes
    n1 = len(list1)   # Number of genes in list1
    n2 = len(list2)   # Number of genes in list2
    overlap = len(set(list1).intersection(set(list2)))  # Overlapping genes

    # Calculate the p-value using the hypergeometric distribution
    p_value = hypergeom.sf(overlap - 1, n_total, n1, n2)
    return p_value

In [47]:
# Convert lists to sets and find the intersection
print('overlapping Proteomic and GeCKO screens:\n')

datas1 = [proteomics_high_rval, proteomics_adjpval, 
          Early_res_CoeffVar_proteins_DOWN, Early_res_CoeffVar_proteins_UP]
labels1 = ['Proteomic-HEK293 rvalue >0.6 (abs)', 'adjpvalue <0.05 Proteomic-HEK293',
          'early_response_proteomic_DOWN-HEK293', 'early_response_proteomic_UP-HEK293'] 

datas2 = [GeCKO_sp1rep, GeCKO_sp1activ, GeCKO_rmb3rep]
labels2 = ['GeCKO_sp1rep', 'GeCKO_sp1activ', 'GeCKO_rmb3rep']

for i in range(len(datas1)):
    data1 = datas1[i]
    label1 = labels1[i]
    
    for y in range(len(datas2)):
        data2 = datas2[y]
        label2 = labels2[y]
        
        # Hypergeometric test
        total_genes = 20000  # Total number of genes in the genome
        list1 = data1
        list2 = data2

        p_value = hypergeometric_test(total_genes, list1, list2)
        print(f'Occurred by chance P-value:, {p_value}')
        
        # Print the overlapping ones
        common_elements = set(data1).intersection(data2)
        common_elements = list(common_elements)
        print(len(common_elements), f"overlap {label1} and the {label2}:\n", common_elements, "\n")

overlapping Proteomic and GeCKO screens:

Occurred by chance P-value:, 0.4730012984003374
8 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the GeCKO_sp1rep:
 ['DCTN3', 'RPS8', 'RBM8A', 'RPL13', 'ARFGAP1', 'SMYD5', 'MATR3', 'NOC3L'] 

Occurred by chance P-value:, 0.6052611232625265
1 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the GeCKO_sp1activ:
 ['HNRNPDL'] 

Occurred by chance P-value:, 0.4632709146400395
10 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the GeCKO_rmb3rep:
 ['PGRMC1', 'SEMG1', 'RPS11', 'RCN1', 'RPS23', 'RPS6', 'CALD1', 'HNRNPL', 'AGPS', 'DDX39A'] 

Occurred by chance P-value:, 0.1020594717718253
2 overlap adjpvalue <0.05 Proteomic-HEK293 and the GeCKO_sp1rep:
 ['RBM8A', 'RPL13'] 

Occurred by chance P-value:, 1.0
0 overlap adjpvalue <0.05 Proteomic-HEK293 and the GeCKO_sp1activ:
 [] 

Occurred by chance P-value:, 0.1481055270747281
2 overlap adjpvalue <0.05 Proteomic-HEK293 and the GeCKO_rmb3rep:
 ['HNRNPL', 'AGPS'] 

Occurred by chance P-value:, 1.0
0 overlap ear

In [48]:
# Convert lists to sets and find the intersection
print('overlapping Proteomic and RNAseq:\n')

datas1 = [proteomics_high_rval, proteomics_adjpval, 
          Early_res_CoeffVar_proteins_DOWN, Early_res_CoeffVar_proteins_UP]
labels1 = ['Proteomic-HEK293 rvalue >0.6 (abs)', 'adjpvalue <0.05 Proteomic-HEK293',
          'early_response_proteomic_DOWN-HEK293', 'early_response_proteomic_UP-HEK293'] 

datas2 = [RNAseq_HEK293, RNAseq_hippoc_down, RNAseq_hippoc_up, RNAseq_cortex_down, RNAseq_cortex_up]
labels2 = ['RNAseq_HEK293','RNAseq_hippoc_down','RNAseq_hippoc_up','RNAseq_cortex_down','RNAseq_cortex_up']

for i in range(len(datas1)):
    data1 = datas1[i]
    label1 = labels1[i]
    
    for y in range(len(datas2)):
        data2 = datas2[y]
        label2 = labels2[y]
        
        # Hypergeometric test
        total_genes = 20000  # Total number of genes in the genome
        list1 = data1
        list2 = data2

        p_value = hypergeometric_test(total_genes, list1, list2)
        print(f'Occurred by chance P-value:, {p_value}')
        
        # Print the overlapping ones
        common_elements = set(data1).intersection(data2)
        common_elements = list(common_elements)
        print(len(common_elements), f"overlap {label1} and the {label2}:\n", common_elements, "\n")
        
        


overlapping Proteomic and RNAseq:

Occurred by chance P-value:, 0.01908445627655189
6 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the RNAseq_HEK293:
 ['CIRBP', 'CHORDC1', 'SRSF5', 'HNRNPDL', 'NOC3L', 'RBM3'] 

Occurred by chance P-value:, 0.23465068422162
30 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the RNAseq_hippoc_down:
 ['CHORDC1', 'MRPS25', 'ADAR', 'PPIG', 'HSPA1B', 'NOC3L', 'LIN7A', 'DPYSL2', 'BAZ1B', 'SREK1', 'PPIL1', 'SRSF7', 'LMAN2', 'SERBP1', 'PRPF3', 'GOLPH3', 'NUFIP2', 'ELAVL1', 'SAMM50', 'CCAR1', 'HSP90AA1', 'DNAJB1', 'PNN', 'LARP1', 'JAGN1', 'ERP44', 'CASC3', 'SF3A1', 'RCN1', 'SPTBN1'] 

Occurred by chance P-value:, 0.0004180495514084946
49 overlap Proteomic-HEK293 rvalue >0.6 (abs) and the RNAseq_hippoc_up:
 ['WDR43', 'GLUD1', 'SLC25A11', 'RPL13', 'GCSH', 'PACSIN3', 'TCERG1', 'TIMM23', 'TBCD', 'RBM34', 'HNRNPA0', 'ADH5', 'CIRBP', 'FUS', 'RPL36', 'ACAA2', 'ARFGAP1', 'NDUFS1', 'NELFE', 'SDHB', 'TIMM10B', 'MTHFD1L', 'DPM1', 'RPS20', 'HNRNPDL', 'BCLAF1', 'MRPL3', 'TH