In [1]:
# --- Inputs ---------------------------------------------------------------
# Pickled DataFrame read at the start of the notebook.
UTR_DF_MIRNA_FC_CHR_PATH = 'data/3utr/3utr.sequences.refseq_id.mirna_fc.chr.pkl'

# bigWig track queried for per-base phyloP scores. The assembly of the track
# has to match the assembly of the coordinates stored in the input table.
CONSERVATION_PATH = 'data/conservation/hg19.100way.phyloP100way.bw'
# Alternative hg38 track (disabled):
# CONSERVATION_PATH = 'data/conservation/hg38.phyloP100way.bw'

# --- Output ---------------------------------------------------------------
# Destination pickle for the input table plus the added conservation column.
UTR_DF_MIRNA_FC_CHR_CONSERVATION_PATH = 'data/conservation/3utr.sequences.refseq_id.mirna_fc.chr.conservation.pkl'


In [2]:
import pandas as pd

# Load the pickled input table. Later cells read its `chromosome`,
# `utr3_start`, `utr3_end`, `strand` and `sequence` columns.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle(UTR_DF_MIRNA_FC_CHR_PATH)

In [3]:
import pyBigWig

def get_conservation(bw_file, chrom, ensembl_start, ensembl_end):
    """Fetch per-base scores from a bigWig track for an Ensembl-style interval.

    Parameters
    ----------
    bw_file
        Object exposing ``values(chrom, start, end)``; in this notebook it is
        the handle returned by ``pyBigWig.open``.
    chrom : str or int
        Chromosome name. A ``'chr'`` prefix is added when it is missing, so
        both ``'17'``/``17`` and ``'chr17'`` query ``'chr17'``.
    ensembl_start : int
        First base of the interval, 1-based inclusive.
    ensembl_end : int
        Last base of the interval, 1-based inclusive.

    Returns
    -------
    Whatever ``bw_file.values`` returns for the converted interval (one score
    per base for a pyBigWig handle).

    Raises
    ------
    Any exception raised by ``bw_file.values`` is propagated unchanged; the
    caller in this notebook catches ``RuntimeError``.
    """
    # Accept non-string chromosome labels (e.g. integers from a numeric
    # column) and avoid producing 'chrchr17' for already-prefixed names.
    chrom_name = str(chrom)
    ucsc_chrom = chrom_name if chrom_name.startswith('chr') else 'chr' + chrom_name

    # 1-based inclusive -> 0-based half-open: only the start shifts by one.
    ucsc_start = ensembl_start - 1
    ucsc_end = ensembl_end

    return bw_file.values(ucsc_chrom, ucsc_start, ucsc_end)

In [5]:
import numpy as np

# Look up conservation scores for every row of `df` and store them in a new
# `conservation_phylo` column (one flat list of scores per row).
bw_file = pyBigWig.open(CONSERVATION_PATH)

df_conservation = []  # one list of scores per row of `df`, in row order
errors = []           # RuntimeErrors raised by failed interval lookups
error_loci = []       # (start, end, chrom) for each entry in `errors`

# try/finally guarantees the bigWig handle is released even when a row raises
# something other than the RuntimeError handled below (e.g. a malformed
# coordinate string).
try:
    for _, utr in df.iterrows():
        chrom = utr.chromosome

        # utr3_start / utr3_end are ';'-separated Ensembl coordinates
        # (1-based, inclusive) -- presumably one entry per exon of the UTR.
        # Starts and ends are sorted independently and then paired by rank.
        ensembl_start = sorted(int(x) for x in utr.utr3_start.split(';'))
        ensembl_end = sorted(int(x) for x in utr.utr3_end.split(';'))

        # NOTE(review): scores are concatenated in ascending genomic order for
        # every row; the `strand` column is not consulted. Confirm that
        # downstream code does not expect transcript orientation for
        # minus-strand UTRs.
        row_conservation = []
        for start, end in zip(ensembl_start, ensembl_end):
            try:
                exon_conservation = get_conservation(bw_file, chrom, start, end)
                row_conservation.extend(exon_conservation)
            except RuntimeError as er:
                # Best effort: record the failure and continue. The row ends
                # up with fewer scores than bases in that case.
                errors.append(er)
                error_loci.append((start, end, chrom))

        df_conservation.append(row_conservation)
finally:
    bw_file.close()

df['conservation_phylo'] = df_conservation

In [8]:
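# Scratch check (disabled): query the bigWig directly for a single locus when
# debugging a failed lookup. Uncomment to run.
# NOTE(review): position 82,442,593 appears to lie beyond the end of chr17 in
# hg19, so the chr17 query presumably targets the hg38 track -- confirm.
# bw_file = pyBigWig.open(CONSERVATION_PATH)
# # tmp_chr='HSCHR11_1_CTG7'
# tmp_chr='chr17'
# # print(bw_file.values(tmp_chr, 80169, 81275))
# print(bw_file.values(tmp_chr, 82442593, 82442645))
# bw_file.close()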

In [None]:
# Persist the table, including the `conservation_phylo` column added above,
# to the output path defined in the configuration cell.
df.to_pickle(UTR_DF_MIRNA_FC_CHR_CONSERVATION_PATH)

In [None]:
# Flag rows whose score list contains at least one missing value: summing a
# list that holds a NaN yields NaN, which pd.isna then detects.
df_has_NA = df['conservation_phylo'].map(
    lambda scores: pd.isna(sum(scores))
)

# Show the per-row flags together with the number of flagged rows.
(df_has_NA, df_has_NA.sum())

In [None]:
# Sanity check: list the rows where the number of conservation scores differs
# from the length of the sequence. An empty result means every row lines up.
df[
    df.apply(
        lambda rec: len(rec['sequence']) != len(rec['conservation_phylo']),
        axis=1,
    )
]