In [1]:
import polars as pl

In [2]:
# Load the knownGene GTF annotation into a DataFrame.
# The file is read as tab-separated text without a header row, so the nine
# columns are named explicitly here.
gtf_path = '/iblm/netapp/data4/shared_dir/hyena_dna_collab/downstream_tasks/iPSCORE_cvpc/STAR/hg38.knownGene.gtf'
gtf_columns = [
    'chrom',
    'source',
    'feature',
    'start',
    'end',
    'score',
    'strand',
    'frame',
    'attribute',
]
gtf = pl.read_csv(gtf_path, separator='\t', has_header=False, new_columns=gtf_columns)

# Show the first rows as a quick check of the parsed columns and dtypes.
gtf.head()

chrom,source,feature,start,end,score,strand,frame,attribute
str,str,str,i64,i64,str,str,str,str
"""chr1""","""knownGene""","""transcript""",11869,14409,""".""","""+""",""".""","""gene_id ""ENST0…"
"""chr1""","""knownGene""","""exon""",11869,12227,""".""","""+""",""".""","""gene_id ""ENST0…"
"""chr1""","""knownGene""","""exon""",12613,12721,""".""","""+""",""".""","""gene_id ""ENST0…"
"""chr1""","""knownGene""","""exon""",13221,14409,""".""","""+""",""".""","""gene_id ""ENST0…"
"""chr1""","""knownGene""","""transcript""",17369,17436,""".""","""-""",""".""","""gene_id ""ENST0…"


In [3]:
# Keep only transcript-level records and attach their identifier as `enst`.
# The identifier is pulled from the `gene_id "..."` entry of the attribute
# column; rows whose extracted value does not start with 'ENST' (including
# rows where the pattern does not match at all) are dropped.
# NOTE(review): the id is read from gene_id rather than transcript_id. The
# preview shows gene_id holding ENST ids in this file - confirm this is the
# intended attribute.
is_transcript_row = pl.col('feature') == 'transcript'
enst_from_attribute = pl.col('attribute').str.extract(r'gene_id "([^"]+)";')

transcripts = (
    gtf.filter(is_transcript_row)
    .with_columns(enst=enst_from_attribute)
    .filter(pl.col('enst').str.starts_with('ENST'))
)
transcripts.head()

chrom,source,feature,start,end,score,strand,frame,attribute,enst
str,str,str,i64,i64,str,str,str,str,str
"""chr1""","""knownGene""","""transcript""",11869,14409,""".""","""+""",""".""","""gene_id ""ENST0…","""ENST0000045632…"
"""chr1""","""knownGene""","""transcript""",17369,17436,""".""","""-""",""".""","""gene_id ""ENST0…","""ENST0000061921…"
"""chr1""","""knownGene""","""transcript""",29554,31097,""".""","""+""",""".""","""gene_id ""ENST0…","""ENST0000047335…"
"""chr1""","""knownGene""","""transcript""",30267,31109,""".""","""+""",""".""","""gene_id ""ENST0…","""ENST0000046928…"
"""chr1""","""knownGene""","""transcript""",30366,30503,""".""","""+""",""".""","""gene_id ""ENST0…","""ENST0000060709…"


In [4]:
# Export the transcript intervals as a headerless, tab-separated 4-column BED
# file (chrom, start, end, name).
#
# GTF coordinates are 1-based with an inclusive end, whereas BED uses a
# 0-based start with an exclusive end. The start is therefore shifted down by
# one and the end is kept unchanged; writing the GTF start as-is would drop
# the first base of every interval.
(
    transcripts
    .select(
        'chrom',
        (pl.col('start') - 1).alias('start'),
        'end',
        'enst',
    )
    .write_csv(
        '/iblm/netapp/data4/shared_dir/hyena_dna_collab/downstream_tasks/iPSCORE_cvpc/transcripts.bed',
        separator='\t',
        include_header=False
    )
)

In [7]:
# Load the gzip-compressed, tab-separated Ensembl table from the working
# directory; its first row supplies the column names.
# NOTE(review): the column names look like an Ensembl BioMart export -
# record the release / download date here once confirmed.
ens = pl.read_csv('ens.tsv.gz', separator='\t')

In [15]:
# Preview the first rows of the Ensembl table to check its columns and dtypes.
ens.head()

Gene stable ID,Transcript stable ID,Transcript stable ID version,Gene stable ID version,Ensembl Canonical,Gene start (bp),Gene end (bp),Chromosome/scaffold name,Gene name
str,str,str,str,i64,i64,i64,str,str
"""ENSG00000210049""","""ENST00000387314""","""ENST00000387314.1""","""ENSG00000210049.1""",1,577,647,"""MT""","""MT-TF"""
"""ENSG00000211459""","""ENST00000389680""","""ENST00000389680.2""","""ENSG00000211459.2""",1,648,1601,"""MT""","""MT-RNR1"""
"""ENSG00000210077""","""ENST00000387342""","""ENST00000387342.1""","""ENSG00000210077.1""",1,1602,1670,"""MT""","""MT-TV"""
"""ENSG00000210082""","""ENST00000387347""","""ENST00000387347.2""","""ENSG00000210082.2""",1,1671,3229,"""MT""","""MT-RNR2"""
"""ENSG00000209082""","""ENST00000386347""","""ENST00000386347.1""","""ENSG00000209082.1""",1,3230,3304,"""MT""","""MT-TL1"""


In [9]:
# Display up to 30 characters of each string value in rendered tables, so
# long identifiers are less aggressively truncated.
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [18]:
# Export gene-level intervals as a headerless, tab-separated 4-column BED file
# (chrom, start, end, gene id).
#
# Transcripts from the GTF are matched to the Ensembl table on the versioned
# transcript id, only rows flagged as the Ensembl canonical transcript are
# kept, and the gene coordinates are de-duplicated and sorted before writing.
#
# Ensembl gene coordinates are 1-based with an inclusive end, whereas BED uses
# a 0-based start with an exclusive end, so the gene start is shifted down by
# one and the end is kept unchanged.
#
# NOTE(review): the chromosome column comes from the Ensembl table, whose
# preview shows names such as 'MT', while the GTF preview uses 'chr'-prefixed
# names such as 'chr1'. Confirm which naming downstream tools expect; the
# GTF-derived `chrom` column is available after the join if the prefixed form
# is needed.
(
    transcripts
    .join(ens, left_on='enst', right_on='Transcript stable ID version')
    .filter(pl.col('Ensembl Canonical') == 1)
    .select(
        'Chromosome/scaffold name',
        (pl.col('Gene start (bp)') - 1).alias('Gene start (bp)'),
        'Gene end (bp)',
        'Gene stable ID',
    )
    .unique()
    .sort('Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)')
    .write_csv(
        '/iblm/netapp/data4/shared_dir/hyena_dna_collab/downstream_tasks/iPSCORE_cvpc/genes.bed',
        include_header=False,
        separator='\t'
    )
)