### Database creation of Telomere-to-telomere consortium CHM13 Annotations

##### Before you begin, please make sure you have the proper files downloaded

In [9]:
#Download link for CHM13 Annotations: https://github.com/marbl/CHM13?tab=readme-ov-file
##Scroll Down to find the UCSC GENCODEv35 CAT/Liftoff v2 Annotation file, which will be used for this database

In [1]:
# ! python3 -m pip install gffutils

In [1]:
import pandas as pd
import numpy as np
import gffutils
import pysam

##### If this is your first time importing the gff3 database, run this command

In [2]:
# This can take a long time, so only run this once (remove the #)
# gene_annotation_gff3_chm13_v2_path ='/home/michalula/code/Ella/data/chm13.draft_v2.0.gene_annotation.gff3'

# db = gffutils.create_db(gene_annotation_gff3_chm13_v2_path,
#                          dbfn='chm13db.db', force=True, keep_order=True, 
# merge_strategy='create_unique', sort_attribute_values=True) 

# replace gff3 with your gff3 file, dbfn can be whichever name you want to store the transposed file as


##### Feature the selected database 

In [4]:
gene_annotation_DB_chm13_v2_path = '/home/michalula/code/Ella/data/t2tv2_0_gene_annotation_chm13db.db'  # '/home/michalula/code/Ella/data/chm13db.db'
db = gffutils.FeatureDB(gene_annotation_DB_chm13_v2_path, keep_order=True)
db

<gffutils.interface.FeatureDB at 0x7453feb8e850>

##### Select the desired range

In [9]:
slct = db.region(region=('chr1', 206586910, 206587029), completely_within=False)
# t2tv2.0: chr: 206586910 -	206587029
# slct = db.region(region=('chr1', 206560169, 206614236), completely_within=False)
slct

<generator object FeatureDB.region at 0x7453e85c0cc0>

##### Iterate through the region and transpose it to a dictionary

In [10]:
records = []
for gene in slct:
    records.append({
        "seqid": gene.seqid,
        "source": gene.source,
        "featuretype": gene.featuretype,
        "start": gene.start,
        "end": gene.end,
        "strand": gene.strand,
        "score": gene.score,
        "attributes": gene.attributes  # Attributes as a dictionary (will be expanded later)
    })

##### Expand out the attributes and format the dictionary as a database, dropping unnecessary columns

In [11]:
df = pd.DataFrame(records)
attributes_df = df["attributes"].apply(pd.Series)
df = pd.concat([df.drop(columns=["attributes"]), attributes_df], axis=1)


In [12]:
for col in df.columns:
    df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)
df.dropna(how='all', axis=1, inplace=True)
df

Unnamed: 0,seqid,source,featuretype,start,end,strand,score,source_transcript,source_transcript_name,source_gene,...,Name,rna_support,reference_support,gene_name,alternative_source_transcripts,collapsed_gene_ids,collapsed_gene_names,frameshift,extra_paralog,exon_anotation_support
0,chr1,CAT,exon,206586828,206587161,+,.,ENST00000367063.6,CD55-203,ENSG00000196352.16,...,CD55,,True,CD55,,,,,False,
1,chr1,CAT,transcript,206586828,206606065,+,10000,ENST00000367063.6,CD55-203,ENSG00000196352.16,...,CD55,,,CD55,,,,,False,
2,chr1,CAT,gene,206586828,206652117,+,.,,,ENSG00000196352.16,...,CD55,,,CD55,,,,,False,
3,chr1,CAT,exon,206586939,206587161,+,.,ENST00000391921.9,CD55-206,ENSG00000196352.16,...,CD55,,True,CD55,,,,,False,
4,chr1,CAT,exon,206586939,206587161,+,.,ENST00000645323.1,CD55-214,ENSG00000196352.16,...,CD55,,True,CD55,,,,,False,
5,chr1,CAT,transcript,206586939,206625016,+,10000,ENST00000391921.9,CD55-206,ENSG00000196352.16,...,CD55,,,CD55,,,,,False,
6,chr1,CAT,transcript,206586939,206625639,+,10000,ENST00000645323.1,CD55-214,ENSG00000196352.16,...,CD55,,,CD55,,,,,False,
7,chr1,CAT,exon,206586974,206587161,+,.,ENST00000367064.9,CD55-204,ENSG00000196352.16,...,CD55,,True,CD55,,,,,False,
8,chr1,CAT,transcript,206586974,206626269,+,10000,ENST00000367064.9,CD55-204,ENSG00000196352.16,...,CD55,,,CD55,,,,,False,
9,chr1,CAT,exon,206586997,206587161,+,.,ENST00000314754.12,CD55-201,ENSG00000196352.16,...,CD55,,True,CD55,,,,,False,
