### Database creation of Telomere-to-telomere consortium CHM13 Annotations

##### Before you begin, please make sure you have the proper files downloaded

In [1]:
#Download link for CHM13 Annotations: https://github.com/marbl/CHM13?tab=readme-ov-file
##Scroll Down to find the UCSC GENCODEv35 CAT/Liftoff v2 Annotation file, which will be used for this database

In [3]:
! python3 -m pip install gffutils

Collecting gffutils
  Downloading gffutils-0.13-py3-none-any.whl.metadata (1.5 kB)
Collecting pyfaidx>=0.5.5.2 (from gffutils)
  Downloading pyfaidx-0.8.1.3-py3-none-any.whl.metadata (25 kB)
Collecting argh>=0.26.2 (from gffutils)
  Downloading argh-0.31.3-py3-none-any.whl.metadata (7.4 kB)
Collecting argcomplete>=1.9.4 (from gffutils)
  Downloading argcomplete-3.6.1-py3-none-any.whl.metadata (16 kB)
Collecting simplejson (from gffutils)
  Downloading simplejson-3.20.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting importlib-metadata (from pyfaidx>=0.5.5.2->gffutils)
  Downloading importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)
Collecting zipp>=3.20 (from importlib-metadata->pyfaidx>=0.5.5.2->gffutils)
  Downloading zipp-3.21.0-py3-none-any.whl.metadata (3.7 kB)
Downloading gffutils-0.13-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m1

In [4]:
import pandas as pd
import numpy as np
import gffutils
import pysam

##### If this is your first time importing the gff3 database, run this command

In [None]:
#This can take a long time, so only run this once (remove the #)
# gene_annotation_gff3_chm13_v2_path ='/home/michalula/code/Ella/data/chm13.draft_v2.0.gene_annotation.gff3'

# db = gffutils.create_db(gene_annotation_gff3_chm13_v2_path,
#                          dbfn='chm13db.db', force=True, keep_order=True, 
# merge_strategy='create_unique', sort_attribute_values=True) 

# replace gff3 with your gff3 file, dbfn can be whichever name you want to store the transposed file as


##### Feature the selected database 

In [9]:
gene_annotation_DB_chm13_v2_path = '/home/michalula/code/Ella/data/chm13db.db'
db = gffutils.FeatureDB(gene_annotation_DB_chm13_v2_path, keep_order=True)
db

<gffutils.interface.FeatureDB at 0x78cf0bd1d910>

##### Select the desired range

In [10]:
slct = db.region(region=('chr1', 206560169, 206614236), completely_within=False)
slct

<generator object FeatureDB.region at 0x78cf14dee110>

##### Iterate through the region and transpose it to a dictionary

In [11]:
records = []
for gene in slct:
    records.append({
        "seqid": gene.seqid,
        "source": gene.source,
        "featuretype": gene.featuretype,
        "start": gene.start,
        "end": gene.end,
        "strand": gene.strand,
        "score": gene.score,
        "attributes": gene.attributes  # Attributes as a dictionary (will be expanded later)
    })

##### Expand out the attributes and format the dictionary as a database, dropping unnecessary columns

In [12]:
df = pd.DataFrame(records)
attributes_df = df["attributes"].apply(pd.Series)
df = pd.concat([df.drop(columns=["attributes"]), attributes_df], axis=1)


In [13]:
for col in df.columns:
    df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)
df.dropna(how='all', axis=1, inplace=True)
df

Unnamed: 0,seqid,source,featuretype,start,end,strand,score,source_gene_common_name,source_gene,gene_biotype,...,havana_gene,havana_transcript,transcript_id,Parent,transcript_name,rna_support,reference_support,protein_id,hgnc_id,ccdsid
0,chr1,CAT,gene,206505417,206574588,+,.,AL596218.1,ENSG00000237074.2,lncRNA,...,,,,,,,,,,
1,chr1,CAT,transcript,206505417,206574588,+,9480,AL596218.1,ENSG00000237074.2,lncRNA,...,OTTHUMG00000036254.2,OTTHUMT00000488535.1,CHM13_T0017321,CHM13_G0004546,AL596218.1-201,,,,,
2,chr1,CAT,intron,206509833,206567065,+,.,AL596218.1,ENSG00000237074.2,lncRNA,...,OTTHUMG00000036254.2,OTTHUMT00000488535.1,CHM13_T0017321,CHM13_T0017321,AL596218.1-201,,True,,,
3,chr1,CAT,transcript,206514363,206574417,+,9350,AL596218.1,ENSG00000237074.2,lncRNA,...,OTTHUMG00000036254.2,OTTHUMT00000088207.2,CHM13_T0017322,CHM13_G0004546,AL596218.1-202,,,,,
4,chr1,CAT,intron,206530333,206567065,+,.,AL596218.1,ENSG00000237074.2,lncRNA,...,OTTHUMG00000036254.2,OTTHUMT00000088207.2,CHM13_T0017322,CHM13_T0017322,AL596218.1-202,,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,chr1,CAT,intron,206605808,206624848,+,.,CD55,ENSG00000196352.16,protein_coding,...,OTTHUMG00000036255.7,OTTHUMT00000382038.2,CHM13_T0017327,CHM13_T0017327,CD55-201,,True,ENSP00000316333.8,HGNC:2665,CCDS44307.1
321,chr1,CAT,stop_codon,206605941,206605943,+,.,CD55,ENSG00000196352.16,protein_coding,...,OTTHUMG00000036255.7,OTTHUMT00000088210.2,CHM13_T0017323,CHM13_T0017323,CD55-203,,True,ENSP00000356030.2,HGNC:2665,CCDS73022.1
322,chr1,CAT,exon,206612746,206612860,+,.,CD55,ENSG00000196352.16,protein_coding,...,OTTHUMG00000036255.7,OTTHUMT00000494554.1,CHM13_T0017325,CHM13_T0017325,CD55-214,,True,ENSP00000496251.1,HGNC:2665,CCDS86046.1
323,chr1,CAT,CDS,206612746,206612860,+,.,CD55,ENSG00000196352.16,protein_coding,...,OTTHUMG00000036255.7,OTTHUMT00000494554.1,CHM13_T0017325,CHM13_T0017325,CD55-214,,True,ENSP00000496251.1,HGNC:2665,CCDS86046.1
