In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import pyranges as pr
import scanpy as sc

In [5]:
def extract_canonical_transcripts(gtf_df):
    """
    Extracts unique canonical transcripts from a GTF DataFrame.

    Rows whose 'tag' column equals 'Ensembl_canonical' are kept and reduced to
    distinct (gene_id, transcript_id) pairs. The function then reports, via
    print(), whether every gene ended up with exactly one canonical transcript.
    It only reports; it never raises or drops rows when the check fails.

    Parameters:
        gtf_df (pd.DataFrame): DataFrame containing GTF file data. Must provide
            the columns 'gene_id', 'transcript_id' and 'tag'.

    Returns:
        pd.DataFrame: A DataFrame with unique canonical gene-transcript pairs
            (columns 'gene_id' and 'transcript_id').
    """
    print("Starting extraction of canonical transcripts...")

    # filter for canonical entries
    # NOTE(review): exact equality assumes 'tag' holds a single value per row;
    # confirm this matches how the GTF attributes were parsed.
    canonical_df = gtf_df[gtf_df['tag'] == 'Ensembl_canonical']
    print(f"Filtered for canonical entries. Remaining rows: {len(canonical_df)}")

    # extract unique gene-transcript pairs
    canonical_transcripts = canonical_df[['gene_id', 'transcript_id', 'tag']].drop_duplicates()
    print(f"Removed duplicate gene-transcript pairs. Unique pairs: {len(canonical_transcripts)}")

    # count unique transcripts per gene
    transcript_counts = canonical_transcripts.groupby('gene_id')['transcript_id'].nunique()
    print(f"Counted unique transcripts per gene. Total unique genes: {len(transcript_counts)}")

    # check if all genes have exactly one unique transcript.
    # The test is per gene: comparing the number of *distinct count values*
    # would report success whenever all genes share the same count (for
    # example every gene having two canonical transcripts).
    offending = transcript_counts[transcript_counts != 1]
    if not offending.empty:
        print("Canonical transcripts are not unique! Some genes have more than one transcript.")
        print(f"Genes without exactly one canonical transcript: {len(offending)}")
    else:
        print("All genes have exactly one unique canonical transcript.")

    print("Finished extracting canonical transcripts.")
    return canonical_transcripts[['gene_id', 'transcript_id']]

In [6]:
# Parse the Homo sapiens GTF into a DataFrame (pyranges) and reduce it to the
# gene_id / canonical transcript_id pairs used later to annotate `hs`.
hs_gtf_path = './Homo_sapiens.GRCh38.113.gtf'
hs_gtf_df = pr.read_gtf(hs_gtf_path).df
hs_canonical_transcripts = extract_canonical_transcripts(hs_gtf_df)

Starting extraction of canonical transcripts...
Filtered for canonical entries. Remaining rows: 718126
Removed duplicate gene-transcript pairs. Unique pairs: 78932
Counted unique transcripts per gene. Total unique genes: 78932
All genes have exactly one unique canonical transcript.
Finished extracting canonical transcripts.


In [7]:
# Parse the Mus musculus GTF into a DataFrame (pyranges) and reduce it to the
# gene_id / canonical transcript_id pairs used later to annotate `mm`.
mm_gtf_path = './Mus_musculus.GRCm39.113.gtf'
mm_gtf_df = pr.read_gtf(mm_gtf_path).df
mm_canonical_transcripts = extract_canonical_transcripts(mm_gtf_df)

Starting extraction of canonical transcripts...
Filtered for canonical entries. Remaining rows: 705235
Removed duplicate gene-transcript pairs. Unique pairs: 78298
Counted unique transcripts per gene. Total unique genes: 78298
All genes have exactly one unique canonical transcript.
Finished extracting canonical transcripts.


## Add canonical transcripts to AnnData objects in preparation for SAMap

In [None]:
def update_var_metadata(adata, canonical_transcripts):
    """
    Updates .var metadata by renaming columns, adding 'gene_name', and mapping 'canonical_transcript_id'.

    The object is modified in place (its .var is reassigned / extended) and the
    same object is returned for convenience. Genes whose 'gene_id' is not found
    in `canonical_transcripts` get a missing value in 'canonical_transcript_id'.

    Parameters:
        adata (AnnData): The AnnData object to update. Its .var must contain a
            'gene_ids' or 'gene_id' column.
        canonical_transcripts (pd.DataFrame): DataFrame with 'gene_id' and 'transcript_id' columns.
            Exact duplicate rows are tolerated; a gene_id paired with several
            different transcript_ids still makes the pandas lookup fail.

    Returns:
        AnnData: Updated AnnData object with added metadata.

    Raises:
        KeyError: If .var has neither a 'gene_ids' nor a 'gene_id' column.
    """
    # rename 'gene_ids' to 'gene_id' if necessary
    if 'gene_ids' in adata.var.columns:
        print("Renaming 'gene_ids' to 'gene_id'.")
        adata.var = adata.var.rename(columns={'gene_ids': 'gene_id'})

    # fail early with an explicit message instead of a bare KeyError from the lookup below
    if 'gene_id' not in adata.var.columns:
        raise KeyError("adata.var has neither a 'gene_ids' nor a 'gene_id' column to map transcripts from.")

    # add 'gene_name' column from the index
    print("Adding 'gene_name' column from the index.")
    adata.var['gene_name'] = adata.var.index

    # map 'canonical_transcript_id' from canonical_transcripts
    print("Adding 'canonical_transcript_id' column.")
    # Series.map needs a uniquely indexed lookup; collapse repeated identical
    # (gene_id, transcript_id) rows so harmless duplicates do not break it.
    unique_pairs = canonical_transcripts[['gene_id', 'transcript_id']].drop_duplicates()
    canonical_map = unique_pairs.set_index('gene_id')['transcript_id']
    adata.var['canonical_transcript_id'] = adata.var['gene_id'].map(canonical_map)

    return adata

def reorder_var_columns(adata, desired_order):
    """
    Reorders the columns in .var according to the specified order and removes 'original_names'.

    .var is replaced by a selection of exactly the columns in `desired_order`
    (columns not listed there are discarded); if 'original_names' is among
    them it is dropped afterwards. The object is modified in place and
    returned. The call can be repeated on the same object: when
    'original_names' has already been removed it is simply skipped.

    Parameters:
        adata (AnnData): The AnnData object to reorder.
        desired_order (list): The desired order of columns in .var. The list is not modified.

    Returns:
        AnnData: AnnData object with reordered .var columns.

    Raises:
        KeyError: If a column in `desired_order` other than 'original_names'
            is missing from .var.
    """
    print("Reordering .var columns.")
    # 'original_names' is dropped below, so on a second call it is legitimately
    # absent; every other missing column is still an error.
    selected = [
        column for column in desired_order
        if column != 'original_names' or column in adata.var.columns
    ]
    adata.var = adata.var[selected]

    # check if 'original_names' exists and drop it
    if 'original_names' in adata.var.columns:
        print("Dropping 'original_names' column from .var.")
        adata.var = adata.var.drop(columns=['original_names'])
    else:
        print("'original_names' column not found. Skipping drop.")

    return adata

# Column layout handed to reorder_var_columns() for both AnnData objects.
desired_order = [
    # identifier columns ('gene_name' and 'canonical_transcript_id' are added
    # by update_var_metadata(); 'gene_id' is the renamed 'gene_ids')
    'gene_name',
    'gene_id',
    'canonical_transcript_id',
    # columns already present in .var when the .h5ad files are loaded
    'feature_types',
    'genome',
    'original_names',
    'is_duplicated',
    'mt',
    'ribo',
    'hb',
    'n_cells_by_counts',
    'mean_counts',
    'log1p_mean_counts',
    'pct_dropout_by_counts',
    'total_counts',
    'log1p_total_counts',
]


In [None]:
# Load the AnnData object that is annotated with mm_canonical_transcripts
# further down, and print its summary (dimensions, obs/var column names).
mm_h5ad_path = '20251008_Cevrim_XMens_Only_Fib_Dec_Cells.h5ad'
mm = sc.read_h5ad(mm_h5ad_path)
print(mm)

AnnData object with n_obs × n_vars = 8162 × 32285
    obs: 'cell_barcode', 'cellbender_filtered', 'cellranger_filtered', 'sample', 'mck', 'mck_full', 'species', 'line', 'zeitgeber_time', 'day_pseudopregancy', 'n_counts', 'x_um', 'y_um', 'x_um_dbscan', 'y_um_dbscan', 'pct.intronic', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'gene_ids', 'feature_types', 'genome', 'original_names', 'is_duplicated', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'


In [None]:
# Load the AnnData object that is annotated with hs_canonical_transcripts
# further down, and print its summary (dimensions, obs/var column names).
hs_h5ad_path = '20251008_Cevrim_Human_Only_Fib_Dec_Cells.h5ad'
hs = sc.read_h5ad(hs_h5ad_path)
print(hs)

AnnData object with n_obs × n_vars = 10451 × 36601
    obs: 'cell_barcode', 'cellbender_filtered', 'cellranger_filtered', 'sample', 'mck', 'mck_full', 'species', 'line', 'n_counts', 'x_um', 'y_um', 'x_um_dbscan', 'y_um_dbscan', 'pct.intronic', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'gene_ids', 'feature_types', 'genome', 'original_names', 'is_duplicated', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'


In [None]:
# Annotate each object's .var with gene_id, gene_name and the canonical
# transcript taken from the matching species' GTF table.
# update_var_metadata modifies the object it is given and returns it.
mm = update_var_metadata(mm, mm_canonical_transcripts)
hs = update_var_metadata(hs, hs_canonical_transcripts)

Renaming 'gene_ids' to 'gene_id'.
Adding 'gene_name' column from the index.
Adding 'canonical_transcript_id' column.
Renaming 'gene_ids' to 'gene_id'.
Adding 'gene_name' column from the index.
Adding 'canonical_transcript_id' column.


In [None]:
# Put the .var columns of both objects into the shared `desired_order` layout;
# reorder_var_columns also removes the 'original_names' column.
mm = reorder_var_columns(mm, desired_order)
hs = reorder_var_columns(hs, desired_order)

Reordering .var columns.
Dropping 'original_names' column from .var.
Reordering .var columns.
Dropping 'original_names' column from .var.


In [22]:
# Gene name / gene ID / canonical transcript table for `mm`, exported to CSV below.
mm_gene_df = mm.var[['gene_name','gene_id','canonical_transcript_id']]

In [23]:
# Gene name / gene ID / canonical transcript table for `hs`, exported to CSV below.
hs_gene_df = hs.var[['gene_name','gene_id','canonical_transcript_id']]

In [25]:
# Write the mm gene table to CSV. to_csv does not create missing directories,
# so make sure the output folder exists first (no-op when it already does).
# index=False: the .var index is already stored in the 'gene_name' column.
mm_gene_csv_path = Path('./gene_transcript_csvs/mm_gene_names.csv')
mm_gene_csv_path.parent.mkdir(parents=True, exist_ok=True)
mm_gene_df.to_csv(mm_gene_csv_path, index=False)

In [33]:
# Display the mm gene table.
# NOTE(review): the saved output of this cell shows an 'Unnamed: 0' column and
# a 0..N row index, which a frame sliced from mm.var would not have. It looks
# like the cell was last run on a frame re-read from CSV (execution count 33 is
# also out of order) — re-run after Restart & Run All to confirm.
mm_gene_df

Unnamed: 0,gene_name,gene_id,canonical_transcript_id
0,Xkr4,ENSMUSG00000051951,ENSMUST00000070533
1,Gm1992,ENSMUSG00000089699,ENSMUST00000161581
2,Gm19938,ENSMUSG00000102331,ENSMUST00000192692
3,Gm37381,ENSMUSG00000102343,ENSMUST00000192427
4,Rp1,ENSMUSG00000025900,ENSMUST00000027032
...,...,...,...
32280,AC124606.1,ENSMUSG00000095523,ENSMUST00000180303
32281,AC133095.2,ENSMUSG00000095475,ENSMUST00000180208
32282,AC133095.1,ENSMUSG00000094855,ENSMUST00000178327
32283,AC234645.1,ENSMUSG00000095019,ENSMUST00000178569


In [26]:
# Write the hs gene table to CSV. to_csv does not create missing directories,
# so make sure the output folder exists first (no-op when it already does).
# index=False: the .var index is already stored in the 'gene_name' column.
hs_gene_csv_path = Path('./gene_transcript_csvs/hs_gene_names.csv')
hs_gene_csv_path.parent.mkdir(parents=True, exist_ok=True)
hs_gene_df.to_csv(hs_gene_csv_path, index=False)