In [2]:
# Import packages 
import pandas as pd
import numpy as np
import re


# Load the transcriptome GTF: tab-separated, no header row, '#' starts a comment.
df = pd.read_csv('../B_transcriptome/cc_transcriptome_all.gtf', sep='\t', comment='#', header=None)
# Name the nine columns of the file.
df.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
# Keep only the exon lines; .copy() makes this an independent frame so the
# column changes below do not hit pandas' SettingWithCopyWarning.
df = df.loc[df['feature'] == 'exon'].copy()

# One capture group per attribute key. reference_id and ref_gene_id sit in
# optional groups, so they come out as NaN on rows that do not carry them.
pattern = r'\s*gene_id\s+"([^"]+)";\s+transcript_id\s+"([^"]+)";\s+exon_number\s+"([^"]+)";(?:\s+reference_id\s+"([^"]+)";)?(?:\s+ref_gene_id\s+"([^"]+)";)?\s+cov\s+"([^"]+)";'
attribute_names = ['gene_id', 'transcript_id', 'exon_number', 'reference_id', 'ref_gene_id', 'cov']

# str.extract returns exactly one column per capture group, so no placeholder
# columns have to be created and dropped afterwards.
attribute_fields = df['attribute'].str.extract(pattern)
attribute_fields.columns = attribute_names

# A row that does not match the pattern would end up with a missing
# transcript_id and be silently left out of the groupby below; fail loudly.
unmatched = attribute_fields['transcript_id'].isna()
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} exon line(s) have an attribute column that does not match "
        f"the expected layout, e.g.: {df.loc[unmatched, 'attribute'].iloc[0]!r}"
    )

# Replace the raw attribute string by the parsed columns (aligned on the row index).
df = df.drop(columns=['attribute']).join(attribute_fields)

# GTF coordinates are 1-based and inclusive on both ends, hence the +1.
df['length'] = df['end'] - df['start'] + 1

# Total exon length per transcript.
exon_total_length = df.groupby('transcript_id')['length'].sum()

# Export as TSV with the columns transcript_id and length.
exon_total_length.to_csv('../B_transcriptome/transcriptome_transcript_lengths.tsv', sep='\t', header=True)

In [1]:
# Import packages 
import pandas as pd
import numpy as np
import re

# Load the annotation GTF: tab-separated, no header row, '#' starts a comment.
df = pd.read_csv('../A_annotation/carcar_annotation_v5.gtf', sep='\t', comment='#', header=None)
# Name the nine columns of the file.
df.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
# Keep only the exon lines; .copy() makes this an independent frame so the
# column changes below do not hit pandas' SettingWithCopyWarning.
df = df.loc[df['feature'] == 'exon'].copy()

# One capture group per attribute key, in the order gene_id, transcript_id, ID, Parent.
pattern = r'\s*gene_id\s+"([^"]+)";\s+transcript_id\s+"([^"]+)";\s+ID\s+"([^"]+)";\s+Parent\s+"([^"]+)";'
attribute_names = ['gene_id', 'transcript_id', 'ID', 'Parent']

# str.extract returns exactly one column per capture group, so no placeholder
# columns have to be created and dropped afterwards.
attribute_fields = df['attribute'].str.extract(pattern)
attribute_fields.columns = attribute_names

# A row that does not match the pattern would end up with a missing
# transcript_id and be silently left out of the groupby below; fail loudly.
unmatched = attribute_fields['transcript_id'].isna()
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} exon line(s) have an attribute column that does not match "
        f"the expected layout, e.g.: {df.loc[unmatched, 'attribute'].iloc[0]!r}"
    )

# Replace the raw attribute string by the parsed columns (aligned on the row index).
df = df.drop(columns=['attribute']).join(attribute_fields)

# GTF coordinates are 1-based and inclusive on both ends, hence the +1.
df['length'] = df['end'] - df['start'] + 1

# Total exon length per transcript.
exon_total_length = df.groupby('transcript_id')['length'].sum()

# Export as TSV with the columns transcript_id and length.
exon_total_length.to_csv('../A_annotation/carcar_annotation_v5_transcript_lengths.tsv', sep='\t', header=True)