In [1]:
import pandas as pd

In [2]:
# Names for the nine columns of the header-less, tab-separated table.
col_names = [
    'seqid', 'source', 'type',
    'start', 'end', 'score',
    'strand', 'phase', 'attributes',
]
# Read the gzip-compressed file in one go. Text from a '#' character to the
# end of a line is treated as a comment and ignored; low_memory=False has
# pandas infer each column's dtype from the whole column at once.
df = pd.read_csv(
    'Homo_sapiens.GRCh38.85.gff3.gz',
    compression='gzip',
    sep='\t',
    comment='#',
    low_memory=False,
    header=None,
    names=col_names,
)

In [3]:
# Preview the first five rows to check that the columns were parsed as named.
df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,1,GRCh38,chromosome,1,248956422,.,.,.,"ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000..."
1,1,.,biological_region,10469,11240,1.3e+03,.,.,external_name=oe %3D 0.79;logic_name=cpg
2,1,.,biological_region,10650,10657,0.999,+,.,logic_name=eponine
3,1,.,biological_region,10655,10657,0.999,-,.,logic_name=eponine
4,1,.,biological_region,10678,10687,0.999,+,.,logic_name=eponine


In [4]:
# Summarise row count, per-column dtypes and memory usage of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2601849 entries, 0 to 2601848
Data columns (total 9 columns):
seqid         object
source        object
type          object
start         int64
end           int64
score         object
strand        object
phase         object
attributes    object
dtypes: int64(2), object(7)
memory usage: 178.7+ MB


In [5]:
# How many transcripts does a gene typically have? What percentage of genes have more than 1 transcript?
# How many exons, CDS, and UTRs does a transcript typically have? What sizes are they?

In [6]:
# Keep only the rows whose 'source' field is exactly 'GRCh38', then show
# how many rows and columns that leaves.
gdf = df.loc[df['source'].eq('GRCh38')]
gdf.shape

(194, 9)

In [9]:
# Keep only the rows whose 'source' is one of the three listed annotation sources.
edf = df[df['source'].isin(['ensembl', 'havana', 'ensembl_havana'])]
# A fixed random_state makes the 10-row preview identical on every re-run,
# so the displayed rows stay reproducible under Restart & Run All.
edf.sample(10, random_state=0)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
1394083,2,ensembl_havana,exon,38781517,38785000,.,+,.,Parent=transcript:ENST00000281950;Name=ENSE000...
1131521,17,havana,CDS,75763014,75763149,.,-,2,ID=CDS:ENSP00000225614;Parent=transcript:ENST0...
335228,11,havana,exon,893427,893519,.,-,.,Parent=transcript:ENST00000526714;Name=ENSE000...
1811626,3,havana,three_prime_UTR,130999518,130999659,.,+,.,Parent=transcript:ENST00000507194
795321,15,ensembl_havana,CDS,43605264,43605399,.,-,1,ID=CDS:ENSP00000401513;Parent=transcript:ENST0...
2393736,8,ensembl,CDS,120371152,120371231,.,+,2,ID=CDS:ENSP00000247781;Parent=transcript:ENST0...
1840357,3,havana,exon,178795142,178795218,.,+,.,Parent=transcript:ENST00000420517;Name=ENSE000...
1704140,22,havana,exon,50519190,50519320,.,+,.,Parent=transcript:ENST00000420993;Name=ENSE000...
2000138,5,ensembl,exon,73780676,73780745,.,+,.,Parent=transcript:ENST00000545377;Name=ENSE000...
617750,12,ensembl_havana,aberrant_processed_transcript,120996390,120997882,.,+,.,ID=transcript:ENST00000543255;Parent=gene:ENSG...


In [12]:
# Select the rows typed exactly 'transcript'. The .copy() makes `ndf` an
# independent frame, so adding columns to it later does not write into a
# slice of `edf`.
# NOTE(review): the data also contains other feature types (for example
# 'aberrant_processed_transcript') whose attributes start with
# 'ID=transcript:...;Parent=gene:...' -- confirm whether those rows should be
# included in the per-gene transcript counts as well.
ndf = edf.loc[edf['type'] == 'transcript'].copy()
# Show ten attribute strings; the fixed random_state keeps the preview
# identical across re-runs.
ndf.sample(10, random_state=0)['attributes'].values

array(['ID=transcript:ENST00000374788;Parent=gene:ENSG00000148498;Name=PARD3-003;biotype=protein_coding;ccdsid=CCDS53515.1;havana_transcript=OTTHUMT00000047528;havana_version=1;tag=basic;transcript_id=ENST00000374788;transcript_support_level=1;version=7',
       'ID=transcript:ENST00000396595;Parent=gene:ENSG00000064042;Name=LIMCH1-017;biotype=protein_coding;ccdsid=CCDS47047.1;havana_transcript=OTTHUMT00000361478;havana_version=2;tag=basic;transcript_id=ENST00000396595;transcript_support_level=1;version=7',
       'ID=transcript:ENST00000366965;Parent=gene:ENSG00000143494;Name=VASH2-005;biotype=protein_coding;ccdsid=CCDS1511.1;havana_transcript=OTTHUMT00000089686;havana_version=3;tag=basic;transcript_id=ENST00000366965;transcript_support_level=1;version=6',
       'ID=transcript:ENST00000397912;Parent=gene:ENSG00000128563;Name=PRKRIP1-001;biotype=protein_coding;ccdsid=CCDS34714.1;havana_transcript=OTTHUMT00000349483;havana_version=1;tag=basic;transcript_id=ENST00000397912;transcript_su

In [13]:
import re

In [24]:
# The key must start the string or follow a ';' so that a longer key ending in
# 'transcript_id' is not matched by accident. The value runs up to the next
# ';' or the end of the string, so a trailing ';' is not required.
RE_transcript_ID = re.compile(r'(?:^|;)transcript_id=(?P<transcript_name>[^;]+)')
def extract_transcript_name(attributes_str):
    """Return the value of the ``transcript_id`` key in an attributes string.

    Parameters
    ----------
    attributes_str : str
        Semicolon-separated ``key=value`` pairs, e.g.
        ``'ID=transcript:ENST1;transcript_id=ENST1;version=7'``.

    Returns
    -------
    str or None
        The text after ``transcript_id=`` up to the next ``;`` (or the end of
        the string), or ``None`` when the key is absent or has an empty value.
    """
    res = RE_transcript_ID.search(attributes_str)
    # search() returns None when nothing matches; report that as None instead
    # of failing with AttributeError on `.group`.
    return res.group('transcript_name') if res else None

# Parse every attributes string and store the result in a 'transcript_id' column.
ndf['transcript_id'] = ndf['attributes'].map(extract_transcript_name)

In [25]:
# Display the 'transcript_id' column to eyeball the extracted values.
ndf['transcript_id']

78         ENST00000335137
122        ENST00000493797
126        ENST00000484859
129        ENST00000490997
167        ENST00000624431
176        ENST00000623834
198        ENST00000623083
222        ENST00000624735
379        ENST00000426406
492        ENST00000332831
532        ENST00000422528
824        ENST00000420190
840        ENST00000437963
852        ENST00000342066
883        ENST00000618181
908        ENST00000622503
939        ENST00000618323
966        ENST00000616016
995        ENST00000618779
1024       ENST00000616125
1051       ENST00000620200
1072       ENST00000617307
1101       ENST00000341065
1127       ENST00000455979
1241       ENST00000327044
1331       ENST00000338591
1358       ENST00000622660
1415       ENST00000379410
1450       ENST00000379409
1483       ENST00000379407
                ...       
2600333    ENST00000382424
2600384    ENST00000382431
2600429    ENST00000382440
2600488    ENST00000382433
2600718    ENST00000306882
2600725    ENST00000382407
2

In [23]:
# The key must start the string or follow a ';'. The ID runs up to the next
# ';' or the end of the string, so a trailing ';' is not required.
RE_GENE_ID = re.compile(r'(?:^|;)Parent=gene:(?P<gene_id>ENSG[^;]+)')
def extract_gene_id(attributes_str):
    """Return the parent gene ID named in an attributes string.

    Parameters
    ----------
    attributes_str : str
        Semicolon-separated ``key=value`` pairs, e.g.
        ``'ID=transcript:ENST1;Parent=gene:ENSG1;Name=X'``.

    Returns
    -------
    str or None
        The ID following ``Parent=gene:`` (it must begin with ``ENSG``), up to
        the next ``;`` or the end of the string; ``None`` when there is no
        such entry.
    """
    res = RE_GENE_ID.search(attributes_str)
    # search() returns None when nothing matches; report that as None instead
    # of failing with AttributeError on `.group`.
    return res.group('gene_id') if res else None

In [26]:
# Parse every attributes string and store the result in a 'gene_id' column.
ndf['gene_id'] = ndf['attributes'].map(extract_gene_id)

In [27]:
# Display the 'gene_id' column to eyeball the extracted values.
ndf['gene_id']

78         ENSG00000186092
122        ENSG00000239906
126        ENSG00000241860
129        ENSG00000241860
167        ENSG00000279928
176        ENSG00000279457
198        ENSG00000279457
222        ENSG00000279457
379        ENSG00000278566
492        ENSG00000273547
532        ENSG00000229905
824        ENSG00000187634
840        ENSG00000187634
852        ENSG00000187634
883        ENSG00000187634
908        ENSG00000187634
939        ENSG00000187634
966        ENSG00000187634
995        ENSG00000187634
1024       ENSG00000187634
1051       ENSG00000187634
1072       ENSG00000187634
1101       ENSG00000187634
1127       ENSG00000187634
1241       ENSG00000188976
1331       ENSG00000187961
1358       ENSG00000187961
1415       ENSG00000187583
1450       ENSG00000187583
1483       ENSG00000187583
                ...       
2600333    ENSG00000205944
2600384    ENSG00000205944
2600429    ENSG00000205944
2600488    ENSG00000205944
2600718    ENSG00000172352
2600725    ENSG00000172352
2

In [28]:
# Preview the frame with the newly added ID columns.
# NOTE(review): the saved output below also lists a 'transcript_name' column
# that no visible cell creates -- probably left over from an earlier kernel
# session; confirm with Restart & Run All.
ndf.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transcript_name,transcript_id,gene_id
78,1,ensembl_havana,transcript,69091,70008,.,+,.,ID=transcript:ENST00000335137;Parent=gene:ENSG...,ENST00000335137,ENST00000335137,ENSG00000186092
122,1,havana,transcript,139790,140339,.,-,.,ID=transcript:ENST00000493797;Parent=gene:ENSG...,ENST00000493797,ENST00000493797,ENSG00000239906
126,1,havana,transcript,141474,149707,.,-,.,ID=transcript:ENST00000484859;Parent=gene:ENSG...,ENST00000484859,ENST00000484859,ENSG00000241860
129,1,havana,transcript,142808,146831,.,-,.,ID=transcript:ENST00000490997;Parent=gene:ENSG...,ENST00000490997,ENST00000490997,ENSG00000241860
167,1,ensembl,transcript,182393,184158,.,+,.,ID=transcript:ENST00000624431;Parent=gene:ENSG...,ENST00000624431,ENST00000624431,ENSG00000279928


In [29]:
# Remove the raw 'attributes' column. Rebinding the result (instead of
# inplace=True) together with errors='ignore' makes the cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
ndf = ndf.drop(columns='attributes', errors='ignore')

In [30]:
# Preview the first five rows of `ndf` again.
ndf.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,transcript_name,transcript_id,gene_id
78,1,ensembl_havana,transcript,69091,70008,.,+,.,ENST00000335137,ENST00000335137,ENSG00000186092
122,1,havana,transcript,139790,140339,.,-,.,ENST00000493797,ENST00000493797,ENSG00000239906
126,1,havana,transcript,141474,149707,.,-,.,ENST00000484859,ENST00000484859,ENSG00000241860
129,1,havana,transcript,142808,146831,.,-,.,ENST00000490997,ENST00000490997,ENSG00000241860
167,1,ensembl,transcript,182393,184158,.,+,.,ENST00000624431,ENST00000624431,ENSG00000279928


In [36]:
# Number of non-null transcript IDs per gene. The column is selected before
# counting so the remaining columns are not counted and then thrown away.
# Despite its name, `count_df` is a Series named 'transcript_id', indexed by
# gene_id.
count_df = ndf.groupby('gene_id')['transcript_id'].count()

In [42]:
# `count_df` is already a Series named 'transcript_id', so summarise it
# directly instead of wrapping it in a DataFrame and selecting the column back.
count_df.describe()

count    30400.00000
mean         3.17023
std          3.64185
min          1.00000
25%          1.00000
50%          2.00000
75%          4.00000
max        167.00000
Name: transcript_id, dtype: float64