# Extract transcript ID and gene name From GFF file

This is a simple script to extract transcript IDs and gene names from genomic GFF file GRCh38_latest_genomic.gff (obtained from https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml) to a tab-separated files and bed file

In [1]:
import pandas as pd

In [2]:
# Let's read in all the transcript IDs
alltranscripts = pd.read_csv("../data/GRCh38_latest_genomic_WithoutHeaderLines.tsv",header=None,sep="\t")
print alltranscripts.shape
alltranscripts.head()

(3695771, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_000001.11,RefSeq,region,1.0,248956422.0,.,+,.,ID=id0;Dbxref=taxon:9606;Name=1;chromosome=1;g...
1,NC_000001.11,BestRefSeq,pseudogene,11874.0,14409.0,.,+,.,"ID=gene0;Dbxref=GeneID:100287102,HGNC:HGNC:371..."
2,NC_000001.11,BestRefSeq,transcript,11874.0,14409.0,.,+,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:100287102,G..."
3,NC_000001.11,BestRefSeq,exon,11874.0,12227.0,.,+,.,"ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Gen..."
4,NC_000001.11,BestRefSeq,exon,12613.0,12721.0,.,+,.,"ID=id2;Parent=rna0;Dbxref=GeneID:100287102,Gen..."


In [3]:
# Only grab rows that have both a gene name and a transcript ID
transcripts_valid = alltranscripts[alltranscripts[8].str.contains("gene=")&alltranscripts[8].str.contains("transcript_id=")]
print transcripts_valid.shape
# Reset index 
transcripts_valid = transcripts_valid.reset_index(drop=True)
transcripts_valid.head()

(2097281, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_000001.11,BestRefSeq,transcript,11874.0,14409.0,.,+,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:100287102,G..."
1,NC_000001.11,BestRefSeq,exon,11874.0,12227.0,.,+,.,"ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Gen..."
2,NC_000001.11,BestRefSeq,exon,12613.0,12721.0,.,+,.,"ID=id2;Parent=rna0;Dbxref=GeneID:100287102,Gen..."
3,NC_000001.11,BestRefSeq,exon,13221.0,14409.0,.,+,.,"ID=id3;Parent=rna0;Dbxref=GeneID:100287102,Gen..."
4,NC_000001.11,BestRefSeq,transcript,14362.0,29370.0,.,-,.,"ID=rna1;Parent=gene1;Dbxref=GeneID:653635,Genb..."


In [4]:
# Create a small function to extract gene name and transcript ID for each row
def extractGeneNameAndTranscriptID(row):
    split_row = row.split(';')
    genename = [i for i in split_row if "gene=" in i][0].split('=')[1]
    transcriptID = [i for i in split_row if "transcript_id=" in i][0].split('=')[1]
    return [genename,transcriptID]

In [5]:
# Use function to extract gene name and transcript IDs
geneNamesTranscriptIDs = transcripts_valid[8].apply(extractGeneNameAndTranscriptID)

In [6]:
# Convert gene names and transcript IDs to a dataframe
geneNamesTranscriptIDs_DF = pd.DataFrame(list(geneNamesTranscriptIDs.values))
geneNamesTranscriptIDs_DF.columns = ["Gene","TranscriptID"]
geneNamesTranscriptIDs_DF.head()

Unnamed: 0,Gene,TranscriptID
0,DDX11L1,NR_046018.2
1,DDX11L1,NR_046018.2
2,DDX11L1,NR_046018.2
3,DDX11L1,NR_046018.2
4,WASH7P,NR_024540.1


In [7]:
# Drop duplicate values and write this data frame to a file
geneNamesTranscriptIDs_DF[["TranscriptID","Gene"]].drop_duplicates().to_csv("../data/GeneNameTranscriptID_NCBI_RefSeq_hg38_FromGFFfile.tsv",sep="\t",header=False,index=False)

In [8]:
# New data frame
transcripts_valid_withNames = pd.concat([transcripts_valid[[0,1,2,3,4,6]],geneNamesTranscriptIDs_DF],axis=1,ignore_index=True)
transcripts_valid_withNames.columns = ["chrm","RefSeq","feature","start","end","strand","gene","transcriptID"]
transcripts_valid_withNames.head()

Unnamed: 0,chrm,RefSeq,feature,start,end,strand,gene,transcriptID
0,NC_000001.11,BestRefSeq,transcript,11874.0,14409.0,+,DDX11L1,NR_046018.2
1,NC_000001.11,BestRefSeq,exon,11874.0,12227.0,+,DDX11L1,NR_046018.2
2,NC_000001.11,BestRefSeq,exon,12613.0,12721.0,+,DDX11L1,NR_046018.2
3,NC_000001.11,BestRefSeq,exon,13221.0,14409.0,+,DDX11L1,NR_046018.2
4,NC_000001.11,BestRefSeq,transcript,14362.0,29370.0,-,WASH7P,NR_024540.1


In [9]:
# Keep only rows that have a valid chromsome that is not a scaffold. Check if NC
transcripts_valid_withNames_validChr = transcripts_valid_withNames[transcripts_valid_withNames["chrm"].str.contains("NC")]
print transcripts_valid_withNames_validChr.shape

(2017619, 8)


In [10]:
# Dictionary to replace chromosome name 
chrms_to_replace = list(transcripts_valid_withNames_validChr["chrm"].drop_duplicates().values)
dict_chrms = {}
for i in chrms_to_replace:
    x = i.split('.')[0]
    if x[len(x)-2]=='0':
        dict_chrms[i]='chr'+x[len(x)-1]
    else:
        dict_chrms[i]='chr'+x[len(x)-2:]
print dict_chrms

{'NC_000024.10': 'chr24', 'NC_000006.12': 'chr6', 'NC_000011.10': 'chr11', 'NC_000023.11': 'chr23', 'NC_000013.11': 'chr13', 'NC_000003.12': 'chr3', 'NC_000009.12': 'chr9', 'NC_000022.11': 'chr22', 'NC_000001.11': 'chr1', 'NC_000004.12': 'chr4', 'NC_000018.10': 'chr18', 'NC_000016.10': 'chr16', 'NC_000015.10': 'chr15', 'NC_000008.11': 'chr8', 'NC_000002.12': 'chr2', 'NC_000007.14': 'chr7', 'NC_000010.11': 'chr10', 'NC_000014.9': 'chr14', 'NC_000012.12': 'chr12', 'NC_000019.10': 'chr19', 'NC_000017.11': 'chr17', 'NC_000005.10': 'chr5', 'NC_000020.11': 'chr20', 'NC_000021.9': 'chr21'}


In [11]:
# Replace chrm column with dictionary values
transcripts_valid_withNames_validChr=transcripts_valid_withNames_validChr.replace({"chrm": dict_chrms})

In [12]:
# Change start and end values to int
transcripts_valid_withNames_validChr['start'] = transcripts_valid_withNames_validChr['start'].astype('int')
transcripts_valid_withNames_validChr['end'] = transcripts_valid_withNames_validChr['end'].astype('int')
transcripts_valid_withNames_validChr.head()

Unnamed: 0,chrm,RefSeq,feature,start,end,strand,gene,transcriptID
0,chr1,BestRefSeq,transcript,11874,14409,+,DDX11L1,NR_046018.2
1,chr1,BestRefSeq,exon,11874,12227,+,DDX11L1,NR_046018.2
2,chr1,BestRefSeq,exon,12613,12721,+,DDX11L1,NR_046018.2
3,chr1,BestRefSeq,exon,13221,14409,+,DDX11L1,NR_046018.2
4,chr1,BestRefSeq,transcript,14362,29370,-,WASH7P,NR_024540.1


In [13]:
# Sort by chrm and start and end
transcripts_valid_withNames_validChr_sorted = transcripts_valid_withNames_validChr.sort_values(by=["chrm","start","end"])

In [14]:
# Write some of the columns to a file
transcripts_valid_withNames_validChr_sorted[["chrm","start","end","strand","gene","transcriptID","RefSeq","feature"]].to_csv("../data/GRCh38_latest_genomic_ValidChroms_OnlyGenes.bed",sep="\t",header=False,index=False)

In [15]:
# Let's get transcript IDs that exons 
transcripts_exons = transcripts_valid_withNames_validChr_sorted[transcripts_valid_withNames_validChr_sorted["feature"]=="exon"]
print transcripts_exons.shape
transcripts_exons.head()

(1859728, 8)


Unnamed: 0,chrm,RefSeq,feature,start,end,strand,gene,transcriptID
1,chr1,BestRefSeq,exon,11874,12227,+,DDX11L1,NR_046018.2
2,chr1,BestRefSeq,exon,12613,12721,+,DDX11L1,NR_046018.2
3,chr1,BestRefSeq,exon,13221,14409,+,DDX11L1,NR_046018.2
15,chr1,BestRefSeq,exon,14362,14829,-,WASH7P,NR_024540.1
14,chr1,BestRefSeq,exon,14970,15038,-,WASH7P,NR_024540.1


In [16]:
# Group by transcript ID to check how many exons are available for the transcript
transcripts_exons_groupedById = transcripts_exons.groupby(by=["transcriptID"]).agg('count')
print transcripts_exons_groupedById.shape
transcripts_exons_groupedById.head()

(157829, 7)


Unnamed: 0_level_0,chrm,RefSeq,feature,start,end,strand,gene
transcriptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NM_000014.5,36,36,36,36,36,36,36
NM_000015.2,2,2,2,2,2,2,2
NM_000016.5,12,12,12,12,12,12,12
NM_000017.3,10,10,10,10,10,10,10
NM_000018.3,20,20,20,20,20,20,20


In [17]:
# Only grab transcript IDs of those that have greater than 1 exon
transcripts_exons_groupedById_MoreThan1Exon = transcripts_exons_groupedById[transcripts_exons_groupedById["chrm"]>1]
print transcripts_exons_groupedById_MoreThan1Exon.shape
transcripts_exons_groupedById_MoreThan1Exon.head()

(153118, 7)


Unnamed: 0_level_0,chrm,RefSeq,feature,start,end,strand,gene
transcriptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NM_000014.5,36,36,36,36,36,36,36
NM_000015.2,2,2,2,2,2,2,2
NM_000016.5,12,12,12,12,12,12,12
NM_000017.3,10,10,10,10,10,10,10
NM_000018.3,20,20,20,20,20,20,20


In [18]:
# Grab the transcript IDs of transcripts with more than 1 exon
transcriptIDs_MoreThan1Exon = list(set(list(transcripts_exons_groupedById_MoreThan1Exon.index.values)))
len(transcriptIDs_MoreThan1Exon)

153118

In [19]:
# Get corresponding gene names
transcriptIDsGeneName_MoreThan1Exon= geneNamesTranscriptIDs_DF[geneNamesTranscriptIDs_DF["TranscriptID"].isin(transcriptIDs_MoreThan1Exon)]
print transcriptIDsGeneName_MoreThan1Exon.drop_duplicates().shape
transcriptIDsGeneName_MoreThan1Exon.drop_duplicates().head()

(153118, 2)


Unnamed: 0,Gene,TranscriptID
0,DDX11L1,NR_046018.2
4,WASH7P,NR_024540.1
18,MIR1302-2HG,XR_001737835.1
24,FAM138A,NR_026818.1
30,LOC100996442,XR_001737582.2


In [20]:
# Write data frame to a file
transcriptIDsGeneName_MoreThan1Exon.drop_duplicates().to_csv("../data/GeneNameTranscriptID_NCBI_RefSeq_hg38_FromGFFfile_OnlyOnesWithIntrons.tsv",sep="\t",header=False,index=False)