In [1]:
import math
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio import SeqIO # To parse a FASTA file
# Maps each amino acid (three-letter code, upper case) to the list of codons
# that encode it; the three stop codons are grouped under the key 'STOP'.
# Later cells iterate over this dictionary to group per-codon index values
# by amino acid.
SynonymousCodons = {'CYS': ['TGT', 'TGC'], 'ASP': ['GAT', 'GAC'],
'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
'GLN': ['CAA', 'CAG'], 'MET': ['ATG'], 'ASN': ['AAC', 'AAT'],
'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 'LYS': ['AAG', 'AAA'],
'STOP': ['TAG', 'TGA', 'TAA'], 'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
'PHE': ['TTT', 'TTC'], 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 'ILE': ['ATC', 'ATA', 'ATT'],
'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 'HIS': ['CAT', 'CAC'],
'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 'TRP': ['TGG'],
'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 'GLU': ['GAG', 'GAA'], 'TYR': ['TAT', 'TAC']}

# Codon -> one-letter amino acid code for all 64 DNA codons.
# The stop codons (TAA, TAG, TGA) map to '_'.
# NOTE(review): not referenced by any cell visible in this notebook - confirm
# whether it is still needed.
gencode = { 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
            'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
            'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
            'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}


In [3]:
# Copy the GRCh38 CDS FASTA into the working directory, then list the directory.
# NOTE(review): the source is a hard-coded absolute path, so this cell only
# works on a machine where that path exists.
!cp /scratch/research/projects/hsapiens/mutability/analysis/Aqsa/Human_exon_data/Homo_sapiens.GRCh38.cds.all.fa ./
!ls

Homo_sapiens.GRCh38.cds.all.fa	Untitled.ipynb


In [2]:
# List the working directory to check which FASTA files are present.
!ls

Homo_sapiens.GRCh38.cds.all.fa	   Human.RSCU.ipynb
Homo_sapiens.GRCh38.cds.intact.fa


# Filter the CDS file

The script requires that CDSs be in the +1 frame. To ensure this, I discard every CDS that fails any of the checks listed below.

Note that there are a crazy number of CDSs

In [3]:
# Count the lines containing ">" (FASTA header lines), i.e. the number of
# CDS records in the unfiltered file.
!grep ">" Homo_sapiens.GRCh38.cds.all.fa |wc -l

111060


111,000 CDSs is a lot more than the expected 20,000-30,000 genes in the human genome!

So here I take that file and ask:

 - Is it from a protein-coding gene (gene_biotype == 'protein_coding')?
 - I only include each gene ID once to avoid double counting
 - I demand the length be evenly divisible by 3
 - I demand it starts with a start codon
 - I demand it ends with a stop codon
 - I demand it has no ambiguous bases (`N`s)
 
I then write these sequences to a file with "intact" in the file name 

In [13]:
from Bio import SeqIO

# Keep one intact CDS per protein-coding gene and write the result to a new
# FASTA file. "Intact" means: length divisible by 3, starts with ATG, ends
# with a stop codon, and contains no ambiguous 'N' bases.
cdsfile = 'Homo_sapiens.GRCh38.cds.all.fa'
stop_codons = ("TAA", "TAG", "TGA")

intact_genes = []     # SeqRecords that passed every check, in file order
used_genes = []       # gene IDs already represented in intact_genes, in file order
seen_gene_ids = set() # same IDs as used_genes; gives O(1) membership tests

for gene in SeqIO.parse(cdsfile, 'fasta'):
    # The header carries space-separated "key:value" fields; take the token
    # that follows the last 'gene:' / 'gene_biotype:' marker.
    gene_id = gene.description.split('gene:')[-1].split()[0]
    gene_biotype = gene.description.split('gene_biotype:')[-1].split()[0]
    if gene_biotype != 'protein_coding':
        continue
    # A gene is only marked as used once one of its CDSs is accepted, so a
    # later CDS of the same gene can still be kept if earlier ones failed.
    if gene_id in seen_gene_ids:
        continue

    seq = str(gene.seq)
    if len(seq) % 3 == 0 \
            and seq[:3] == "ATG" \
            and seq[-3:] in stop_codons \
            and 'N' not in seq:
        intact_genes.append(gene)
        used_genes.append(gene_id)
        seen_gene_ids.add(gene_id)

print(len(intact_genes))
# Pass the file name rather than an open handle so SeqIO.write opens and
# closes the file itself; the output is fully flushed before later cells read it.
SeqIO.write(intact_genes, "Homo_sapiens.GRCh38.cds.intact.fa", 'fasta')
    

22185


22185

# Calculate CAI index
The codon adaptation index (CAI) object includes the calculation of relative synonymous codon usage (RSCU) 

 - RSCU determines the most commonly used codon for a given AA    
 - It then reports all other codons for that AA as a fraction of the most common
 - Therefore each AA has one codon with a value of 1.0 and all others must range from 0-1 (1 in the case of a tie)
 
 
 ### Caveat: This is codon usage bias across all genes.
  - Some people measure the "preferred codon" in other ways.
         - e.g. The preferred codon has been described as a codon whose usage is much higher in high-expression genes than in low-expression genes. The logic is that high-expression genes are under the most pressure to be efficiently transcribed and translated, and should therefore use only the best codons on offer.
         
         
  There may be other PUBLISHED codon use indices we can check:
  
e.g. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5581930/   

    and its associated DB 
        https://hive.biochemistry.gwu.edu/dna.cgi?cmd=tissue_codon_usage&id=586358&mode=cocoputs

In [6]:
# Build the codon index from the filtered ("intact") CDS FASTA file.
# After this call, cai.index is looked up by codon string in the cells below.
# NOTE(review): Bio.SeqUtils.CodonUsage has been deprecated in newer Biopython
# releases - confirm against the installed version.
cai = CodonAdaptationIndex()
cai.generate_index("Homo_sapiens.GRCh38.cds.intact.fa")

In [17]:
# Print each amino acid followed by one tab-indented line per synonymous
# codon, showing the codon's index value rounded to two decimals.
for amino_acid, synonymous in SynonymousCodons.items():
    print(amino_acid)
    for triplet in synonymous:
        value = round(cai.index[triplet], 2)
        print("\t {} \t {}".format(triplet, value))
    # print adds its own newline, so this leaves two blank lines between groups.
    print("\n")

ILE
	 ATC 	 1.0
	 ATA 	 0.36
	 ATT 	 0.77


TYR
	 TAT 	 0.8
	 TAC 	 1.0


PHE
	 TTT 	 0.84
	 TTC 	 1.0


GLU
	 GAG 	 1.0
	 GAA 	 0.73


ASP
	 GAT 	 0.86
	 GAC 	 1.0


ARG
	 CGA 	 0.51
	 CGC 	 0.88
	 CGG 	 0.97
	 CGT 	 0.38
	 AGG 	 1.0
	 AGA 	 1.0


SER
	 TCT 	 0.77
	 TCG 	 0.23
	 TCA 	 0.63
	 TCC 	 0.9
	 AGC 	 1.0
	 AGT 	 0.62


GLY
	 GGT 	 0.47
	 GGG 	 0.75
	 GGA 	 0.73
	 GGC 	 1.0


GLN
	 CAA 	 0.36
	 CAG 	 1.0


STOP
	 TAG 	 0.43
	 TGA 	 1.0
	 TAA 	 0.53


CYS
	 TGT 	 0.84
	 TGC 	 1.0


ALA
	 GCA 	 0.56
	 GCC 	 1.0
	 GCG 	 0.27
	 GCT 	 0.64


LYS
	 AAG 	 1.0
	 AAA 	 0.77


MET
	 ATG 	 1.0


TRP
	 TGG 	 1.0


VAL
	 GTA 	 0.25
	 GTC 	 0.51
	 GTG 	 1.0
	 GTT 	 0.39


THR
	 ACC 	 1.0
	 ACA 	 0.81
	 ACG 	 0.32
	 ACT 	 0.7


HIS
	 CAT 	 0.72
	 CAC 	 1.0


PRO
	 CCT 	 0.87
	 CCG 	 0.36
	 CCA 	 0.83
	 CCC 	 1.0


ASN
	 AAC 	 1.0
	 AAT 	 0.89


LEU
	 TTA 	 0.19
	 TTG 	 0.32
	 CTC 	 0.49
	 CTT 	 0.33
	 CTG 	 1.0
	 CTA 	 0.18




In [10]:
# codon -> [amino acid, index value rounded to two decimals], built in the
# same order as SynonymousCodons and its codon lists.
codon_usage_dict = {
    triplet: [amino_acid, round(cai.index[triplet], 2)]
    for amino_acid, synonymous in SynonymousCodons.items()
    for triplet in synonymous
}

In [12]:
import pandas as pd

In [17]:
# One row per codon (the dict keys become the index); the two list elements
# of each value become columns 0 and 1.
codon_usage_df = pd.DataFrame.from_dict(codon_usage_dict,orient="index")


In [29]:
# Give the positional columns descriptive names: 0 -> 'AA', 1 -> 'codon_usage'.
# Reference: https://note.nkmk.me/en/python-pandas-dataframe-rename/
codon_usage_df = codon_usage_df.rename(columns={0: 'AA',1:"codon_usage"})

In [31]:
# Save the table to CSV in the working directory. With to_csv's defaults the
# index (the codons) is written as the first, unlabelled column.
codon_usage_df.to_csv("codon_usage_df_2022_01_02.csv")