In [1]:
from Bio import SeqIO

In [5]:
# Stream every record found in the FASTA file and print its summary
fasta_records = SeqIO.parse("sequence.fasta", "fasta")
for record in fasta_records:
    print(record)

ID: MN908947.3
Name: MN908947.3
Description: MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Number of features: 0
Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')


In [6]:
# Load the one record stored in the FASTA file. SeqIO.read raises ValueError
# unless the file contains exactly one record.
ncov_record = SeqIO.read("sequence.fasta", "fasta")

In [7]:
# Show the SeqRecord: the sequence together with its id, name and description
ncov_record

SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA'), id='MN908947.3', name='MN908947.3', description='MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[])

In [9]:
# Pull the nucleotide sequence (a Seq object) out of the record and display it
ncov_dna = ncov_record.seq
ncov_dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [10]:
# Length of the SARS-CoV-2 genome sequence, in nucleotides
len(ncov_dna)

29903

In [11]:
# Protein synthesis: DNA => mRNA => protein (amino acids)
# Step 1, transcription: transcribe() returns a new Seq in which every T of
# the DNA sequence is replaced by U.
ncov_mrna = ncov_dna.transcribe()
ncov_mrna

Seq('AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGAUCUCUUGU...AAA')

In [12]:
# Step 2, translation: convert the mRNA into amino acids with Biopython's
# default codon table; "*" in the result marks a stop codon.
#
# Translation consumes three bases per codon, so any trailing bases that do
# not form a complete codon are trimmed first. Translating a sequence whose
# length is not a multiple of three makes Biopython emit a "Partial codon"
# BiopythonWarning (documented as something that may become an error), while
# the leftover bases are not translated anyway -- so the protein is unchanged.
ncov_protein = ncov_mrna[: len(ncov_mrna) // 3 * 3].translate()
ncov_protein



Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK')

In [13]:
# Number of symbols in the translated sequence (stop markers "*" included)
len(ncov_protein)

9967

In [14]:
# A codon is 3 nucleotides, so genome length / 3 gives the number of codons.
# A fractional result means the length is not a multiple of 3, i.e. the
# trailing bases do not form a complete codon.
len(ncov_dna)/3

9967.666666666666

In [20]:
# Split the translated sequence at every stop marker ("*") to obtain the
# amino-acid fragments lying between consecutive stop codons
ncov_aa = ncov_protein.split("*")

In [22]:
# Turn each Seq fragment into a plain Python string, then count the fragments
ncov_clean = list(map(str, ncov_aa))
len(ncov_clean)

775

In [23]:
# Using Pandas
import  pandas as pd

In [24]:
# One row per amino-acid fragment, held in a single "amino_acids" column
df = pd.DataFrame(ncov_clean, columns=["amino_acids"])

In [25]:
# Attach the length of every fragment as a "count" column and preview the frame
df = df.assign(count=df["amino_acids"].str.len())
df.head()

Unnamed: 0,amino_acids,count
0,IKGLYLPR,8
1,QTNQLSISCRSVL,13
2,TNFKICVAVTRLHA,14
3,CTHAV,5
4,LITNYCR,7


In [28]:
# The ten longest fragments, ranked by their "count" (length) column
df.nlargest(n=10, columns="count")

Unnamed: 0,amino_acids,count
548,CTIVFKRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFL...,2701
694,ASAQRSQITLHINELMDLFMRIFTIGTVTLKQGEIKDATPSDFVRA...,290
719,TNMKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS...,123
695,AQADEYELMYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALR...,83
718,QQMFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL...,63
6,DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS,46
464,TMLRCYFPKCSEKNNQGYTPLVVTHNFDFTFSFSPEYSMVFVLFFV,46
539,DVVYTHWYWSGNNSYTGSQYGSRILWWCIVLSVLPLPHRSSKS,43
758,LQTLAANCTICPQRFSVLRNVAHWHGSHTFGNVVDLHRCHQIG,43
771,KSHHIFTEATRSTIECTVNNARESCLYGRALMCKINFSSAIPM,43


In [29]:
# Count the frequency of each amino acid
from collections import Counter 

In [30]:
# Ten most frequent symbols in the translated sequence. Note that the stop
# marker "*" is tallied alongside the amino-acid letters.
Counter(ncov_protein).most_common(10)

[('L', 886),
 ('S', 810),
 ('*', 774),
 ('T', 679),
 ('C', 635),
 ('F', 593),
 ('R', 558),
 ('V', 548),
 ('Y', 505),
 ('N', 472)]

In [31]:
# Import our Parser
from Bio.PDB import PDBParser

In [32]:
# Read the PDB file into a Bio.PDB structure: the first argument is the id
# given to the returned structure, the second is the file to parse.
parser = PDBParser()
structure = parser.get_structure("mmdb_6LU7","mmdb_6LU7.pdb")



In [34]:
# Number of models contained in the parsed structure
len(structure)

4

In [35]:
# Select the first model (index 0) of the structure
model = structure[0]

In [36]:
# List every chain in the selected model
for chain in model:
    print(chain)

<Chain id=A>


In [37]:
# 3D visualization
import nglview as nv



In [42]:
# Build an interactive NGL widget from the Biopython structure; leaving
# `view` as the last expression of the cell renders it in the notebook.
view = nv.show_biopython(structure)
view

NGLWidget()