# Protein Analysis of Covid 19 using Biopython

<b>Objective:</b>
   - In this notebook I will explore the DNA sequence of Covid-19 using Biopython, a powerful bioinformatics package.
   - I will do a simple protein synthesis of Coronavirus with Python.

<b>Sample DNA Sequence</b>
   - I am using a short sample DNA sequence here to demonstrate sequence analysis, transcription and translation.

In [1]:
# Load the Biopython package and the standard-library warnings module
import warnings

import Bio

# Suppress all warnings raised from here on
warnings.filterwarnings('ignore')

In [2]:
# List the names defined in the top-level Bio package namespace
dir(Bio)

 'MissingExternalDependencyError',
 'MissingPythonDependencyError',
 'StreamModeError',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_parent_dir',
 'os',

In [3]:
#Working with Sequence
from Bio.Seq import Seq

In [4]:
# List the attributes and methods available on the Seq class
dir(Seq)

['__abstractmethods__',
 '__add__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 'back_transcribe',
 'complement',
 'complement_rna',
 'count',
 'count_overlap',
 'encode',
 'endswith',
 'find',
 'index',
 'join',
 'lower',
 'lstrip',
 'replace',
 'reverse_complement',
 'reverse_complement_rna',
 'rfind',
 'rindex',
 'rsplit',
 'rstrip',
 'split',
 'startswith',
 'strip',
 'tomutable',
 'transcribe',
 'translate',
 'ungap',
 'upper']

In [5]:
# Create a short sample DNA sequence and display it
dna = Seq('ATGATCTCGTAA')
dna

Seq('ATGATCTCGTAA')

In [6]:
# Confirm that the object is a Biopython Seq rather than a plain string
type(dna)

Bio.Seq.Seq

In [7]:
# print() shows only the sequence letters, without the Seq(...) wrapper
print(dna)

ATGATCTCGTAA


<b>Sequence Manipulation</b>
   - Indexing/Slicing
   - Join 2 Sequences
   - Count the number of Nucleotides

In [8]:
# Slicing: take the first three nucleotides (the result is a new Seq)
dna[0:3]

Seq('ATG')

In [9]:
# Create a second sequence to concatenate with the first one
Seq2 = Seq('ATTGTCTCGTAA')
           

In [10]:
# Join the first three nucleotides of dna with Seq2 to form a new Seq
dna[0:3] + Seq2

Seq('ATGATTGTCTCGTAA')

In [11]:
# dna still holds the original sequence; slicing and concatenation returned new objects
dna

Seq('ATGATCTCGTAA')

In [12]:
# Count how many times the nucleotide T occurs in the sequence
dna.count('T')

4

In [13]:
# Count occurrences of the two-nucleotide motif 'TC' (not a codon: a codon is 3 nucleotides)
dna.count('TC')

2

In [14]:
# Count occurrences of the codon 'ATG' (3 nucleotides)
dna.count('ATG')

1

In [15]:
# Length of the sequence in nucleotides
len(dna)

12

In [16]:
# Index (0-based) of the first occurrence of nucleotide C in the sequence
dna.find('C')

5

In [17]:
# Show the original sequence so it can be compared with its complement
dna

Seq('ATGATCTCGTAA')

In [18]:
# Complement: swap each base for its pair (A<->T, G<->C)
dna.complement()

Seq('TACTAGAGCATT')

In [19]:
# Reverse complement: the complement read in reverse order
dna.reverse_complement()

Seq('TTACGAGATCAT')

In [20]:
# The original sequence is unchanged; complement() and reverse_complement() returned new objects
dna

Seq('ATGATCTCGTAA')

In [21]:
# Transcribe DNA to mRNA (each T becomes U)
dna.transcribe()

Seq('AUGAUCUCGUAA')

In [22]:
# Keep the transcribed mRNA so it can be translated
mRNA = dna.transcribe()

In [23]:
# Translate mRNA to protein; '*' in the result marks a stop codon
mRNA.translate()

Seq('MIS*')

# Analysis of Covid 19

<b>Task</b>
   - Analysis of Covid 19 genome
   - Sequence analysis(DNA,RNA)
   - Transcription and Translation of DNA(Protein synthesis)
   - 3D structure analysis

<b>Nucleotides</b> (a codon is a group of three nucleotides)
   - A Adenine
   - C Cytosine
   - G Guanine
   - T Thymine
   - U Uracil (replaces Thymine in RNA)
  

In [24]:
from Bio import SeqIO

In [25]:
# Iterate over the records in the FASTA file and print each record's ID
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook
for record in SeqIO.parse(r"C:\Users\karthik\Desktop\shero\sequence.fasta","fasta"):
    print(record.id)

MN908947.3


In [26]:
# Iterate over the FASTA records again, this time printing each full record summary
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook
for record in SeqIO.parse(r"C:\Users\karthik\Desktop\shero\sequence.fasta","fasta"):
    print(record)

ID: MN908947.3
Name: MN908947.3
Description: MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Number of features: 0
Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')


In [27]:
# Read the sequence record in the file into a single SeqRecord
# NOTE(review): SeqIO.read presumably requires exactly one record in the file — confirm for other inputs
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook
ncov_record = SeqIO.read(r"C:\Users\karthik\Desktop\shero\sequence.fasta","fasta")

In [28]:
# Display the SeqRecord (sequence plus id, name and description)
ncov_record

SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA'), id='MN908947.3', name='MN908947.3', description='MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[])

In [29]:
# Extract the Seq object (the nucleotide sequence) from the record
ncov_dna = ncov_record.seq

In [30]:
# Display the nucleotides (the repr of a long sequence is truncated with '...')
ncov_dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [31]:
# Length of the genome sequence in nucleotides
len(ncov_dna)

29903

In [32]:
# Protein synthesis: DNA -> mRNA -> protein
# Start from the DNA sequence read from the FASTA record
ncov_dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [33]:
# Transcription: DNA to mRNA
ncov_mrna = ncov_dna.transcribe()

In [34]:
# Display the mRNA: thymine (T) has been replaced by uracil (U)
ncov_mrna

Seq('AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGAUCUCUUGU...AAA')

In [35]:
# Translation: mRNA to an amino-acid sequence, read as consecutive codons from the first base
# NOTE(review): the sequence length (29903) is not a multiple of 3, so the trailing
# partial codon is presumably dropped — confirm against the Biopython docs
ncov_protein = ncov_mrna.translate()

In [36]:
# Display the translated sequence; '*' marks stop codons
ncov_protein

Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK')

In [37]:
# Length of the translated sequence (amino acids plus '*' stop symbols)
len(ncov_protein)

9967

In [38]:
# Split the translated sequence at each stop symbol ('*') into separate amino-acid fragments
ncov_aa = ncov_protein.split("*")

In [39]:
# Display the list of fragments (empty Seq entries come from consecutive stop symbols)
# NOTE(review): this prints the whole list; a slice such as ncov_aa[:10] would keep the output short
ncov_aa

[Seq('IKGLYLPR'),
 Seq('QTNQLSISCRSVL'),
 Seq('TNFKICVAVTRLHA'),
 Seq('CTHAV'),
 Seq('LITNYCR'),
 Seq('QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER'),
 Seq('DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS'),
 Seq('RWHLWLSRS'),
 Seq('KRRFAST'),
 Seq('TALCVHQTFGCSNCTSWSCYG'),
 Seq('AGSRTRRHSVRS'),
 Seq('W'),
 Seq('DTWCPCPSCGRNTSGLPQGSSS'),
 Seq('ER'),
 Seq(''),
 Seq('RSWWP'),
 Seq('LRRRSKVI'),
 Seq('LRRRAWH'),
 Seq('SL'),
 Seq('RFSRKLEH'),
 Seq('T'),
 Seq('QWCYP'),
 Seq('THA'),
 Seq('A'),
 Seq('RRGIHSLCR'),
 Seq('QLLWP'),
 Seq('WLPS'),
 Seq('VH'),
 Seq('RPSSTCW'),
 Seq('SFMHFVRTTGLY'),
 Seq('H'),
 Seq('EGCILLP'),
 Seq('T'),
 Seq('A'),
 Seq('NCLVHGTF'),
 Seq('KEL'),
 Seq('IADTF'),
 Seq('N'),
 Seq('IGKEI'),
 Seq('HLQWGMSKFCISLKFHNQDYSTKG'),
 Seq('KEKA'),
 Seq('WLYG'),
 Seq('NSICLSSCVTK'),
 Seq('MQPNVPFNSHEV'),
 Seq('SLW'),
 Seq('NFMADGRFC'),
 Seq('SHLRILWH'),
 Seq('EFD'),
 Seq('RRCHYLWLLTPKCCC'),
 Seq('NLLSSMSQFRSRT'),
 Seq('A'),
 Seq('SCRIP'),
 Seq(''),
 Seq('IWLENHSS'),
 Seq('GWSHYCLWRLCVLLCWLP'),

In [40]:
# Convert every Seq fragment to a plain Python string
ncov_clean = list(map(str, ncov_aa))

In [41]:
# Display the fragments as plain strings
# NOTE(review): this prints the whole list; a slice such as ncov_clean[:10] would keep the output short
ncov_clean

['IKGLYLPR',
 'QTNQLSISCRSVL',
 'TNFKICVAVTRLHA',
 'CTHAV',
 'LITNYCR',
 'QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER',
 'DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS',
 'RWHLWLSRS',
 'KRRFAST',
 'TALCVHQTFGCSNCTSWSCYG',
 'AGSRTRRHSVRS',
 'W',
 'DTWCPCPSCGRNTSGLPQGSSS',
 'ER',
 '',
 'RSWWP',
 'LRRRSKVI',
 'LRRRAWH',
 'SL',
 'RFSRKLEH',
 'T',
 'QWCYP',
 'THA',
 'A',
 'RRGIHSLCR',
 'QLLWP',
 'WLPS',
 'VH',
 'RPSSTCW',
 'SFMHFVRTTGLY',
 'H',
 'EGCILLP',
 'T',
 'A',
 'NCLVHGTF',
 'KEL',
 'IADTF',
 'N',
 'IGKEI',
 'HLQWGMSKFCISLKFHNQDYSTKG',
 'KEKA',
 'WLYG',
 'NSICLSSCVTK',
 'MQPNVPFNSHEV',
 'SLW',
 'NFMADGRFC',
 'SHLRILWH',
 'EFD',
 'RRCHYLWLLTPKCCC',
 'NLLSSMSQFRSRT',
 'A',
 'SCRIP',
 '',
 'IWLENHSS',
 'GWSHYCLWRLCVLLCWLP',
 'QVCLLGSTC',
 'R',
 'HRL',
 'PYRCCWRRFRRS',
 '',
 'QPS',
 'NTPKRESQHQYCW',
 'L',
 'T',
 '',
 'RDRHYFGIFFCFHKCFCGNCERFGL',
 'SIQTNC',
 'ILW',
 'F',
 'SYKRKS',
 'KRCLEYW',
 'TEINTESSLCICIRGCSCCTINFLPHS',
 'NCSKFCACFTEGRYNNTRWNFTVFTETH',
 'CYDVHI',
 'FGY',
 'QSSCNGLHYRWCCSVDF

In [42]:
# Place the Amino Acids into a DataFrame
import pandas as pd


In [43]:
# Build a one-column DataFrame holding the fragment strings, then display it
df = pd.DataFrame(ncov_clean, columns=["amino_acids"])
df

Unnamed: 0,amino_acids
0,IKGLYLPR
1,QTNQLSISCRSVL
2,TNFKICVAVTRLHA
3,CTHAV
4,LITNYCR
...,...
770,SHIAIFNQCVTLGRT
771,KSHHIFTEATRSTIECTVNNARESCLYGRALMCKINFSSAIPM
772,F
773,


In [44]:
# Add a "count" column holding the length (number of amino acids) of each fragment
df = df.assign(count=df["amino_acids"].str.len())

In [45]:
# Preview the first five rows, now including the length column
df.head()

Unnamed: 0,amino_acids,count
0,IKGLYLPR,8
1,QTNQLSISCRSVL,13
2,TNFKICVAVTRLHA,14
3,CTHAV,5
4,LITNYCR,7


In [46]:
# Show the ten longest fragments (stretches between stop symbols)
df.nlargest(n=10, columns="count")

Unnamed: 0,amino_acids,count
548,CTIVFKRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFL...,2701
694,ASAQRSQITLHINELMDLFMRIFTIGTVTLKQGEIKDATPSDFVRA...,290
719,TNMKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS...,123
695,AQADEYELMYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALR...,83
718,QQMFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL...,63
6,DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS,46
464,TMLRCYFPKCSEKNNQGYTPLVVTHNFDFTFSFSPEYSMVFVLFFV,46
539,DVVYTHWYWSGNNSYTGSQYGSRILWWCIVLSVLPLPHRSSKS,43
758,LQTLAANCTICPQRFSVLRNVAHWHGSHTFGNVVDLHRCHQIG,43
771,KSHHIFTEATRSTIECTVNNARESCLYGRALMCKINFSSAIPM,43


<b>3D Structure of Covid</b>
 - File Format
   - pdb : PDBParser() legacy
   - cif : MMCIFParser() recent
   
 - links
   - https://www.ncbi.nlm.nih.gov/Structure/pdb/6LU7
   - Protein Data Bank 


In [47]:
#Import Parser
from Bio.PDB import PDBParser,MMCIFParser

In [48]:
# Read a PDB file into a Structure object
parser = PDBParser()
# First argument is the id given to the structure, second is the file to parse
structure = parser.get_structure("6lu7.pdb","6lu7.pdb")

In [49]:
# Display the parsed Structure object
structure

<Structure id=6lu7.pdb>

In [50]:
# Number of models contained in the structure
len(structure)

1

In [51]:
# Take the first model (index 0) of the structure
model = structure[0]

In [52]:
# Print every chain contained in the model
for chain in model:
    print(chain)

<Chain id=A>
<Chain id=C>


<b>Visualizing the 3D structure</b>
   - using nglview
   - py3Dmol

In [53]:
# View our 3D Structure
import nglview as nv



In [54]:
# Render nglview's demo widget to check that the viewer works
nv.demo()

NGLWidget()

In [55]:
# Covid 3D structure: build an nglview widget from the parsed Biopython structure
view = nv.show_biopython(structure)

In [56]:
# Display the nglview widget
view

NGLWidget()

In [57]:
#using py3Dmodel
import py3Dmol

In [58]:
# Create a py3Dmol viewer for the query '6LU7'
# NOTE(review): presumably fetched online as PDB entry 6LU7 — confirm against the py3Dmol docs
view2 = py3Dmol.view(query = '6LU7')

In [59]:
# Draw the structure as a cartoon coloured with the 'spectrum' scheme
view2.setStyle({'cartoon':{'color':'spectrum'}})


<py3Dmol.view at 0x74ac328ac0>

In [60]:
# NOTE(review): presumably renders the current view as an image — confirm against the py3Dmol docs
view2.render_image()

<py3Dmol.view at 0x74ac328ac0>

<b>Conclusion</b>
   - Analysed the protein synthesis of Covid 19 using bioinformatics tools and Python libraries (Biopython, pandas, nglview, py3Dmol).
