In [1]:
import os
import sys
import glob
import scipy
import sklearn
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

## Make new genes table

In [30]:
import inStrain
import Bio
import logging
from Bio import SeqIO

def parse_genbank_genes_COV(gene_file, gene_name='gene'):
    table = defaultdict(list)
    gene2sequence = {}
    gene2aa = {}
    for record in SeqIO.parse(gene_file, 'gb'):
        scaffold = record.id
        for feature in record.features:
            if 'translation' in feature.qualifiers:
                sequence = feature.location.extract(record).seq
                
                gene = feature.qualifiers[gene_name][0]
                ID = feature.qualifiers['protein_id'][0]
                loc = feature.location
                if type(loc) is Bio.SeqFeature.CompoundLocation:
                    partial = 'compound'
                else:
                    partial = False

                table['type'].append(feature.type)
                table['gene'].append(gene)
                table['scaffold'].append(scaffold)
                table['direction'].append(feature.location.strand)
                table['product'].append(feature.qualifiers['product'][0])
                table['translated'].append(True)
                table['ID'].append(ID)

                table['start'].append(loc.start)
                table['end'].append(loc.end - 3)
                table['location'].append(feature.location)
                table['stop_included'].append(True)

                #try:
                gene2sequence[ID] = sequence
                TransSeq = str(sequence.translate()[:-1])
                assert TransSeq == str(feature.qualifiers['translation'][0]), [TransSeq, str(feature.qualifiers['translation'][0])]
                gene2aa[ID] = TransSeq

            elif ('UTR' in feature.type) | ('stem_loop' in feature.type):
                loc = feature.location
                gene = feature.type
                try:
                    ID = feature.qualifiers['locus_tag'][0] + '_stem_loop'
                except:
                    ID = feature.type
                
                table['type'].append(feature.type)
                table['gene'].append(gene)
                table['scaffold'].append(scaffold)
                table['direction'].append(feature.location.strand)
                table['product'].append(np.nan)
                table['translated'].append(False)
                table['ID'].append(ID)

                table['start'].append(loc.start)
                table['end'].append(loc.end)
                table['location'].append(feature.location)
                table['stop_included'].append(np.nan)
                
            elif feature.type == 'mat_peptide':
                loc = feature.location
                gene = feature.qualifiers['product'][0]
                ID = feature.qualifiers['protein_id'][0]
                sequence = feature.location.extract(record).seq

                table['type'].append(feature.type)
                table['gene'].append(gene)
                table['scaffold'].append(scaffold)
                table['direction'].append(feature.location.strand)
                table['product'].append(feature.qualifiers['product'][0])
                table['translated'].append(True)
                table['ID'].append(ID)

                table['start'].append(loc.start)
                table['end'].append(loc.end)
                table['location'].append(feature.location)
                table['stop_included'].append(False)
                
                gene2sequence[ID] = sequence
                TransSeq = str(sequence.translate()[:-1])
                #assert TransSeq == str(feature.qualifiers['translation'][0]), [TransSeq, str(feature.qualifiers['translation'][0])]
                gene2aa[ID] = TransSeq
                
            elif feature.type in ['gene', 'source']:
                pass

            else:
                pass
                print(feature.type)

    Gdb = pd.DataFrame(table)

    return Gdb, gene2sequence, gene2aa, record.seq

Gdb, gene2sequence, gene2aa, source = parse_genbank_genes_COV('/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.gb')

In [49]:
Gdb

Unnamed: 0,type,gene,scaffold,direction,product,translated,ID,start,end,location,stop_included
0,5'UTR,5'UTR,NC_045512.2,1,,False,5'UTR,0,265,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
1,CDS,orf1ab,NC_045512.2,1,orf1ab polyprotein,True,YP_009724389.1,265,21552,"(265, 266, 267, 268, 269, 270, 271, 272, 273, ...",True
2,mat_peptide,leader protein,NC_045512.2,1,leader protein,True,YP_009725297.1,265,805,"(265, 266, 267, 268, 269, 270, 271, 272, 273, ...",False
3,mat_peptide,nsp2,NC_045512.2,1,nsp2,True,YP_009725298.1,805,2719,"(805, 806, 807, 808, 809, 810, 811, 812, 813, ...",False
4,mat_peptide,nsp3,NC_045512.2,1,nsp3,True,YP_009725299.1,2719,8554,"(2719, 2720, 2721, 2722, 2723, 2724, 2725, 272...",False
5,mat_peptide,nsp4,NC_045512.2,1,nsp4,True,YP_009725300.1,8554,10054,"(8554, 8555, 8556, 8557, 8558, 8559, 8560, 856...",False
6,mat_peptide,3C-like proteinase,NC_045512.2,1,3C-like proteinase,True,YP_009725301.1,10054,10972,"(10054, 10055, 10056, 10057, 10058, 10059, 100...",False
7,mat_peptide,nsp6,NC_045512.2,1,nsp6,True,YP_009725302.1,10972,11842,"(10972, 10973, 10974, 10975, 10976, 10977, 109...",False
8,mat_peptide,nsp7,NC_045512.2,1,nsp7,True,YP_009725303.1,11842,12091,"(11842, 11843, 11844, 11845, 11846, 11847, 118...",False
9,mat_peptide,nsp8,NC_045512.2,1,nsp8,True,YP_009725304.1,12091,12685,"(12091, 12092, 12093, 12094, 12095, 12096, 120...",False


In [41]:
# pp1ab
print(Gdb.iloc[1]['location'])

join{[265:13468](+), [13467:21555](+)}


In [43]:
# pp1ab
print(Gdb.iloc[17]['location'])

[265:13483](+)


# Convert to positional

In [61]:
def make_positional_gene_table_COV(Gdb, gene2sequence, gene2aa, source):
    # Make a table of the genome
    table = defaultdict(list)
    for i, char in enumerate(source):
        table['position'].append(i)
        table['ref_seq'].append(char)
    Sdb = pd.DataFrame(table)
    l2c = Sdb.set_index('position')['ref_seq'].to_dict()
    
    # Add translations for gene products
    table = defaultdict(list)
    for j, row in Gdb[Gdb['translated'] == True].iterrows():
            
        NSEQ = gene2sequence[row['ID']]
        location = row['location']
        ASEQ = gene2aa[row['ID']]
        EXPA = []
        for a in ASEQ:
            EXPA += [a] * 3
        EXPA += ['stop'] * 3
        assert len(NSEQ) == len(location) == len(EXPA), [len(NSEQ), len(location), len(EXPA)]

        pos = 0
        for loc, char, a in zip(location, NSEQ, EXPA):
            table['position'].append(loc)
            table['ID'].append(row['ID'])
            table['coding'].append(True)
            table['orf_position'].append((pos) % 3)
            table['aa'].append(a)
            table['aa_index'].append(int(pos/3))
            assert l2c[loc] == char, [row['gene'],l2c[loc], char]
            pos += 1
        assert (pos) % 3 == 0
    Tdb = pd.DataFrame(table)
    #print(Tdb[Tdb.duplicated(subset=['location'], keep=False)].sort_values('location'))
        
    # Add genes
    table = defaultdict(list)
    for j, row in Gdb[Gdb['translated'] == False].iterrows():
        location = row['location']
        for loc in location:
            table['position'].append(loc)
            table['ID'].append(row['ID'])
            table['coding'].append(False)
    gdb = pd.DataFrame(table)
    
    # Merge
    db = pd.concat([pd.merge(Sdb, Tdb, on='position', how='left'), pd.merge(Sdb, gdb, on='position', how='left')])
    return db
        
POdb = make_positional_gene_table_COV(Gdb, gene2sequence, gene2aa, source)

39

## Save

In [35]:
GOdb = Gdb.copy()
GOdb['location'] = [str(x) for x in GOdb['location']]
GOdb.to_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_genes_table_v4.csv', index=False)
GOdb.head()

Unnamed: 0,type,gene,scaffold,direction,product,translated,ID,start,end,location,stop_included
0,5'UTR,5'UTR,NC_045512.2,1,,False,5'UTR,0,265,[0:265](+),
1,CDS,orf1ab,NC_045512.2,1,orf1ab polyprotein,True,YP_009724389.1,265,21552,"join{[265:13468](+), [13467:21555](+)}",True
2,mat_peptide,leader protein,NC_045512.2,1,leader protein,True,YP_009725297.1,265,805,[265:805](+),False
3,mat_peptide,nsp2,NC_045512.2,1,nsp2,True,YP_009725298.1,805,2719,[805:2719](+),False
4,mat_peptide,nsp3,NC_045512.2,1,nsp3,True,YP_009725299.1,2719,8554,[2719:8554](+),False


In [36]:
POdb.to_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_genes_positional_v4.csv', index=False)
POdb.head()

Unnamed: 0,position,ref_seq,ID,coding,orf_position,aa,aa_index
0,0,A,,,,,
1,1,T,,,,,
2,2,T,,,,,
3,3,A,,,,,
4,4,A,,,,,
