## M. orygis/MTB/Mbovis differences

This notebook uses nucdiff ..

In [2]:
import os, sys, io, random, subprocess
import string
import numpy as np
import pandas as pd

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO, SeqIO

sys.path.append('/home/tortoise/pybioviz/')
from pybioviz import viewers, utils

In [3]:
def align_nucmer(file1, file2):
    cmd='nucmer --maxgap=500 --mincluster=100 --coords -p nucmer %s %s' %(file1, file2)
    print (cmd)
    subprocess.check_output(cmd,shell=True)
    df = read_nucmer_coords('nucmer.coords')
    return df

def read_nucmer_coords(cfile):
    cols=['S1','E1','S2','E2','LEN 1','LEN 2','IDENT','TAG1','TAG2']
    a=pd.read_csv(cfile,sep='[\s|]+',skiprows=5,names=cols,engine='python')
    a = a.sort_values(by='TAG2',ascending=False)
    return a

In [3]:
nuc=align_nucmer('ref_genomes/mbovis.fna','ref_genomes/MTB_h37rv.fna')

nucmer --maxgap=500 --mincluster=100 --coords -p nucmer ref_genomes/mbovis.fna ref_genomes/MTB_h37rv.fna


In [None]:
ref = 'NC_000962.3'
nuc = nuc.sort_values('S2')
nuc

## region of difference detection with nucmer

* align RD fasta sequences to a reference e.g. bovis,orygis, tb genomes
* check coords table (dataframe) for missing and present regions



In [4]:
#nuc=align_nucmer('ref_genomes/mbovis.fna','ref_genomes/RDs_mtb.fna')
nuc=align_nucmer('ref_genomes/MTB_h37rv.fna','ref_genomes/RDs_mtb.fna')

nucmer --maxgap=500 --mincluster=100 --coords -p nucmer ref_genomes/MTB_h37rv.fna ref_genomes/RDs_mtb.fna


In [5]:
recs = list(SeqIO.parse('ref_genomes/RDs_mtb.fna',format='fasta'))
ids = [r.id for r in recs]
print (ids)

['RD9_1', 'RD711_2', 'RD702_3', 'RD4_4', 'RD1bcg_5', 'RD1mic_6', 'RD2seal_7', 'RD2bcg_8', 'RD7_9', 'RD8_10', 'RD10_11', 'RD12bovis_12', 'RD12can_13', 'RD105_14', 'RD239_15', 'RD750_16', 'RD142_17', 'RD150_18', 'RD181_19', 'RD207_20', 'RD115_21', 'RD122_22', 'RD174_23', 'RD182_24', 'RD183_25', 'RD193_26', 'RD219_27', 'RD724_28', 'RD726_29', 'RD761_30', '7bp_pks15.1']


## 



In [7]:
featurekeys = ['ID', 'Name', 'del_len', 'ins_len', 'query_coord', 'start', 'end']

def features_to_dataframe(features, cds=False):
    """Get features from a biopython seq record object into a dataframe
    Args:
        features: bio seqfeatures
       returns: a dataframe with a row for each cds/entry.
      """

    #preprocess features
    allfeat = []
    for (item, f) in enumerate(features):
        x = f.__dict__
        q = f.qualifiers
        #print(q)
        x.update(q)
        d = {}
        d['start'] = f.location.start
        d['end'] = f.location.end
        #d['strand'] = f.location.strand
        for i in featurekeys:
            if i in x:
                if type(x[i]) is list:
                    d[i] = x[i][0]
                else:
                    d[i] = x[i]
        allfeat.append(d)

    df = pd.DataFrame(allfeat,columns=featurekeys)

    return df

#df=features_to_dataframe(feats)
#df[:5]

In [8]:
feats = utils.gff_to_features('ref_genomes/Nucdiff_MTB_MB/results/MTB_MB_ref_struct.gff')
feat_df = features_to_dataframe(feats)
#print(feat_df)
deletions = feat_df[feat_df.Name=='deletion']
len(deletions)

40

In [10]:
RD = pd.read_csv('RD.csv', sep='\t')
print(RD[:5])

   No. RD targets    Start     Stop  Size (bp)       Rv in RD
0    1        RD9  2330000  2332100       2029  Rv2073c-2075c
1    2      RD711  1501713  1503655       1943    Rv1333-1336
2    3      RD702   216795   218516       1722         Rv0186
3    4        RD4  1696200  1708800      12732  Rv1506c-1516c
4    5     RD1bcg  4349600  4359000       9400   Rv3871-3879c


### find RD regions
* In this function the code iterates over each row of found deletions and checks to see if either the start or end coordinates fall within the coordinates of the known regions of deletion.
* If either the start point of the deletion or the end point of the deletion is within the RD's coordinates, then there must be some overlap between the deletion and the RD

In [16]:
for i,r in list(deletions.iterrows())[:20]:
    #print (r.start,r.end)
    #print (RD.Start, RD.Stop)
    print (RD[abs(RD.Start-r.start)<500])
    
    #if r.start < RD.Start) & RD.Start < r.end | r.start < RD.Stop & RD.Stop < r.end:
       #print(RD.Start)

    No. RD targets   Start    Stop  Size (bp)     Rv in RD
10   11       RD10  264700  266600       1902  Rv0221-0223
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFrame
Columns: [No., RD targets, Start, Stop, Size (bp), Rv in RD]
Index: []
Empty DataFr

Unnamed: 0,No.,RD targets,Start,Stop,Size (bp),Rv in RD
0,1,RD9,2330000,2332100,2029,Rv2073c-2075c
1,2,RD711,1501713,1503655,1943,Rv1333-1336
2,3,RD702,216795,218516,1722,Rv0186
3,4,RD4,1696200,1708800,12732,Rv1506c-1516c
4,5,RD1bcg,4349600,4359000,9400,Rv3871-3879c
