## M. orygis/MTB/Mbovis differences

This notebook uses nucdiff ..

In [1]:
import os, sys, io, random, subprocess
import string
import numpy as np
import pandas as pd
pd.set_option('display.width',150)
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO, SeqIO

sys.path.append('/home/tortoise/pybioviz/')
from pybioviz import viewers, utils

In [2]:
def align_nucmer(file1, file2):
    cmd='nucmer --maxgap=500 --mincluster=100 --coords -p nucmer %s %s' %(file1, file2)
    print (cmd)
    subprocess.check_output(cmd,shell=True)
    df = read_nucmer_coords('nucmer.coords')
    return df

def read_nucmer_coords(cfile):
    cols=['S1','E1','S2','E2','LEN 1','LEN 2','IDENT','TAG1','TAG2']
    a=pd.read_csv(cfile,sep='[\s|]+',skiprows=5,names=cols,engine='python')
    a = a.sort_values(by='TAG2',ascending=False)
    return a

In [3]:
nuc=align_nucmer('ref_genomes/mbovis.fna','ref_genomes/MTB_h37rv.fna')

nucmer --maxgap=500 --mincluster=100 --coords -p nucmer ref_genomes/mbovis.fna ref_genomes/MTB_h37rv.fna


In [4]:
ref = 'NC_000962.3'
nuc = nuc.sort_values('S2')
nuc

Unnamed: 0,S1,E1,S2,E2,LEN 1,LEN 2,IDENT,TAG1,TAG2
0,1,150902,1,150891,150902,150891,99.82,NC_002945.4,NC_000962.3
74,1778289,1779640,103780,105130,1352,1351,97.05,NC_002945.4,NC_000962.3
183,3835659,3837030,105076,103706,1372,1371,97.09,NC_002945.4,NC_000962.3
1,140094,150914,140054,150903,10821,10850,99.66,NC_002945.4,NC_000962.3
2,151082,264979,150891,264754,113898,113864,99.91,NC_002945.4,NC_000962.3
3,264980,332982,266657,334658,68003,68002,99.93,NC_002945.4,NC_000962.3
26,842872,843855,333649,332699,984,951,87.09,NC_002945.4,NC_000962.3
25,842420,842750,334105,333766,331,340,75.86,NC_002945.4,NC_000962.3
6,335739,336706,334635,335602,968,968,99.90,NC_002945.4,NC_000962.3
20,841408,841944,335165,334635,537,531,79.89,NC_002945.4,NC_000962.3


## region of difference detection with nucmer

* align RD fasta sequences to a reference e.g. bovis,orygis, tb genomes
* check coords table (dataframe) for missing and present regions



In [5]:
#nuc=align_nucmer('ref_genomes/mbovis.fna','ref_genomes/RDs_mtb.fna')
nuc=align_nucmer('ref_genomes/MTB_h37rv.fna','ref_genomes/RDs_mtb.fna')

nucmer --maxgap=500 --mincluster=100 --coords -p nucmer ref_genomes/MTB_h37rv.fna ref_genomes/RDs_mtb.fna


In [6]:
recs = list(SeqIO.parse('ref_genomes/RDs_mtb.fna',format='fasta'))
ids = [r.id for r in recs]
print (ids)

['RD9_1', 'RD711_2', 'RD702_3', 'RD4_4', 'RD1bcg_5', 'RD1mic_6', 'RD2seal_7', 'RD2bcg_8', 'RD7_9', 'RD8_10', 'RD10_11', 'RD12bovis_12', 'RD12can_13', 'RD105_14', 'RD239_15', 'RD750_16', 'RD142_17', 'RD150_18', 'RD181_19', 'RD207_20', 'RD115_21', 'RD122_22', 'RD174_23', 'RD182_24', 'RD183_25', 'RD193_26', 'RD219_27', 'RD724_28', 'RD726_29', 'RD761_30', '7bp_pks15.1']


## 

In [7]:
#featurekeys = ['ID', 'Name', 'del_len', 'ins_len', 'query_coord', 'start', 'end']
featurekeys = ['ID', 'Name', 'del_len', 'ins_len', 'query_bases', 'ref_bases', 'query_coord', 'start', 'end']

def features_to_dataframe(features, cds=False):
    """Get features from a biopython seq record object into a dataframe
    Args:
        features: bio seqfeatures
       returns: a dataframe with a row for each cds/entry.
      """

    #preprocess features
    allfeat = []
    for (item, f) in enumerate(features):
        x = f.__dict__
        q = f.qualifiers
        #print(q)
        x.update(q)
        d = {}
        d['start'] = f.location.start
        d['end'] = f.location.end
        #d['strand'] = f.location.strand
        for i in featurekeys:
            if i in x:
                if type(x[i]) is list:
                    d[i] = x[i][0]
                else:
                    d[i] = x[i]
        allfeat.append(d)

    df = pd.DataFrame(allfeat,columns=featurekeys)

    return df

#df=features_to_dataframe(feats)
#df[:5]

In [11]:
RD = pd.read_csv('RD.csv')
new=[]
for i,r in RD.iterrows():
    if r.Start>3.0e6 and r.Start<4e6:
        new.append(r)        
new = pd.DataFrame(new)
new

Unnamed: 0,RD targets,Start,Stop,Size (bp),Rv in RD
11,RD12bov,3484700,3487500,2801,Rv3117-3121
12,RD12can,3479400,3491800,12401,Rv3111-3126c
19,RD207,3120521,3127920,7400,Rv2814c-2820c
26,RD219,3448504,3451396,2893,Rv3083-3085
28,RD726,3904958,3906706,1749,Rv3485c-3487c
30,7bp pks15/1,3296380,3296381,7,Rv2946c-2947c
31,RD11,3842239,3842769,530,Rv3425


In [15]:
feats = utils.gff_to_features('ref_genomes/Nucdiff_MTB_MB/results/MTB_MB_ref_struct.gff')

feats = utils.gff_to_features('ref_genomes/Nucdiff_MTB_MB/results/MTB_MB_ref_struct.gff')
feat_structs = features_to_dataframe(feats)
deletions = feat_structs[(feat_structs.Name=='deletion') | (feat_structs.Name=='insertion')]

feats_snps = utils.gff_to_features('ref_genomes/Nucdiff_MTB_MB/results/MTB_MB_ref_snps.gff')
feat_snps = features_to_dataframe(feats_snps)
snps = feat_snps[feat_snps.Name=='substitution']

snpdels = pd.concat([deletions, snps])
snpdels

Unnamed: 0,ID,Name,del_len,ins_len,query_bases,ref_bases,query_coord,start,end
1,SV_2,insertion,,35,,,71577-71611,71585,71586
3,SV_4,insertion,,167,,,150915-151081,150902,150903
5,SV_6,deletion,1902,,,,264979,264754,266656
6,SV_8,insertion,,752,,,332983-333734,334657,334658
7,SV_7,deletion,375,,,,332982,334658,335033
9,SV_11,insertion,,2218,,,336707-338924,335601,335602
12,SV_13,insertion,,31,,,363811-363841,362807,362808
17,SV_18,insertion,,67,,,624448-624514,623296,623297
19,SV_20,insertion,,533,,,743871-744403,742633,742634
21,SV_22,deletion,21,,,,838696,836872,836893


In [13]:
RD = pd.read_csv('RD.csv')#, sep='\t')
#print(RD)
RD['Start'] = RD.Start + 21046
RD['Stop'] = RD.Stop + 26845
RD

Unnamed: 0,RD targets,Start,Stop,Size (bp),Rv in RD
0,RD9,2351046,2358945,2029,Rv2073c-2075c
1,RD711,1522759,1530500,1943,Rv1333-1336
2,RD702,237841,245361,1722,Rv0186
3,RD4,1717246,1735645,12732,Rv1506c-1516c
4,RD1bcg,4370646,4385845,9400,Rv3871-3879c
5,RD1mic,4361446,4380345,13100,Rv3864-3871
6,RD2seal,2242046,2249745,1901,Rv1978-1979c
7,RD2bcg,2242146,2258645,10701,Rv1978-1988
8,RD7,2229046,2247345,12719,Rv1964-1977
9,RD8,4077846,4089445,5894,Rv3617-3623


### find RD regions
* In this function the code iterates over each row of found deletions and checks to see if either the start or end coordinates fall within the coordinates of the known regions of deletion.
* If either the start point of the deletion or the end point of the deletion is within the RD's coordinates, then there must be some overlap between the deletion and the RD

In [14]:
found=[]
for i,r in deletions.iterrows():
    #print (r)
    #print (RD.Start, RD.Stop)
    #df = RD[abs(RD.Start-r.start)<500]
    df = RD[((r.start>RD.Start) & (r.start<RD.Stop)) |
              ((r.end>RD.Start) & (r.end<RD.Stop)) |
              ((r.start<RD.Start) & (r.end>RD.Stop))]
    
    if len(df) > 0:
        #print (df)
        #print (r)
        df['id'] = r.ID
        df['Name'] = r.Name
        found.append(df)
        
found = pd.concat(found)
print (found)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


   RD targets    Start     Stop  Size (bp)       Rv in RD      id       Name
12    RD12can  3500446  3518645      12401   Rv3111-3126c  SV_121  insertion
28      RD726  3926004  3933551       1749  Rv3485c-3487c  SV_157   deletion
28      RD726  3926004  3933551       1749  Rv3485c-3487c  SV_160   deletion
28      RD726  3926004  3933551       1749  Rv3485c-3487c  SV_163  insertion
4      RD1bcg  4370646  4385845       9400   Rv3871-3879c  SV_185   deletion
5      RD1mic  4361446  4380345      13100    Rv3864-3871  SV_185   deletion
