# Calling NucDiff
#### DATE: 02-08-19
#### TASK: 
####       - Call NucDiff on reference file (H37rv) and query file (M. orygis or M. bovis)
####       - Return relevent information in a dataframe

In [3]:
import os, sys, io, random, subprocess
import string
import numpy as np
import pandas as pd
pd.set_option('display.width',150)
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO, SeqIO

sys.path.append('/home/tortoise/pybioviz/')
from pybioviz import viewers, utils

In [4]:
# Here, a set of features are defined.
# These features are what shall be extracted from inputted gff files, and are used as column headers in the dataframe
# Many titles will return as NaN however allows merging of different features, such as SNPs and Insertions
featurekeys = ['ID', 'Name', 'del_len', 'ins_len', 'query_bases', 'ref_bases', 'query_coord', 'start', 'end']
def features_to_dataframe(features, cds=False):
    """Get features from a biopython seq record object into a dataframe
    Args:
        features: bio seqfeatures
       returns: a dataframe with a row for each cds/entry.
      """
    
    #preprocess features
    allfeat = []
    for (item, f) in enumerate(features):
        x = f.__dict__
        q = f.qualifiers
        #print(q)
        x.update(q)
        d = {}
        d['start'] = f.location.start
        d['end'] = f.location.end
        #d['strand'] = f.location.strand
        for i in featurekeys:
            if i in x:
                if type(x[i]) is list:
                    d[i] = x[i][0]
                else:
                    d[i] = x[i]
        allfeat.append(d)

    df = pd.DataFrame(allfeat,columns=featurekeys)

    return df

In [5]:
# Here the previous test NucDiff test folder is removed

remove_test = 'rm -r MTB_h37rv_mbovis'
code = os.system(remove_test)
print(code)

0


Here the NucDiff command is defined. Currently, it is hardcoded, but sys.argv arguments will be written in.


In [6]:
def run_nucdiff(ref, query):
    """Run nucdiff"""
    
    outfolder = os.path.basename(ref).split('.')[0] + '_' +  os.path.basename(query).split('.')[0]
    outprefix = os.path.basename(query).split('.')[0]
    
    #nucDiffCmd = 'nucdiff' + ref + " " + query " " + outputFolder + " " + outputPrefix
    nucDiffCmd = f'nucdiff {ref} {query} {outfolder} {outprefix}'
    
    print (nucDiffCmd)
    os.system(nucDiffCmd)
    return
    
run_nucdiff('ref_genomes/MTB_h37rv.fna','ref_genomes/mbovis.fna')
run_nucdiff('ref_genomes/MTB_h37rv.fna','ref_genomes/morygis_ncbi.fna')

nucdiff ref_genomes/MTB_h37rv.fna ref_genomes/mbovis.fna MTB_h37rv_mbovis mbovis
nucdiff ref_genomes/MTB_h37rv.fna ref_genomes/morygis_ncbi.fna MTB_h37rv_morygis_ncbi morygis_ncbi


## find common differences morygis and mbovis vs mtb

In [7]:
# Here the features extracting function is defined
#featurekeys = ['ID', 'Name', 'del_len', 'ins_len', 'query_bases', 'ref_bases', 'query_coord', 'start', 'end']

def features_to_dataframe(features, cds=False):
    """Get features from a biopython seq record object into a dataframe
    Args:
        features: bio seqfeatures
       returns: a dataframe with a row for each cds/entry.
      """

    #preprocess features
    allfeat = []
    for (item, f) in enumerate(features):
        x = f.__dict__
        quals = f.qualifiers
        #print(q)
        x.update(quals)
        d = {}
        d['start'] = f.location.start
        d['end'] = f.location.end
        #d['strand'] = f.location.strand
     
        cols = list(quals.keys())+['start','end']
        for i in quals:
            if i in x:
                if type(x[i]) is list:
                    d[i] = x[i][0]
                else:
                    d[i] = x[i]
        allfeat.append(d)

    df = pd.DataFrame(allfeat,columns=cols)

    return df

In [None]:
#read in both gffs from nucdiff

# This extracts the snps from the snps.gff file and returns a dataframe
def get_nucdiff_snps(gff):
    feats = utils.gff_to_features(gff)
    df = features_to_dataframe(feats)
    results_df = df[df.Name=='substitution']
    return results_df

# This extracts the indels from the structs.gff file and returns a dataframe 
def get_nucdiff_indels(gff):
    feats = utils.gff_to_features(gff)
    df = features_to_dataframe(feats)
    results_df = df[(df.Name=='deletion') | (df.Name=='insertion')]
    return df    #####

# This parses the mbovis results and returns a dataframe of snps and indels
for filename in os.listdir('MTB_h37rv_mbovis/results'):
        if "ref_snps" in filename:
            mbovis_snps_df = get_nucdiff_snps(f'MTB_h37rv_mbovis/results/{filename}')
        elif "ref_struct" in filename:
            mbovis_indels_df = get_nucdiff_indels(f'MTB_h37rv_mbovis/results/{filename}')

# This parses the morygis results and returns a dataframe of snps and indels
for filename in os.listdir('MTB_h37rv_morygis_ncbi/results'):
        if "ref_snps" in filename:
            morygis_snps_df = get_nucdiff_snps(f'MTB_h37rv_morygis_ncbi//results/{filename}')
        elif "ref_struct" in filename:
            morygis_indels_df = get_nucdiff_indels(f'MTB_h37rv_morygis_ncbi//results/{filename}')



In [33]:
# This forms a combined snps and indels df for each strain            
mbovis_snpdels = pd.concat([mbovis_indels_df, mbovis_snps_df])
morygis_snpdels = pd.concat([morygis_indels_df, morygis_snps_df])

#common_differences = pd.merge(mbovis_snpdels, morygis_snpdels, on=['start', 'end'], how='inner')
#common_differences = common_differences.drop(columns=['ID_x', 'ID_y', 'query_coord_y', 'query_coord_x'])

common_indels = pd.merge(mbovis_indels_df, morygis_indels_df, on=['start', 'end'], how='inner').drop(columns=['ID_x',
                                                                    'ID_y', 'query_coord_y', 'query_coord_x', 'source_x', 'source_y', 
                                                                    'query_sequence_y', 'query_sequence_x', 
                                                                        'query_dir', 'color_y', 'color_x', 'Name_y'])
common_indels.rename(columns={'Name_x':'Name'}, inplace=True)
print(common_indels[15:25])
#merge them on start position

                  Name del_len ref_repeated_region    start      end breakpoint_query blk_query blk_ref blk_query_len blk_ref_len
15  tandem_duplication     NaN                 NaN   802499   802500              NaN       NaN     NaN           NaN         NaN
16            deletion      21                 NaN   836872   836893              NaN       NaN     NaN           NaN         NaN
17  tandem_duplication     NaN                 NaN   839111   839112              NaN       NaN     NaN           NaN         NaN
18           insertion     NaN                 NaN  1213691  1213692              NaN       NaN     NaN           NaN         NaN
19           insertion     NaN                 NaN  1213691  1213692              NaN       NaN     NaN           NaN         NaN
20         duplication     NaN     1213680-1213692  1213691  1213692              NaN       NaN     NaN           NaN         NaN
21         duplication     NaN     1213680-1213692  1213691  1213692              NaN     

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
# Here the RD df is read in

def find_regions(data):
    """match regions of difference"""
    
    RD = pd.read_csv('RD.csv', comment='#')
    found=[]
    print (len(data))
    for i,r in data.iterrows():
        df = RD[((r.start>RD.Start) & (r.start<RD.Stop)) |
                  ((r.end>RD.Start) & (r.end<RD.Stop)) |
                  ((r.start<RD.Start) & (r.end>RD.Stop))]
        #df = RD[abs(RD.Start-r.start)<500]
        if len(df) > 0:
            #print (df)
            #print (r)
            df['indel_start'] = r.start
            df['indel_end'] = r.end
            df['name'] = r.Name
            df['del_len'] = r.end-r.start
            found.append(df)

    found = pd.concat(found)
    return found

mb = find_regions(mbovis_indels_df)
mo = find_regions(morygis_indels_df)
mbmo = find_regions(common_indels)

print (len(mb),len(mo))

66


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


32 31


In [37]:
set(mb.RD_name) & set(mo.RD_name)

   RD_name    Start     Stop   Size             Rv  indel_start  indel_end        name  del_len
30    RD10   264067   267764   3697  Rv0221-Rv0223       264754     266656    deletion     1902
27     RD7  2207700  2220800  13100  Rv1964-Rv1977      2208004    2208005  relocation        1
27     RD7  2207700  2220800  13100  Rv1964-Rv1977      2220724    2220725  relocation        1
29     RD9  2328974  2332879   3905    CobL-Rv2075      2330073    2332101    deletion     2028
28     RD8  4057733  4063249   5516      EphA-lpqG      4056840    4062733    deletion     5893


{'RD10', 'RD11', 'RD14', 'RD182', 'RD207', 'RD5', 'RD7', 'RD8', 'RD9'}

In [155]:
mb.sort_values('del_len')

Unnamed: 0,RD_name,Start,Stop,Size (bp),Rv in RD,indel_start,indel_end,name,del_len
27,RD7,2207700,2220800,13100,Rv1964-Rv1977,2208004,2208005,relocation,1
22,RD2,2220908,2232219,11311,Rv1978-Rv1988,2220724,2220725,relocation,1
31,RD11,2970123,2980818,10695,Rv2645-Rv2659c,2970015,2970016,relocation-overlap,1
8,RD207,3120521,3127920,7400,Rv2814c-2820c,3120305,3120524,collapsed_tandem_repeat,219
34,RD14,1997418,2007766,10348,Rv1765c-Rv1773c,1997053,1997452,deletion,399
32,RD12,3484809,3487711,2902,SseC-Rv3121,3484740,3487514,deletion,2774
33,RD13,1402778,1406084,3306,Rv1255c-Rv1257c,1402937,1405937,deletion,3000
25,RD5,2625888,2635592,9704,Rv2346c-Rv2353c,2626069,2635030,deletion,8961


In [None]:
#RD5:   NC_000962.3:2,625,888-2,635,592  Similar         (Not flagged as same)
#RD10:  NC_000962.3:264,067-267,764      Common
#RD9:   NC_000962.3:2,328,974-2,332,879  Common
#RD8    NC_000962.3:4,053,166-4,066,581  Common

In [38]:
mbmo

Unnamed: 0,RD_name,Start,Stop,Size,Rv,indel_start,indel_end,name,del_len
30,RD10,264067,267764,3697,Rv0221-Rv0223,264754,266656,deletion,1902
27,RD7,2207700,2220800,13100,Rv1964-Rv1977,2208004,2208005,relocation,1
27,RD7,2207700,2220800,13100,Rv1964-Rv1977,2220724,2220725,relocation,1
29,RD9,2328974,2332879,3905,CobL-Rv2075,2330073,2332101,deletion,2028
28,RD8,4057733,4063249,5516,EphA-lpqG,4056840,4062733,deletion,5893
