In [1]:
%cd ~/comp_mut

/mnt/storage7/gary/comp_mut


In [2]:
%reset -f

In [3]:
import json
from collections import defaultdict, Counter
import argparse
import os
from tqdm import tqdm
import sys
import csv
import pathogenprofiler as pp
import tbprofiler
from csv import DictReader
from collections import Counter
import requests
from contextlib import closing
import re
from python_scripts.utils import *

In [4]:
def get_vars_exclude(vars_exclude_file):
    """Build the list of (gene, mutation) variants to exclude from the analysis.

    Two sources are concatenated:
      * the rows of ``vars_exclude_file`` (local, manually curated), and
      * the lineage-specific (Fst = 1) variants downloaded from the
        tb-lineages repository.

    Args:
        vars_exclude_file: path to a headerless comma-separated file. Each
            non-blank line is split on ',' and stored as a tuple.

    Returns:
        list of tuples: the rows of ``vars_exclude_file`` followed by
        ``(gene, reformat_mutations(aa_pos))`` for every downloaded variant.

    Raises:
        requests.exceptions.HTTPError: if the download returns an error status.
        requests.exceptions.Timeout: if the server does not answer in time.
    """

    # URL below is the results of all Fst = 1 variants from https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-020-00817-3
    fst_results_url = 'https://raw.githubusercontent.com/GaryNapier/tb-lineages/main/fst_results_clean_fst_1_for_paper.csv'
    # See https://www.codegrepper.com/code-examples/python/how+to+read+a+csv+file+from+a+url+with+python for pulling data from url
    with closing(requests.get(fst_results_url, stream=True, timeout=30)) as r:
        # Fail loudly on 4xx/5xx instead of parsing an error page as CSV
        r.raise_for_status()
        f = (line.decode('utf-8') for line in r.iter_lines())
        # The generator is lazy, so it must be consumed while the response is open
        fst_dict = csv_to_dict_multi(f)

    lin_specific_variants = []
    for gene in fst_dict:
        for var in fst_dict[gene]:
            lin_specific_variants.append((gene, reformat_mutations(var['aa_pos'])))

    # Read in variants to be excluded (the file is closed as soon as it is read)
    vars_exclude = []
    with open(vars_exclude_file) as exclude_fh:
        for l in exclude_fh:
            l = l.strip()
            # Blank lines carry no variant
            if not l: continue
            vars_exclude.append(tuple(l.split(',')))

    # Concat
    return vars_exclude + lin_specific_variants

def get_counts(meta,samples,col):
    """Count how many of `samples` carry each value of metadata column `col`.

    Returns a plain dict mapping each observed value of ``meta[s][col]`` to
    its number of occurrences among `samples`.
    """
    tally = Counter()
    for sample in samples:
        tally[meta[sample][col]] += 1
    return dict(tally)

def get_meta_proportion(meta,samples,column,targets):
    """Proportion of `samples` whose metadata `column` is one of `targets`.

    Args:
        meta: dict of dicts, indexed as ``meta[sample][column]``.
        samples: iterable of sample IDs present in `meta`.
        column: metadata column to inspect.
        targets: iterable of values counted as hits.

    Returns:
        float rounded to 3 decimal places. The denominator is every sample in
        `samples`, whatever its value in `column`. Returns 0.0 when `samples`
        is empty, instead of raising ZeroDivisionError.
    """
    counts = Counter(meta[s][column] for s in samples)
    total = sum(counts.values())
    if total == 0:
        return 0.0
    target_count = sum(counts.get(c, 0) for c in targets)
    return round(target_count/total, 3)

def filter_vars(variants, mutation2sample, meta_dict, drug_of_interest,
                min_samps=3, min_dst_prop=0.5, max_sensitive_prop=0.5, min_lins=2):
    """Keep the variants whose carrier samples look resistance-associated.

    For each variant the carrier samples are looked up in `mutation2sample`
    and three statistics are computed:
      * the proportion of carriers whose `drug_of_interest` column is '1',
      * the proportion of carriers whose 'drtype' is 'Sensitive',
      * the number of distinct entries returned by ``resolve_lineages`` for
        the carriers' 'sublin' counts (presumably distinct lineages - confirm
        against python_scripts.utils).

    Args:
        variants: iterable of (gene, mutation) tuples to test.
        mutation2sample: mapping of (gene, mutation) -> collection of samples.
            It is read with ``.get()`` so that a defaultdict is not given an
            empty entry for every variant that has no samples.
        meta_dict: per-sample metadata; must hold `drug_of_interest`,
            'drtype' and 'sublin' for every carrier sample.
        drug_of_interest: name of the DST column in `meta_dict`.
        min_samps: variants with fewer carrier samples are skipped.
        min_dst_prop: minimum proportion of carriers with DST == '1'.
        max_sensitive_prop: maximum proportion of 'Sensitive' carriers.
        min_lins: minimum number of lineages the variant must occur in.

    Returns:
        tuple ``(variants_passed, stats_dict)``: the set of variants that
        passed, and a defaultdict(dict) with their statistics. Note that the
        'dr_prop' entry holds the *sensitive* genotype proportion.
    """

    # Get stats for each variant
    stats_dict = defaultdict(dict)
    variants_passed = set()
    for var in variants:

        # Get proportion or number of samples (in the case of lineage) per potential mutation
        samps = mutation2sample.get(var, ())

        if len(samps) < min_samps: continue

        dst_proportion = get_meta_proportion(meta_dict,samps,drug_of_interest,['1'])
        sensitive_geno_proportion = get_meta_proportion(meta_dict,samps,'drtype',['Sensitive'])
        num_lins = len(set(resolve_lineages(get_counts(meta_dict,samps,'sublin'))))

        # Filter and add to dict
        if (dst_proportion >= min_dst_prop
                and sensitive_geno_proportion <= max_sensitive_prop
                and num_lins >= min_lins):

            variants_passed.add(var)

            stats_dict[var] = {'drug': drug_of_interest,
                               'gene': var[0],
                               'mutation': var[1],
                               'gene_mutation': var[0] + '-' + var[1],
                               'n_samps' : len(samps),
                               'dst_prop': dst_proportion,
                               'dr_prop': sensitive_geno_proportion,
                               'n_lins': num_lins}

    return (variants_passed, stats_dict)

In [5]:
drug_of_interest = 'isoniazid'

In [6]:
# FILES

# potential_comp_mut_file = args.potential_comp_mut_file
# metadata_file = args.metadata_file
# tbdb_file = args.tbdb_file
# drtypes_file = args.drtypes_file
# known_comp_mut_file = args.known_comp_mut_file
# tbprofiler_results_dir = args.tbprofiler_results_dir
# vars_exclude_file = args.vars_exclude_file
# potential_res_mut_outfile = args.potential_res_mut_outfile

# FILES
# Relative paths are resolved against the working directory set by the %cd cell at the top.

# Candidate compensatory mutations for the drug of interest; read with csv.DictReader
# using the 'gene' and 'change' columns.
# Earlier input, kept for reference:
# potential_comp_mut_file = "results/%s_novel_comp_mut_model_results.csv" % drug_of_interest
potential_comp_mut_file = "results/%s_novel_comp_mut_merged.csv" % drug_of_interest
# Per-sample metadata, loaded with csv_to_dict
metadata_file = "../metadata/tb_data_18_02_2021.csv"
# Resistance mutation database, read using the 'Drug', 'Gene' and 'Mutation' columns
tbdb_file = "../tbdb/tbdb.csv"
# JSON mapping used to standardise the 'drtype' value of each tbprofiler result
drtypes_file = "../pipeline/db/dr_types.json"
# Known compensatory mutations, read using the 'Drug', 'Gene' and 'Mutation' columns
known_comp_mut_file = '../pipeline/db/compensatory_mutations.csv'
# Directory holding one '<sample><suffix>' tbprofiler result per sample
# NOTE(review): absolute, machine-specific path
tbprofiler_results_dir = '/mnt/storage7/jody/tb_ena/tbprofiler/freebayes/results/'
# Variants to exclude, one comma-separated row per line
vars_exclude_file = 'metadata/var_exclude_comp_mut.csv'
# Output paths; nothing in the visible cells writes to them yet - TODO confirm
potential_res_mut_stats_file = 'results/potential_res_mut_stats.csv'
potential_res_mut_samples_file = 'results/potential_res_mut_samps.csv'


In [7]:
# VARIABLES

# suffix = args.suffix
# drug_of_interest = args.drug_of_interest

suffix = ".results.json"

In [8]:
# Load the candidate compensatory mutations as (gene, change) tuples
with open(potential_comp_mut_file, 'r') as f:
    potential_comp_mut_list = [(row['gene'], row['change']) for row in csv.DictReader(f)]

potential_comp_mut_list

[('ahpC', 'c.-142G>A'),
 ('ahpC', 'c.-47_-46insT'),
 ('ahpC', 'c.-48G>A'),
 ('ahpC', 'c.-51G>A'),
 ('ahpC', 'c.-52C>A'),
 ('ahpC', 'c.-52C>T'),
 ('ahpC', 'c.-54C>T'),
 ('ahpC', 'c.-57C>T'),
 ('ahpC', 'c.-72C>T'),
 ('ahpC', 'c.-76T>A'),
 ('ahpC', 'c.-77T>G'),
 ('ahpC', 'c.-88G>A'),
 ('ahpC', 'p.Asp73His'),
 ('ahpC', 'p.Gly32Asp'),
 ('ahpC', 'p.Leu191Phe'),
 ('ahpC', 'p.Pro44Arg')]

In [9]:
# Convert to list of tuples
# potential_comp_mut_list = [(potential_comp_mut_dict[var]['gene'], potential_comp_mut_dict[var]['change'])\
#  for var in list(potential_comp_mut_dict)]
# potential_comp_mut_list

In [10]:
# Load the per-sample metadata (csv_to_dict comes from python_scripts.utils)
with open(metadata_file) as mf:
    meta_dict = csv_to_dict(mf)

# The sample IDs are the keys of the metadata dict
samples = list(meta_dict)

In [11]:
# Load the tbdb resistance database, keyed on the 'Drug' column
with open(tbdb_file, 'r') as f:
    tbdb_dict = csv_to_dict_multi(f, 'Drug')

# Known resistance mutations for the drug of interest, as (gene, mutation) tuples
krm = []
for var in tbdb_dict[drug_of_interest]:
    krm.append((var['Gene'], var['Mutation']))
print(len(krm))
print(krm[:5])



419
[('katG', 'c.1002dupG'), ('katG', 'c.1016delA'), ('katG', 'c.1024dupG'), ('katG', 'c.1058_1060dupACA'), ('katG', 'c.1144dupC')]


In [12]:
# Read in DR types from json
# (looked up with the 'drtype' value of each tbprofiler result when loading samples)
with open(drtypes_file) as f:
    standardise_drtype = json.load(f)

In [13]:
# Get known compensatory mutations of interest
# Maps drug -> set of (gene, mutation); only rows for the drug of interest are kept
compensatory_mutations = defaultdict(set)
# newline='' is the opening mode the csv module expects; `with` closes the file
with open(known_comp_mut_file, newline='') as f:
    for row in csv.DictReader(f):
        if row['Drug'] != drug_of_interest: continue
        compensatory_mutations[row['Drug']].add((row['Gene'],row['Mutation']))

In [14]:
# Read in variants to exclude
# get_vars_exclude concatenates the rows of vars_exclude_file with the lineage-specific
# variants it downloads, so this cell needs network access.
# The result is a list of tuples; the mutation element can be None where
# reformat_mutations returned None (see the output below).
vars_exclude = get_vars_exclude(vars_exclude_file)
vars_exclude

[('katG', 'p.Arg463Leu'),
 ('katG', 'p.Trp300Cys'),
 ('pstP', 'p.Asp74Ala'),
 ('pstP', None),
 ('pstP', None),
 ('pstP', 'p.Ala205Val'),
 ('pstP', 'p.Ala44Val'),
 ('pstP', 'p.Arg283His'),
 ('Rv0021c', 'p.Asp179His'),
 ('Rv0021c', None),
 ('Rv0021c', None),
 ('Rv0061c', None),
 ('Rv0061c', 'p.Thr39Ala'),
 ('celA1', 'p.His38Asp'),
 ('celA1', 'p.Val76Gly'),
 ('celA1-Rv0063', None),
 ('Rv0063', 'p.Ser418Pro'),
 ('Rv0063', None),
 ('Rv0063', None),
 ('Rv0063', 'p.Ala238Ser'),
 ('Rv0063', 'p.Pro162Ala'),
 ('Rv0063', 'p.Ala155Thr'),
 ('Rv0063', 'p.Gln403*'),
 ('Rv0067c', 'p.Glu154Asp'),
 ('Rv0067c', 'p.Pro3Leu'),
 ('Rv0067c', 'p.Leu132Val'),
 ('Rv0067c', None),
 ('Rv0090', 'p.Ala163Pro'),
 ('nrp', None),
 ('nrp', 'p.Ser2432Asn'),
 ('nrp', 'p.Asp551Asn'),
 ('nrp', 'p.Cys2301Gly'),
 ('nrp', 'p.Arg379Cys'),
 ('nrp', 'p.Gly779Asp'),
 ('nrp', 'p.Leu655Met'),
 ('nrp', 'p.Ala812Thr'),
 ('nrp', 'p.Ala1214Val'),
 ('nrp', 'p.Pro1291Leu'),
 ('nrp', None),
 ('nrp', 'p.Leu1280Phe'),
 ('nrp', 'p.Ala2468Val

In [15]:
# Genes that have a tbdb entry for the drug of interest
genes = {var['Gene'] for var in tbdb_dict[drug_of_interest]}

# Add the genes of the known and of the candidate compensatory mutations
genes |= {gene for gene, _ in compensatory_mutations[drug_of_interest]}
genes |= {gene for gene, _ in potential_comp_mut_list}

genes

{'ahpC', 'fabG1', 'inhA', 'kasA', 'katG'}

In [16]:
# Load mutation data using ('gene','change') as keys
mutation2sample = defaultdict(set)        # (gene, change) -> samples carrying it
sample2mutation = defaultdict(set)        # sample -> (gene, change) variants it carries
resistance_mutations = defaultdict(set)   # drug -> (gene, change) variants annotated with that drug

# Sets give constant-time membership tests inside the per-variant loop
vars_exclude_set = set(vars_exclude)
potential_comp_mut_set = set(potential_comp_mut_list)

for s in tqdm(samples):
    file = os.path.join(tbprofiler_results_dir, s + suffix)
    # Samples without a tbprofiler result contribute nothing to any of the mappings
    if not os.path.isfile(file): continue
    with open(file) as f:
        data = json.load(f)
    # Skip mixed samps
    if ';' in data['sublin']: continue

    # Update metadata
    meta_dict[s]['drtype'] = standardise_drtype[data['drtype']]
    meta_dict[s]['sublin'] = data['sublin']

    for var in data['dr_variants'] + data['other_variants']:
        if var['gene'] not in genes: continue
        if var['freq'] < 0.7: continue
        if var['type']=='synonymous_variant': continue
        key = (var['gene'],var['change'])
        if key in vars_exclude_set: continue
        mutation2sample[key].add(s)
        sample2mutation[s].add(key)
        if "drugs" in var:
            for d in var["drugs"]:
                # Exact comparison: `not in` on a string would be a substring test
                if d["drug"] != drug_of_interest: continue
                # Compensatory mutations (known or candidate) are not resistance mutations
                if key in compensatory_mutations[d["drug"]] or key in potential_comp_mut_set: continue
                resistance_mutations[d["drug"]].add(key)

100%|██████████| 32735/32735 [00:15<00:00, 2176.07it/s]


In [17]:
# TESTING
# Load a single tbprofiler result for manual inspection.
# NOTE: this overwrites the `file` and `data` variables left by the loading loop.
samp = 'ERR2864254'

file = os.path.join(tbprofiler_results_dir, samp + suffix)

with open(file) as f:
    data = json.load(f)

# for var in data["other_variants"]:
#     print(var['change'])
#     print(var['type'])
#     print('---')

In [18]:
# Classify potential compensatory mutations and filter.
# The GLM models (filter_novel_comp_mut.R) are only the first step in identifying
# 'interesting' compensatory mutations; each candidate still needs checking against
# the tbprofiler results, e.g. a lineage-specific mutation is filtered out.
# filter_vars returns the set of candidates that passed and their per-variant stats.
potential_comp_mut_filtered, potential_comp_mut_stats = filter_vars(potential_comp_mut_list, mutation2sample, meta_dict, drug_of_interest)
# NOTE(review): this displays the unfiltered input list, not potential_comp_mut_filtered - confirm intended
potential_comp_mut_list

[('ahpC', 'c.-142G>A'),
 ('ahpC', 'c.-47_-46insT'),
 ('ahpC', 'c.-48G>A'),
 ('ahpC', 'c.-51G>A'),
 ('ahpC', 'c.-52C>A'),
 ('ahpC', 'c.-52C>T'),
 ('ahpC', 'c.-54C>T'),
 ('ahpC', 'c.-57C>T'),
 ('ahpC', 'c.-72C>T'),
 ('ahpC', 'c.-76T>A'),
 ('ahpC', 'c.-77T>G'),
 ('ahpC', 'c.-88G>A'),
 ('ahpC', 'p.Asp73His'),
 ('ahpC', 'p.Gly32Asp'),
 ('ahpC', 'p.Leu191Phe'),
 ('ahpC', 'p.Pro44Arg')]

In [19]:
print(" --- list of potential comp mutations after filtering --- ")
print(potential_comp_mut_filtered)
print(" --- stats: --- ")
print(potential_comp_mut_stats)
print()

 --- list of potential comp mutations after filtering --- 
{('ahpC', 'c.-54C>T'), ('ahpC', 'p.Gly32Asp'), ('ahpC', 'c.-52C>T'), ('ahpC', 'c.-48G>A'), ('ahpC', 'c.-57C>T'), ('ahpC', 'c.-52C>A'), ('ahpC', 'c.-72C>T'), ('ahpC', 'c.-51G>A')}
 --- stats: --- 
defaultdict(<class 'dict'>, {('ahpC', 'c.-48G>A'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-48G>A', 'gene_mutation': 'ahpC-c.-48G>A', 'n_samps': 92, 'dst_prop': 0.576, 'dr_prop': 0.0, 'n_lins': 17}, ('ahpC', 'c.-51G>A'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-51G>A', 'gene_mutation': 'ahpC-c.-51G>A', 'n_samps': 45, 'dst_prop': 0.644, 'dr_prop': 0.0, 'n_lins': 10}, ('ahpC', 'c.-52C>A'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-52C>A', 'gene_mutation': 'ahpC-c.-52C>A', 'n_samps': 49, 'dst_prop': 0.51, 'dr_prop': 0.0, 'n_lins': 6}, ('ahpC', 'c.-52C>T'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-52C>T', 'gene_mutation': 'ahpC-c.-52C>T', 'n_samps': 97, 'dst_prop': 0.701, 'dr_prop': 0.0, '

In [20]:
mutation2sample.keys()

dict_keys([('katG', 'p.Ser315Thr'), ('kasA', 'c.-39C>T'), ('fabG1', 'c.-278T>C'), ('kasA', 'p.Asp127Gly'), ('fabG1', 'c.-15C>T'), ('fabG1', 'c.-8T>C'), ('katG', 'p.Ala168Gly'), ('katG', 'c.2223A>G'), ('katG', 'c.-85C>T'), ('katG', 'p.Glu233Gln'), ('inhA', 'p.Ile194Thr'), ('ahpC', 'c.-142G>A'), ('kasA', 'p.Asp304Asn'), ('kasA', 'p.Met144Thr'), ('inhA', 'c.-154G>A'), ('katG', 'p.Ser315Gly'), ('inhA', 'c.-676G>A'), ('inhA', 'c.-40C>T'), ('katG', 'p.Tyr155Cys'), ('katG', 'p.Asn529Thr'), ('katG', 'c.2221T>C'), ('inhA', 'p.Ile21Val'), ('katG', 'p.Trp191Arg'), ('katG', 'p.Tyr608Asp'), ('katG', 'p.Met257Ile'), ('inhA', 'p.Ile228Met'), ('katG', 'p.Thr275Ala'), ('katG', 'p.Ser315Asn'), ('ahpC', 'c.-88G>A'), ('katG', 'p.Lys557Asn'), ('katG', 'p.Pro232Ala'), ('ahpC', 'p.Leu191Arg'), ('ahpC', 'c.-52C>T'), ('katG', 'p.Pro533Leu'), ('katG', 'p.Ala492Asp'), ('katG', 'p.Trp149Cys'), ('ahpC', 'c.-51G>A'), ('fabG1', 'c.-17G>T'), ('katG', 'c.-28G>T'), ('ahpC', 'c.-181T>G'), ('katG', 'p.Glu82Asp'), ('katG'

In [21]:
len(sample2mutation.keys())

17500

In [22]:
resistance_mutations[drug_of_interest]

{('fabG1', 'c.-15C>T'),
 ('fabG1', 'c.-17G>T'),
 ('fabG1', 'c.-8T>A'),
 ('fabG1', 'c.-8T>C'),
 ('fabG1', 'c.-8T>G'),
 ('inhA', 'p.Ile16Thr'),
 ('inhA', 'p.Ile194Thr'),
 ('inhA', 'p.Ile21Thr'),
 ('inhA', 'p.Ile21Val'),
 ('kasA', 'p.Asp66Asn'),
 ('kasA', 'p.Gly387Asp'),
 ('katG', 'c.-104_*2462del'),
 ('katG', 'c.-10A>C'),
 ('katG', 'c.-1136_*12019del'),
 ('katG', 'c.-11561_*8301del'),
 ('katG', 'c.-1180_1265del'),
 ('katG', 'c.-13060_*5307del'),
 ('katG', 'c.-15133_*11506del'),
 ('katG', 'c.-15408_*19120del'),
 ('katG', 'c.-1778_*1594del'),
 ('katG', 'c.-2055_*3330del'),
 ('katG', 'c.-2233_*35269del'),
 ('katG', 'c.-2717_47del'),
 ('katG', 'c.-3736_*5689del'),
 ('katG', 'c.-4345_*3617del'),
 ('katG', 'c.-4547_574del'),
 ('katG', 'c.-479_*6638del'),
 ('katG', 'c.-5102_1762del'),
 ('katG', 'c.-5249_79del'),
 ('katG', 'c.-5269_*546del'),
 ('katG', 'c.-5735_*25027del'),
 ('katG', 'c.-6512_*15260del'),
 ('katG', 'c.-653_388del'),
 ('katG', 'c.-6906_*16374del'),
 ('katG', 'c.-7242_*10095del'),

In [23]:
potential_comp_mut_stats

defaultdict(dict,
            {('ahpC', 'c.-48G>A'): {'drug': 'isoniazid',
              'gene': 'ahpC',
              'mutation': 'c.-48G>A',
              'gene_mutation': 'ahpC-c.-48G>A',
              'n_samps': 92,
              'dst_prop': 0.576,
              'dr_prop': 0.0,
              'n_lins': 17},
             ('ahpC', 'c.-51G>A'): {'drug': 'isoniazid',
              'gene': 'ahpC',
              'mutation': 'c.-51G>A',
              'gene_mutation': 'ahpC-c.-51G>A',
              'n_samps': 45,
              'dst_prop': 0.644,
              'dr_prop': 0.0,
              'n_lins': 10},
             ('ahpC', 'c.-52C>A'): {'drug': 'isoniazid',
              'gene': 'ahpC',
              'mutation': 'c.-52C>A',
              'gene_mutation': 'ahpC-c.-52C>A',
              'n_samps': 49,
              'dst_prop': 0.51,
              'dr_prop': 0.0,
              'n_lins': 6},
             ('ahpC', 'c.-52C>T'): {'drug': 'isoniazid',
              'gene': 'ahpC',
              '

In [24]:
# Add the filtered potential compensatory mutations
# to the set of known compensatory mutations for the drug of interest.
# NOTE: the update is in place, so after this cell the known and the potential
# compensatory mutations can no longer be told apart from this set alone.
compensatory_mutations[drug_of_interest].update(potential_comp_mut_filtered)

In [25]:
# ** MAIN BIT **
# ** Go over all the samples and get the potential resistance mutations from the presence of comp mutations **
potential_resistance_mutations = set()
# set up sample count vectors
samps_CM = []

# Look the two reference sets up once instead of once per sample
comp_muts = compensatory_mutations[drug_of_interest]
res_muts = resistance_mutations[drug_of_interest]

for s in tqdm(samples):
    # .get() rather than [] so that samples with no loaded mutations are not
    # inserted into the sample2mutation defaultdict as empty entries
    sample_vars = sample2mutation.get(s, ())

    # Get the comp, res and other variants for each sample in the full sample list
    comp_var = [var for var in sample_vars if var in comp_muts]
    res_var = [var for var in sample_vars if var in res_muts]
    other_vars = [var for var in sample_vars if var not in comp_muts and var not in res_muts]

    # If there is at least one comp variant and there are no (known) resistance variants
    if comp_var and not res_var:
        # If there are no 'other' variants print the sample and the comp variants
        if not other_vars:
            print("Sample with at least one comp. mut. but no res. mutations:")
            print("samp:", s, "comp. mut.:", comp_var)

        # Store the 'other' vars as potential resistance variants
        potential_resistance_mutations.update(other_vars)


100%|██████████| 32735/32735 [00:00<00:00, 318875.70it/s]

Sample with at least one comp. mut. but no res. mutations:
samp: ERR2516197 comp. mut.: [('ahpC', 'p.Leu191Arg')]
Sample with at least one comp. mut. but no res. mutations:
samp: ERR2179658 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SAMN03246450 comp. mut.: [('ahpC', 'c.-52C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SAMN03246659 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR1723651 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR6824452 comp. mut.: [('ahpC', 'c.-48G>A')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR6824287 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR8651572 comp. mut.: [('ahpC', 'c.-48G>A')]
Sample with at least one comp. mut. but no res. mutations:
samp: ERR216923 comp. mut.: [('ahpC', 'c.-48G>




In [26]:
potential_resistance_mutations

{('ahpC', 'c.-142G>A'),
 ('ahpC', 'c.-256delT'),
 ('ahpC', 'c.-88G>A'),
 ('ahpC', 'p.Leu191Pro'),
 ('fabG1', 'c.-102G>A'),
 ('fabG1', 'c.-179C>T'),
 ('fabG1', 'c.-223_-222insGC'),
 ('inhA', 'c.-154G>A'),
 ('inhA', 'p.Arg27Trp'),
 ('kasA', 'c.-39C>T'),
 ('kasA', 'p.His253Tyr'),
 ('kasA', 'p.Val142Ile'),
 ('kasA', 'p.Val192Ile'),
 ('katG', 'c.-1T>C'),
 ('katG', 'c.-634_489del'),
 ('katG', 'c.-6579_696del'),
 ('katG',
  'c.1042_1128delNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA'),
 ('katG', 'c.1120_*453del'),
 ('katG', 'c.1414_1440dupCTAGTTTCGACCGCATGGGCGGCGGCG'),
 ('katG', 'c.1438_1440delGCG'),
 ('katG', 'c.2047_*24266del'),
 ('katG', 'c.2223A>C'),
 ('katG', 'c.2223A>G'),
 ('katG', 'c.680_685delTGATCT'),
 ('katG', 'c.718_*16215del'),
 ('katG', 'c.984_998delGGACAACAGTTTCCT'),
 ('katG', 'p.Ala122Val'),
 ('katG', 'p.Ala162Val'),
 ('katG', 'p.Ala312Glu'),
 ('katG', 'p.Ala312Val'),
 ('katG', 'p.Ala411Asp'),
 ('katG', 'p.Ala476Glu'),
 ('katG', 'p.Ala

In [27]:
# Filter the potential resistance variants in the same way as filtering the potential comp. variants
# (same filter_vars call, so the same default thresholds apply)
potential_res_mut_filtered, potential_res_mut_stats = filter_vars(potential_resistance_mutations, mutation2sample, meta_dict, drug_of_interest)

In [28]:
potential_res_mut_filtered

{('katG', 'c.2223A>G'),
 ('katG', 'p.Arg484His'),
 ('katG', 'p.Asn655Asp'),
 ('katG', 'p.Asp142Gly'),
 ('katG', 'p.Asp189Asn'),
 ('katG', 'p.Asp419Tyr'),
 ('katG', 'p.Gln88Pro'),
 ('katG', 'p.Gly169Ser'),
 ('katG', 'p.Gly299Ser'),
 ('katG', 'p.Pro232Ser'),
 ('katG', 'p.Trp161Cys'),
 ('katG', 'p.Trp505*'),
 ('katG', 'p.Trp90Arg'),
 ('katG', 'p.Tyr413Cys')}

In [29]:
# WRITE FILES
# NOTE(review): this cell only builds the per-sample table; no file is written here.

# Make a dict of samps and metadata for samps with the potential res. mutations.
# The dict is keyed by sample ID, so a sample carrying more than one filtered
# variant keeps only the last variant visited (set iteration order) - TODO confirm
# that one row per sample is what is wanted.

potential_res_mut_dict = {}

for var in potential_res_mut_filtered:
    for samp in mutation2sample[var]:
        potential_res_mut_dict[samp] = {'wgs_id': samp,
                                        'drug': drug_of_interest, 
                                        'gene': var[0], 
                                        'mutation': var[1],
                                        'gene_mutation': var[0] + '-' + var[1], 
                                        'main_lineage': meta_dict[samp]['main_lineage'], 
                                        'sublin':meta_dict[samp]['sublin'], 
                                        'country_code': meta_dict[samp]['country_code'], 
                                        'drtype': meta_dict[samp]['drtype'],
                                        'dst': meta_dict[samp][drug_of_interest]}

In [30]:
potential_res_mut_filtered

{('katG', 'c.2223A>G'),
 ('katG', 'p.Arg484His'),
 ('katG', 'p.Asn655Asp'),
 ('katG', 'p.Asp142Gly'),
 ('katG', 'p.Asp189Asn'),
 ('katG', 'p.Asp419Tyr'),
 ('katG', 'p.Gln88Pro'),
 ('katG', 'p.Gly169Ser'),
 ('katG', 'p.Gly299Ser'),
 ('katG', 'p.Pro232Ser'),
 ('katG', 'p.Trp161Cys'),
 ('katG', 'p.Trp505*'),
 ('katG', 'p.Trp90Arg'),
 ('katG', 'p.Tyr413Cys')}

In [31]:
potential_res_mut_dict

{'SRR5709950': {'wgs_id': 'SRR5709950',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage2',
  'sublin': 'lineage2.2.1',
  'country_code': 'th',
  'drtype': 'MDR-TB',
  'dst': '1'},
 'SRR8651591': {'wgs_id': 'SRR8651591',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage2',
  'sublin': 'lineage2.2.1.1',
  'country_code': 'cn',
  'drtype': 'MDR-TB',
  'dst': '1'},
 'SRR5341328': {'wgs_id': 'SRR5341328',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage3',
  'sublin': 'lineage3',
  'country_code': 'in',
  'drtype': 'Pre-XDR-TB',
  'dst': 'NA'},
 'ERR2516654': {'wgs_id': 'ERR2516654',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Asp419Tyr',
  'gene_mutation': 'katG-p.Asp419Tyr',
  'main_lineage': 'lineage2',
  'sublin': 'lineage

In [32]:
# SUMMARY
# SUMMARY
# SUMMARY
# SUMMARY
# SUMMARY

In [36]:
potential_comp_mut_filtered

{('ahpC', 'c.-48G>A'),
 ('ahpC', 'c.-51G>A'),
 ('ahpC', 'c.-52C>A'),
 ('ahpC', 'c.-52C>T'),
 ('ahpC', 'c.-54C>T'),
 ('ahpC', 'c.-57C>T'),
 ('ahpC', 'c.-72C>T'),
 ('ahpC', 'p.Gly32Asp')}

In [35]:
potential_res_mut_filtered

{('katG', 'c.2223A>G'),
 ('katG', 'p.Arg484His'),
 ('katG', 'p.Asn655Asp'),
 ('katG', 'p.Asp142Gly'),
 ('katG', 'p.Asp189Asn'),
 ('katG', 'p.Asp419Tyr'),
 ('katG', 'p.Gln88Pro'),
 ('katG', 'p.Gly169Ser'),
 ('katG', 'p.Gly299Ser'),
 ('katG', 'p.Pro232Ser'),
 ('katG', 'p.Trp161Cys'),
 ('katG', 'p.Trp505*'),
 ('katG', 'p.Trp90Arg'),
 ('katG', 'p.Tyr413Cys')}

In [37]:
potential_res_mut_dict

{'SRR5709950': {'wgs_id': 'SRR5709950',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage2',
  'sublin': 'lineage2.2.1',
  'country_code': 'th',
  'drtype': 'MDR-TB',
  'dst': '1'},
 'SRR8651591': {'wgs_id': 'SRR8651591',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage2',
  'sublin': 'lineage2.2.1.1',
  'country_code': 'cn',
  'drtype': 'MDR-TB',
  'dst': '1'},
 'SRR5341328': {'wgs_id': 'SRR5341328',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Gln88Pro',
  'gene_mutation': 'katG-p.Gln88Pro',
  'main_lineage': 'lineage3',
  'sublin': 'lineage3',
  'country_code': 'in',
  'drtype': 'Pre-XDR-TB',
  'dst': 'NA'},
 'ERR2516654': {'wgs_id': 'ERR2516654',
  'drug': 'isoniazid',
  'gene': 'katG',
  'mutation': 'p.Asp419Tyr',
  'gene_mutation': 'katG-p.Asp419Tyr',
  'main_lineage': 'lineage2',
  'sublin': 'lineage

In [45]:
compensatory_mutations[drug_of_interest]

{('ahpC', 'c.-20C>T'),
 ('ahpC', 'c.-39C>T'),
 ('ahpC', 'c.-44T>A'),
 ('ahpC', 'c.-48G>A'),
 ('ahpC', 'c.-48G>T'),
 ('ahpC', 'c.-49T>G'),
 ('ahpC', 'c.-4A>G'),
 ('ahpC', 'c.-51G>A'),
 ('ahpC', 'c.-51G>T'),
 ('ahpC', 'c.-52C>A'),
 ('ahpC', 'c.-52C>T'),
 ('ahpC', 'c.-54C>T'),
 ('ahpC', 'c.-57C>T'),
 ('ahpC', 'c.-5G>A'),
 ('ahpC', 'c.-66G>A'),
 ('ahpC', 'c.-72C>T'),
 ('ahpC', 'c.-74G>A'),
 ('ahpC', 'c.-81C>T'),
 ('ahpC', 'p.Asp33Asn'),
 ('ahpC', 'p.Asp73His'),
 ('ahpC', 'p.Glu76Lys'),
 ('ahpC', 'p.Gly32Asp'),
 ('ahpC', 'p.Leu191Arg'),
 ('ahpC', 'p.Phe10Ile'),
 ('ahpC', 'p.Pro2Ser'),
 ('ahpC', 'p.Thr5Ile')}

In [48]:
resistance_mutations

defaultdict(set,
            {'isoniazid': {('fabG1', 'c.-15C>T'),
              ('fabG1', 'c.-17G>T'),
              ('fabG1', 'c.-8T>A'),
              ('fabG1', 'c.-8T>C'),
              ('fabG1', 'c.-8T>G'),
              ('inhA', 'p.Ile16Thr'),
              ('inhA', 'p.Ile194Thr'),
              ('inhA', 'p.Ile21Thr'),
              ('inhA', 'p.Ile21Val'),
              ('kasA', 'p.Asp66Asn'),
              ('kasA', 'p.Gly387Asp'),
              ('katG', 'c.-104_*2462del'),
              ('katG', 'c.-10A>C'),
              ('katG', 'c.-1136_*12019del'),
              ('katG', 'c.-11561_*8301del'),
              ('katG', 'c.-1180_1265del'),
              ('katG', 'c.-13060_*5307del'),
              ('katG', 'c.-15133_*11506del'),
              ('katG', 'c.-15408_*19120del'),
              ('katG', 'c.-1778_*1594del'),
              ('katG', 'c.-2055_*3330del'),
              ('katG', 'c.-2233_*35269del'),
              ('katG', 'c.-2717_47del'),
              ('katG', 'c.-3736_*56

In [68]:
print()
print("RM = any resistance mutation \n \
      KRM = known RM \n \
      PRM = potential RM \n \
      CM = any compensatory mutation \n \
      KCM = known CM \n \
      PCM = potential CM \n")

# Reference sets for the drug of interest
comp_muts = compensatory_mutations[drug_of_interest]
known_res_muts = resistance_mutations[drug_of_interest]

# samps with any CM
samps_CM = [samp for samp in sample2mutation
            if not comp_muts.isdisjoint(sample2mutation[samp])]

# samps with CM and any KRM
samps_CM_and_KRM = [samp for samp in samps_CM
                    if not known_res_muts.isdisjoint(sample2mutation[samp])]

# samps with CM and no KRM
samps_CM_and_no_KRM = [samp for samp in samps_CM
                       if known_res_muts.isdisjoint(sample2mutation[samp])]

# samps with CM, with no KRM and with a PRM
samps_CM_and_no_KRM_and_PRM = [samp for samp in samps_CM_and_no_KRM
                               if not potential_res_mut_filtered.isdisjoint(sample2mutation[samp])]

# samps with CM, with no KRM and no PRM
samps_CM_and_no_KRM_and_no_PRM = [samp for samp in samps_CM_and_no_KRM
                                  if potential_res_mut_filtered.isdisjoint(sample2mutation[samp])]

# n_KCM is derived as (all CM) - (PCM)
n_total_CM = len(comp_muts)
n_PCM = len(potential_comp_mut_filtered)
summary_dict = {
    "n_total_CM": n_total_CM,
    "n_PCM": n_PCM,
    "n_KCM": n_total_CM - n_PCM,
    "n_PRM": len(potential_res_mut_filtered),
    "n_samps_PRM": len(potential_res_mut_dict),
    "n_samps_CM": len(samps_CM),
    "n_samps_CM_and_KRM": len(samps_CM_and_KRM),
    "n_samps_CM_and_no_KRM": len(samps_CM_and_no_KRM),
    "n_samps_CM_and_no_KRM_and_PRM": len(samps_CM_and_no_KRM_and_PRM),
    "n_samps_CM_and_no_KRM_and_no_PRM": len(samps_CM_and_no_KRM_and_no_PRM),
}

print()
print("summary for", drug_of_interest)
for label, value in summary_dict.items():
    print(label, value)
print()



RM = any resistance mutation 
       KRM = known RM 
       PRM = potential RM 
       CM = any compensatory mutation 
       KCM = known CM 
       PCM = potential CM 


summary for isoniazid
n_total_CM 26
n_PCM 8
n_KCM 18
n_PRM 14
n_samps_PRM 74
n_samps_CM 515
n_samps_CM_and_KRM 343
n_samps_CM_and_no_KRM 172
n_samps_CM_and_no_KRM_and_PRM 25
n_samps_CM_and_no_KRM_and_no_PRM 147

['ERR2516197', 'ERR2516199', 'ERR1035291', 'ERR1034619', 'SRR6152960', 'SRR6397665', 'SRR6397630', 'SRR6398049', 'ERR2707193', 'ERR1199124', 'SRR7517806', 'SRR2024947', 'SRR2024988', 'ERR2179666', 'ERR2179658', 'ERR2179675', 'ERR2179677', 'SRR1723714', 'SRR1723776', 'SRR1723460', 'SAMN03246351', 'SRR1723510', 'SAMN03246450', 'SAMN03246659', 'SAMN03246286', 'SAMN03246584', 'SRR1723811', 'SAMN03246632', 'SRR1723437', 'SAMN03246321', 'SRR1723651', 'SRR1723932', 'SRR1723906', 'SRR960978', 'SRR6824412', 'SRR6824612', 'SRR6824473', 'SRR6824294', 'SRR6824295', 'SRR6824303', 'SRR6824677', 'SRR6824374', 'SRR6824430', 

In [50]:
515-343

172

In [None]:
# SUMMARY
# SUMMARY
# SUMMARY
# SUMMARY
# SUMMARY

In [None]:
# --------------------------------------------------------------------------------------------------------------

In [None]:
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS

In [None]:
co_gene = 'fabG1'
# tbdb entries for the drug of interest that fall in the co-occurring gene,
# as (gene, mutation) tuples
fabg_dr_mutations = set()
for var in tbdb_dict[drug_of_interest]:
    if var['Gene'] == co_gene:
        fabg_dr_mutations.add((var['Gene'], var['Mutation']))

In [None]:
# ----------------------------------------------------------------------
# Co-occurrence with fabG1 in samples with potential INH res. mutations
# ----------------------------------------------------------------------
# potential_res_mut_filtered
# mutation2sample
# sample2mutation
# meta_dict
# drug_of_interest

# For every sample with a potential resistance mutation, collect the fabG1 DR
# mutations it also carries (one inner list per sample)
per_sample_hits = [
    [v for v in sample2mutation[samp] if v in fabg_dr_mutations]
    for samp in potential_res_mut_dict
]

co_gene_variants = flat_list(per_sample_hits)

# Number of samples per fabG1 DR mutation
fabg_count = dict(Counter(co_gene_variants))
n_prm_samps = len(potential_res_mut_dict)

print("n samples having potential new resistance mutations which also have fabG1 DR mutations:")
print(fabg_count)
print("proportions:")
print({variant: round(n_hits/n_prm_samps, 3) for variant, n_hits in fabg_count.items()})

In [None]:
# --------------------------------------------------------------------------------------------
# Compare to p.Ser315Thr - get proportion of samples with p.Ser315Thr that don't have a fabG1
# --------------------------------------------------------------------------------------------

katg_ser315thr = ('katG', 'p.Ser315Thr')
ser315thr_samps = []            # every sample carrying Ser315Thr
Ser315Thr_fabg_samps = []       # ... that also carries a fabG1 DR mutation
Ser315Thr_no_fabg_samps = []    # ... that carries no fabG1 DR mutation
ser315thr_comp_mut_samps = []   # ... that also carries a compensatory mutation
for samp, mutations in sample2mutation.items():
    # Only Ser315Thr carriers are of interest in this comparison
    if katg_ser315thr not in mutations:
        continue

    # Total n samps with Ser315Thr
    ser315thr_samps.append(samp)

    if fabg_dr_mutations.isdisjoint(mutations):
        Ser315Thr_no_fabg_samps.append(samp)
    else:
        Ser315Thr_fabg_samps.append(samp)

    # How many of the S315 samples have comp mutations?
    if not compensatory_mutations['isoniazid'].isdisjoint(mutations):
        ser315thr_comp_mut_samps.append(samp)

n_ser315thr = len(ser315thr_samps)
Ser315Thr_fabg_prop = round(len(Ser315Thr_fabg_samps) / n_ser315thr, 3)

print("total n samps with a Ser315Thr mutation: ", n_ser315thr)
print("n samples with p.Ser315Thr and a fabG1 DR mutation: ", len(Ser315Thr_fabg_samps))
print("n samples with p.Ser315Thr and no fabG1 DR mutations: ", len(Ser315Thr_no_fabg_samps))
print("proportion: ", Ser315Thr_fabg_prop) # 0.158
print("n samps with a p.Ser315Thr mutation and a comp mutation: ", len(ser315thr_comp_mut_samps) )
print("proportion: ", round(len(ser315thr_comp_mut_samps) / n_ser315thr, 3))

# 2 measures - fitness cost and levels of res
# fabg1 gives extra resistance
# therefore how 'resistant' are the new ones?

In [None]:
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS

In [None]:
# --------------------------------------------------------------------------------------------------------------

In [None]:
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING

In [None]:
# {
#     (('rpoB','p.Ser450Leu')): 100
#     (('rpoB','p.Ser450Leu'),('rpoC','p.Val187Leu')): 10,
#     (('rpoB','p.Ser450Leu'),('rpoC','p.Thr300Ala')): 12,
# }

In [None]:
sample2mutation

In [None]:
from collections import Counter

In [None]:
# Count how many samples share each exact combination of mutations.
# The sets themselves are unhashable, so each one is turned into a string key.
# Sets have no canonical iteration order, so the variants are sorted first;
# otherwise the same combination could end up under several different keys.
rif_freq_table = dict(Counter(str(tuple(sorted(sample2mutation[samp]))) for samp in sample2mutation))


In [None]:
# # Directly from dictionary
# with open('results/rif_freq_table.json', 'w') as outfile:
#     outfile.write(json.dumps(rif_freq_table))

In [None]:
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING