In [1]:
# Move into the project directory so that the relative paths used by the cells
# below (e.g. "results/...", "../metadata/...") resolve from there.
%cd ~/comp_mut

/mnt/storage7/gary/comp_mut


In [2]:
# Clear all user-defined names from the kernel namespace (-f: no confirmation prompt),
# presumably so that the cells below do not depend on leftover state.
%reset -f

In [3]:
import json
from collections import defaultdict, Counter
import argparse
import os
from tqdm import tqdm
import sys
import csv
import pathogenprofiler as pp
import tbprofiler
from csv import DictReader
from collections import Counter
import requests
from contextlib import closing
import re
from python_scripts.utils import *

In [4]:
def get_vars_exclude(vars_exclude_file):
    """Build the list of (gene, mutation) pairs to exclude from the analysis.

    Two sources are concatenated:
      1. every line of the local file ``vars_exclude_file``, stripped and split
         on ',' and stored as a tuple;
      2. lineage-specific variants downloaded from the Fst = 1 results table on
         GitHub: for each top-level key of the parsed table (used as the gene
         name) and each of its rows, the row's ``aa_pos`` field is passed
         through ``reformat_mutations``.

    NOTE(review): the recorded notebook output contains pairs whose second
    element is None, so ``reformat_mutations`` apparently returns None for some
    ``aa_pos`` values -- confirm that this is intended.

    Args:
        vars_exclude_file: path of the local comma-separated exclusion file.

    Returns:
        list of tuples: the file entries first, followed by the
        (gene, reformatted mutation) pairs from the downloaded table.

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    # URL below is the results of all Fst = 1 variants from https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-020-00817-3
    fst_results_url = 'https://raw.githubusercontent.com/GaryNapier/tb-lineages/main/fst_results_clean_fst_1_for_paper.csv'
    # See https://www.codegrepper.com/code-examples/python/how+to+read+a+csv+file+from+a+url+with+python for pulling data from url
    with closing(requests.get(fst_results_url, stream=True)) as r:
        # Fail loudly on a 4xx/5xx answer instead of parsing an error page as CSV
        r.raise_for_status()
        f = (line.decode('utf-8') for line in r.iter_lines())
        # Must be consumed inside the `with`: the generator reads lazily from the open response
        fst_dict = csv_to_dict_multi(f)

    lin_specific_variants = [
        (gene, reformat_mutations(var['aa_pos']))
        for gene in fst_dict
        for var in fst_dict[gene]
    ]

    # Read in variants to be excluded (context manager so the handle is always closed)
    with open(vars_exclude_file) as exclude_fh:
        vars_exclude = [tuple(line.strip().split(',')) for line in exclude_fh]

    # Concat
    return vars_exclude + lin_specific_variants

def get_counts(meta,samples,col):
    """Tally the values found in metadata column ``col`` for the given samples.

    Args:
        meta: mapping sample -> mapping of column name -> value.
        samples: iterable of sample ids; each must be a key of ``meta``.
        col: name of the column to tally.

    Returns:
        dict mapping each observed value to the number of samples carrying it.
    """
    tally = Counter()
    for samp in samples:
        tally[meta[samp][col]] += 1
    return dict(tally)

def get_meta_proportion(meta,samples,column,targets):
    """Return the fraction of samples whose ``column`` value is one of ``targets``.

    Args:
        meta: mapping sample -> mapping of column name -> value.
        samples: iterable of sample ids to consider.
        column: metadata column to inspect.
        targets: iterable of values counted as hits.

    Returns:
        hits / total number of samples, rounded to 3 decimal places.

    Raises:
        ZeroDivisionError: if ``samples`` is empty.
    """
    counts = get_counts(meta,samples,column)
    n_hits = 0
    for value in targets:
        n_hits += counts.get(value,0)
    n_total = sum(counts.values())
    return round(n_hits/n_total, 3)

def filter_vars(variants, mutation2sample, meta_dict, drug_of_interest,
                min_samps=3, min_dst_prop=0.5, max_sensitive_prop=0.5, min_lins=2):
    """Keep the variants whose carrier samples pass the sample/DST/drtype/lineage filters.

    For every variant the carrier samples are looked up in ``mutation2sample``.
    A variant passes when all of the following hold:
      * it is carried by at least ``min_samps`` samples;
      * the proportion of carriers whose metadata column ``drug_of_interest``
        equals '1' is >= ``min_dst_prop`` (presumably '1' marks a resistant
        DST result -- confirm against the metadata file);
      * the proportion of carriers whose 'drtype' is 'Sensitive' is
        <= ``max_sensitive_prop``;
      * ``resolve_lineages`` applied to the carriers' 'sublin' counts yields
        at least ``min_lins`` distinct values.

    The default thresholds reproduce the previously hard-coded values
    (3 samples, 0.5, 0.5, more than one lineage).

    Args:
        variants: iterable of (gene, mutation) tuples to test.
        mutation2sample: mapping (gene, mutation) -> collection of sample ids.
            It is read with ``.get`` so that a ``defaultdict`` is not filled
            with empty entries for variants it has never seen.
        meta_dict: mapping sample -> metadata dict; must provide the
            ``drug_of_interest``, 'drtype' and 'sublin' columns for carriers.
        drug_of_interest: drug name, used as metadata column and stored in the stats.
        min_samps: minimum number of carrier samples.
        min_dst_prop: minimum proportion of carriers with DST value '1'.
        max_sensitive_prop: maximum proportion of carriers typed 'Sensitive'.
        min_lins: minimum number of distinct resolved lineages.

    Returns:
        tuple ``(variants_passed, stats_dict)``: the set of passing variants and
        a ``defaultdict(dict)`` mapping each passing variant to its statistics.
        Note that the 'dr_prop' entry holds the *sensitive* genotype proportion.
    """
    # Get stats for each variant
    stats_dict = defaultdict(dict)
    variants_passed = set()
    for var in variants:

        # Get proportion or number of samples (in the case of lineage) per potential mutation
        samps = mutation2sample.get(var, ())

        if len(samps) < min_samps: continue

        dst_proportion = get_meta_proportion(meta_dict,samps,drug_of_interest,['1'])
        sensitive_geno_proportion = get_meta_proportion(meta_dict,samps,'drtype',['Sensitive'])
        num_lins = len(set(resolve_lineages(get_counts(meta_dict,samps,'sublin'))))

        # Filter and add to dict
        passes = (dst_proportion >= min_dst_prop
                  and sensitive_geno_proportion <= max_sensitive_prop
                  and num_lins >= min_lins)
        if not passes:
            continue

        variants_passed.add(var)

        stats_dict[var] = {'drug': drug_of_interest,
                           'gene': var[0],
                           'mutation': var[1],
                           'gene_mutation': var[0] + '-' + var[1],
                           'n_samps' : len(samps),
                           'dst_prop': dst_proportion,
                           'dr_prop': sensitive_geno_proportion,
                           'n_lins': num_lins}

    return (variants_passed, stats_dict)

In [5]:
# Drug analysed throughout the notebook. The value is used as a metadata column
# name (DST result), as a key into the tbdb / known-compensatory-mutation tables
# and as part of the PCM input file name.
drug_of_interest = 'isoniazid'

In [6]:
# FILES
# Input/output locations used by the notebook. The commented-out block below is
# the argparse-based version of the same settings (script form).

# potential_comp_mut_file = args.potential_comp_mut_file
# metadata_file = args.metadata_file
# tbdb_file = args.tbdb_file
# drtypes_file = args.drtypes_file
# known_comp_mut_file = args.known_comp_mut_file
# tbprofiler_results_dir = args.tbprofiler_results_dir
# vars_exclude_file = args.vars_exclude_file
# potential_res_mut_outfile = args.potential_res_mut_outfile

# FILES
# Relative paths are resolved against the working directory set by the first cell.

# potential_comp_mut_file = "results/%s_novel_comp_mut_model_results.csv" % drug_of_interest
# Potential compensatory mutations (PCM) for the drug; read for its 'gene' and 'change' columns
PCM_file = "results/%s_novel_comp_mut_merged.csv" % drug_of_interest
# Per-sample metadata table
metadata_file = "../metadata/tb_data_18_02_2021.csv"
# tbdb table, read for its 'Drug', 'Gene' and 'Mutation' columns
tbdb_file = "../tbdb/tbdb.csv"
# JSON mapping used to standardise the 'drtype' reported by tbprofiler
drtypes_file = "../pipeline/db/dr_types.json"
# Known compensatory mutations, read for its 'Drug', 'Gene' and 'Mutation' columns
known_comp_mut_file = '../pipeline/db/compensatory_mutations.csv'
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this exists
tbprofiler_results_dir = '/mnt/storage7/jody/tb_ena/tbprofiler/freebayes/results/'
# Local list of (gene, mutation) pairs to exclude
vars_exclude_file = 'metadata/var_exclude_comp_mut.csv'
# NOTE(review): the two output paths below are not referenced by any cell shown in this notebook
PRM_stats_file = 'results/potential_res_mut_stats.csv'
PRM_samples_file = 'results/potential_res_mut_samps.csv'


In [7]:
# VARIABLES
# The commented-out lines are the argparse-based version of these settings.

# suffix = args.suffix
# drug_of_interest = args.drug_of_interest

# File-name suffix appended to a sample id to locate its tbprofiler results file
suffix = ".results.json"

In [8]:
# Read in the potential compensatory mutation (PCM) results file,
# keeping one (gene, change) tuple per row, in file order
with open(PCM_file, 'r') as f:
    PCM_list = [(row['gene'], row['change']) for row in csv.DictReader(f)]

In [9]:
# Load the metadata table through the project's csv_to_dict helper;
# the resulting mapping is indexed by sample id in the cells below
with open(metadata_file) as mf:
    meta_dict = csv_to_dict(mf)

# The sample ids are the keys of the metadata mapping
samples = list(meta_dict)

In [10]:
# Read in tbdb file.
# csv_to_dict_multi (from python_scripts.utils) is called with 'Drug' as second
# argument; later cells index the result by drug name and read the 'Gene' and
# 'Mutation' fields of each entry.
with open(tbdb_file, 'r') as f:
    tbdb_dict = csv_to_dict_multi(f, 'Drug')

# Scratch code kept for reference: list of known resistance mutations for the drug
# krm = [(var['Gene'], var['Mutation']) for var in tbdb_dict[drug_of_interest]]
# print(len(krm))
# print(krm[0:5])

In [11]:
# Read in DR types from json.
# The mapping is used below to translate the 'drtype' reported in each tbprofiler
# results file into the value stored in the metadata.
# A context manager is used so the file handle is closed deterministically.
with open(drtypes_file) as drtypes_fh:
    standardise_drtype = json.load(drtypes_fh)

In [12]:
# Get known compensatory mutations (KCM) of interest:
# drug -> set of (gene, mutation), restricted to the drug of interest
KCM = defaultdict(set)
# Context manager closes the handle; newline='' is what the csv module expects for file objects
with open(known_comp_mut_file, newline='') as kcm_fh:
    for row in csv.DictReader(kcm_fh):
        if row['Drug'] != drug_of_interest: continue
        KCM[row['Drug']].add((row['Gene'],row['Mutation']))

################################################################
# Number of known compensatory mutations for the drug (used in the summary table)
n_KCM = len(KCM[drug_of_interest])
################################################################

In [13]:
# Read in variants to exclude: entries of the local exclusion file followed by
# the lineage-specific variants downloaded by get_vars_exclude (needs network access).
vars_exclude = get_vars_exclude(vars_exclude_file)
# NOTE(review): this echoes the whole list; the recorded output contains pairs whose
# mutation is None -- confirm that reformat_mutations is expected to return None.
vars_exclude

[('katG', 'p.Arg463Leu'),
 ('katG', 'p.Trp300Cys'),
 ('pstP', 'p.Asp74Ala'),
 ('pstP', None),
 ('pstP', None),
 ('pstP', 'p.Ala205Val'),
 ('pstP', 'p.Ala44Val'),
 ('pstP', 'p.Arg283His'),
 ('Rv0021c', 'p.Asp179His'),
 ('Rv0021c', None),
 ('Rv0021c', None),
 ('Rv0061c', None),
 ('Rv0061c', 'p.Thr39Ala'),
 ('celA1', 'p.His38Asp'),
 ('celA1', 'p.Val76Gly'),
 ('celA1-Rv0063', None),
 ('Rv0063', 'p.Ser418Pro'),
 ('Rv0063', None),
 ('Rv0063', None),
 ('Rv0063', 'p.Ala238Ser'),
 ('Rv0063', 'p.Pro162Ala'),
 ('Rv0063', 'p.Ala155Thr'),
 ('Rv0063', 'p.Gln403*'),
 ('Rv0067c', 'p.Glu154Asp'),
 ('Rv0067c', 'p.Pro3Leu'),
 ('Rv0067c', 'p.Leu132Val'),
 ('Rv0067c', None),
 ('Rv0090', 'p.Ala163Pro'),
 ('nrp', None),
 ('nrp', 'p.Ser2432Asn'),
 ('nrp', 'p.Asp551Asn'),
 ('nrp', 'p.Cys2301Gly'),
 ('nrp', 'p.Arg379Cys'),
 ('nrp', 'p.Gly779Asp'),
 ('nrp', 'p.Leu655Met'),
 ('nrp', 'p.Ala812Thr'),
 ('nrp', 'p.Ala1214Val'),
 ('nrp', 'p.Pro1291Leu'),
 ('nrp', None),
 ('nrp', 'p.Leu1280Phe'),
 ('nrp', 'p.Ala2468Val

In [14]:
# Genes of interest: those tbdb associates with the drug of interest ...
genes = {var['Gene'] for var in tbdb_dict[drug_of_interest]}

# ... plus the genes of the known and of the potential compensatory mutations
genes |= {var[0] for var in KCM[drug_of_interest]}
genes |= {var[0] for var in PCM_list}

genes

{'ahpC', 'fabG1', 'inhA', 'kasA', 'katG'}

In [15]:
# Load mutation data using ('gene','change') as keys
#   mutation2sample:      (gene, change) -> set of samples carrying the variant
#   sample2mutation:      sample -> set of (gene, change) it carries
#   resistance_mutations: drug -> set of (gene, change) annotated with that drug
mutation2sample = defaultdict(set)
sample2mutation = defaultdict(set)
resistance_mutations = defaultdict(set)

# Sets give constant-time membership tests inside the per-variant loop
vars_exclude_set = set(vars_exclude)
PCM_set = set(PCM_list)

for s in tqdm(samples):
    file = "%s/%s%s" % (tbprofiler_results_dir, s, suffix)
    # Samples without a tbprofiler results file contribute nothing
    if not os.path.isfile(file): continue

    # Context manager so one handle per sample is not left open
    with open(file) as results_fh:
        data = json.load(results_fh)

    # Skip mixed samps
    if ';' in data['sublin']: continue

    # Update metadata
    meta_dict[s]['drtype'] = standardise_drtype[data['drtype']]
    meta_dict[s]['sublin'] = data['sublin']

    # Variants are only recorded for samples that passed both checks above, so every
    # sample in mutation2sample / sample2mutation has 'drtype' and 'sublin' metadata
    for var in data['dr_variants'] + data['other_variants']:
        if var['gene'] not in genes: continue
        if var['freq'] < 0.7: continue
        if var['type']=='synonymous_variant': continue
        key = (var['gene'],var['change'])
        if key in vars_exclude_set: continue
        mutation2sample[key].add(s)
        sample2mutation[s].add(key)
        if "drugs" in var:
            for d in var["drugs"]:
                # Exact comparison: `not in` on a string would be a substring test
                if d["drug"] != drug_of_interest: continue
                # Known / potential compensatory mutations are not counted as resistance mutations
                if key in KCM[d["drug"]] or key in PCM_set: continue
                resistance_mutations[d["drug"]].add(key)

100%|██████████| 32735/32735 [00:14<00:00, 2196.45it/s]


In [16]:
# TESTING
# Scratch cell: load the tbprofiler results of one hard-coded sample for manual inspection.
# Note that it rebinds the module-level names `samp`, `file` and `data`.
samp = 'ERR2864254'

file = "%s/%s%s" % (tbprofiler_results_dir, samp, suffix)

# Context manager so the file handle is closed
with open(file) as results_fh:
    data = json.load(results_fh)

# for var in data["other_variants"]:
#     print(var['change'])
#     print(var['type'])
#     print('---')


In [17]:
# Classify potential compensatory mutations (PCM) and filter them.
# The GLM models (filter_novel_comp_mut.R) are only the first step in identifying
# 'interesting' compensatory mutations; each mutation still has to be checked
# against the tbprofiler results, e.g. lineage-specific mutations are filtered out.

# First drop the PCMs that are already known compensatory mutations (KCM)
PCM_list = [
    var for var in PCM_list
    if var not in KCM[drug_of_interest]
]

########################################################
# Counted after the KCM removal above, before filter_vars is applied
n_PCM_before_filtering = len(PCM_list)
########################################################

PCM_filtered, PCM_stats = filter_vars(PCM_list, mutation2sample, meta_dict, drug_of_interest)

In [18]:
# Show the potential compensatory mutations that passed filter_vars,
# followed by the per-variant statistics returned alongside them.
print(" --- list of potential comp mutations after filtering --- ")
print(PCM_filtered)
print(" --- stats: --- ")
print(PCM_stats)
print()

 --- list of potential comp mutations after filtering --- 
{('ahpC', 'c.-52C>A'), ('ahpC', 'c.-51G>A'), ('ahpC', 'c.-72C>T'), ('ahpC', 'p.Gly32Asp')}
 --- stats: --- 
defaultdict(<class 'dict'>, {('ahpC', 'c.-51G>A'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-51G>A', 'gene_mutation': 'ahpC-c.-51G>A', 'n_samps': 45, 'dst_prop': 0.644, 'dr_prop': 0.0, 'n_lins': 10}, ('ahpC', 'c.-52C>A'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-52C>A', 'gene_mutation': 'ahpC-c.-52C>A', 'n_samps': 49, 'dst_prop': 0.51, 'dr_prop': 0.0, 'n_lins': 6}, ('ahpC', 'c.-72C>T'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'c.-72C>T', 'gene_mutation': 'ahpC-c.-72C>T', 'n_samps': 36, 'dst_prop': 0.5, 'dr_prop': 0.0, 'n_lins': 10}, ('ahpC', 'p.Gly32Asp'): {'drug': 'isoniazid', 'gene': 'ahpC', 'mutation': 'p.Gly32Asp', 'gene_mutation': 'ahpC-p.Gly32Asp', 'n_samps': 10, 'dst_prop': 0.7, 'dr_prop': 0.0, 'n_lins': 2}})



In [19]:
# CM = all compensatory mutations used from here on: the known ones (KCM) for the
# drug of interest together with the potential ones (PCM) that passed the filtering
CM = KCM[drug_of_interest] | PCM_filtered
############################################################################
n_CM_after_filtering = len(CM)
############################################################################

In [20]:
# ** MAIN BIT **
# ** Go over all the samples and get the potential resistance mutations (PRM)
#    from the presence of comp mutations **
PRM = set()
# set up sample count vectors
samps_CM = []
for s in tqdm(samples):
    # Split the variants of the sample into compensatory, (known) resistance and 'other'.
    # A variant is 'other' only when it is in neither of the first two classes.
    comp_var, res_var, other_vars = [], [], []
    for var in sample2mutation[s]:
        in_cm = var in CM
        in_rm = var in resistance_mutations[drug_of_interest]
        if in_cm:
            comp_var.append(var)
        if in_rm:
            res_var.append(var)
        if not (in_cm or in_rm):
            other_vars.append(var)

    # Only samples with at least one comp variant and no (known) resistance variant are of interest
    if not comp_var or res_var:
        continue

    # If there are no 'other' variants print the sample and the comp variants
    if not other_vars:
        print("Sample with at least one comp. mut. but no res. mutations:")
        print("samp:", s, "comp. mut.:", comp_var)

    # Store the 'other' vars as potential resistance variants
    PRM.update(other_vars)


100%|██████████| 32735/32735 [00:00<00:00, 332220.79it/s]

Sample with at least one comp. mut. but no res. mutations:
samp: ERR2516197 comp. mut.: [('ahpC', 'p.Leu191Arg')]
Sample with at least one comp. mut. but no res. mutations:
samp: ERR2179658 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SAMN03246450 comp. mut.: [('ahpC', 'c.-52C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SAMN03246659 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR1723651 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR6824452 comp. mut.: [('ahpC', 'c.-48G>A')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR6824287 comp. mut.: [('ahpC', 'c.-81C>T')]
Sample with at least one comp. mut. but no res. mutations:
samp: SRR8651572 comp. mut.: [('ahpC', 'c.-48G>A')]
Sample with at least one comp. mut. but no res. mutations:
samp: ERR216923 comp. mut.: [('ahpC', 'c.-48G>




In [21]:
# Filter the potential resistance variants in the same way as filtering the potential comp. variants
# (same filter_vars call, hence the same default thresholds); returns the set of
# passing variants and their per-variant statistics.
PRM_filtered, PRM_stats = filter_vars(PRM, mutation2sample, meta_dict, drug_of_interest)

In [22]:
# WRITE FILES

# Make a dict of samps and metadata for samps with the potential res. mutations.
# NOTE(review): the dict is keyed by sample, so a sample carrying several filtered
# PRM variants keeps only the entry of the variant visited last -- confirm this is intended.
# NOTE(review): nothing is written to disk in this cell despite its heading.
PRM_dict = {
    samp: {'wgs_id': samp,
           'drug': drug_of_interest,
           'gene': var[0],
           'mutation': var[1],
           'gene_mutation': var[0] + '-' + var[1],
           'main_lineage': meta_dict[samp]['main_lineage'],
           'sublin':meta_dict[samp]['sublin'],
           'country_code': meta_dict[samp]['country_code'],
           'drtype': meta_dict[samp]['drtype'],
           'dst': meta_dict[samp][drug_of_interest]}
    for var in PRM_filtered
    for samp in mutation2sample[var]
}

In [23]:
# SUMMARY vvv
# SUMMARY vvv
# SUMMARY vvv
# SUMMARY vvv
# SUMMARY vvv

In [24]:
# print()
# print("RM = any resistance mutation \n \
#       KRM = known RM \n \
#       PRM = potential RM \n \
#       CM = any compensatory mutation \n \
#       KCM = known CM \n \
#       PCM = potential CM \n")

# # samps with any CM
# samps_CM = []
# for samp in sample2mutation:
#     if any(var in sample2mutation[samp] for var in CM):
#         samps_CM.append(samp)

# # samps with CM and any KRM
# samps_CM_and_KRM = []
# for samp in samps_CM:
#     if any(var in sample2mutation[samp] for var in resistance_mutations[drug_of_interest]):
#         samps_CM_and_KRM.append(samp)

# # samps with CM and no KRM
# samps_CM_and_no_KRM = []
# for samp in samps_CM:
#     if not any(var in sample2mutation[samp] for var in resistance_mutations[drug_of_interest]):
#         samps_CM_and_no_KRM.append(samp)

# # samps with CM, with no KRM and PRM
# samps_CM_and_no_KRM_and_PRM = []
# for samp in samps_CM_and_no_KRM:
#     if any(var in sample2mutation[samp] for var in PRM_filtered):
#         samps_CM_and_no_KRM_and_PRM.append(samp)

# # samps with CM, with no KRM and no PRM
# samps_CM_and_no_KRM_and_no_PRM = []
# for samp in samps_CM_and_no_KRM:
#     if not any(var in sample2mutation[samp] for var in PRM_filtered):
#         samps_CM_and_no_KRM_and_no_PRM.append(samp)


# summary_dict = {"n_total_CM": len(CM),
#                 "n_PCM": len(PCM_filtered),
#                 "n_KCM": len(CM)-len(PCM_filtered),
#                 "n_PRM": len(PRM_filtered), 
#                 "n_samps_PRM": len(PRM_dict), 
#                 "n_samps_CM": len(samps_CM), 
#                 "n_samps_CM_and_KRM": len(samps_CM_and_KRM),
#                 "n_samps_CM_and_no_KRM": len(samps_CM_and_no_KRM), 
#                 "n_samps_CM_and_no_KRM_and_PRM": len(samps_CM_and_no_KRM_and_PRM), 
#                 "n_samps_CM_and_no_KRM_and_no_PRM": len(samps_CM_and_no_KRM_and_no_PRM)}

# print()
# print("summary for", drug_of_interest)
# for x in summary_dict:
#     print(x, summary_dict[x])
# print()


In [29]:
# ----------- BINARY TABLE SUMMARY --------------
# Abbreviations: CM = compensatory mutation, KCM/PCM = known/potential CM,
# KRM/PRM = known/potential resistance mutation.

# Per sample: its variants split by class. 'other_vars' holds *all* variants of the sample.
binary_table = {}
for samp, samp_vars in sample2mutation.items():
    binary_table[samp] = {
        'CM': [var for var in samp_vars if var in CM],
        'KRM': [var for var in samp_vars if var in resistance_mutations[drug_of_interest]],
        'PRM': [var for var in samp_vars if var in PRM_filtered],
        'other_vars': list(samp_vars),
    }

# vars: distinct variants of each class that actually occur in the samples

# CM
CM_in_samps = set(flat_list([entry['CM'] for entry in binary_table.values()]))
KCM_in_samps = CM_in_samps.intersection(KCM[drug_of_interest])
PCM_in_samps = CM_in_samps.intersection(PCM_filtered)
# RM
KRM_in_samps = set(flat_list([entry['KRM'] for entry in binary_table.values()]))
PRM_in_samps = set(flat_list([entry['PRM'] for entry in binary_table.values()]))

# samps: nested classification of the samples carrying at least one CM
samps_CM = []
samps_CM_and_KRM = []
samps_CM_and_no_KRM = []
samps_CM_and_no_KRM_and_PRM = []
samps_CM_and_no_KRM_and_no_PRM = []
for samp, entry in binary_table.items():
    if not entry['CM']:
        continue
    samps_CM.append(samp)

    if entry['KRM']:
        samps_CM_and_KRM.append(samp)
        continue
    samps_CM_and_no_KRM.append(samp)

    if entry['PRM']:
        samps_CM_and_no_KRM_and_PRM.append(samp)
    else:
        samps_CM_and_no_KRM_and_no_PRM.append(samp)

summary_dict = {
    # sizes of the mutation lists
    'n_CM_in_list_before_filtering': n_KCM + n_PCM_before_filtering,
    'n_KCM_in_list': n_KCM,
    'n_PCM_in_list_before_filtering': n_PCM_before_filtering,

    'n_CM_in_list_after_filtering': n_CM_after_filtering,
    'n_PCM_in_list_after_filtering': len(PCM_filtered),

    # distinct variants seen in the samples
    'n_CM_in_samps': len(CM_in_samps),
    'n_KCM_in_samps': len(KCM_in_samps),
    'n_PCM_in_samps': len(PCM_in_samps),

    'n_KRM_in_samps': len(KRM_in_samps),
    'n_PRM_in_samps': len(PRM_in_samps),

    # sample counts
    "n_samps_PRM": len(PRM_dict),

    "n_samps_CM": len(samps_CM),
    "n_samps_CM_and_KRM": len(samps_CM_and_KRM),
    "n_samps_CM_and_no_KRM": len(samps_CM_and_no_KRM),
    "n_samps_CM_and_no_KRM_and_PRM": len(samps_CM_and_no_KRM_and_PRM),
    "n_samps_CM_and_no_KRM_and_no_PRM": len(samps_CM_and_no_KRM_and_no_PRM),
}

print()
print("summary for", drug_of_interest)
for name, value in summary_dict.items():
    print(name, value)
print()

# write out binary_table
# with open(samps_vars_file, 'w', newline='') as f:
#     writer = csv.DictWriter(f, fieldnames = list(get_embedded_keys(binary_table)))
#     writer.writeheader()
#     for row in binary_table:
#         writer.writerow(binary_table[row])


summary for isoniazid
n_CM_in_list_before_filtering 33
n_KCM_in_list 22
n_PCM_in_list_before_filtering 11
n_CM_in_list_after_filtering 26
n_PCM_in_list_after_filtering 4
n_CM_in_samps 15
n_KCM_in_samps 11
n_PCM_in_samps 4
n_KRM_in_samps 208
n_PRM_in_samps 14
n_samps_PRM 74
n_samps_CM 515
n_samps_CM_and_KRM 343
n_samps_CM_and_no_KRM 172
n_samps_CM_and_no_KRM_and_PRM 25
n_samps_CM_and_no_KRM_and_no_PRM 147

dict_keys(['CM', 'KRM', 'PRM', 'other_vars'])


In [28]:
# RARE mutations in the DR genes for the drug of interest,
# i.e. samples with a CM but no KRM and no PRM (samps_CM_and_no_KRM_and_no_PRM).
# These samples will likely carry a mutation in the relevant DR genes because they
# have a CM, but it was filtered out: carried by only one or two samples, or lineage specific.
# NOTE(review): no gene filter is applied (the katG-only variant is commented out),
# so each list also contains the sample's compensatory mutations themselves.

# Pull all variants of these samples, one list per sample
rare_vars = []
for samp in samps_CM_and_no_KRM_and_no_PRM:
    samp_vars = list(sample2mutation[samp])
#     samp_vars = [var for var in sample2mutation[samp] if var[0] == 'katG']
    if samp_vars:
        rare_vars.append(samp_vars)

# Number of samples contributing at least one variant
n_samps_rare_vars = len(rare_vars)

print(rare_vars)

# Occurrences of each distinct variant across these samples
rare_var_cnt = Counter(flat_list(rare_vars))
# How many distinct variants are in the list?
n_rare_vars = len(rare_var_cnt)
# Table of counts: occurrence count -> number of variants seen that many times
n_n_rare_vars = Counter(rare_var_cnt.values())

print()
print(" --- RARE MUTATIONS IN", drug_of_interest, "---")
print("n samps with a rare", drug_of_interest,"mutation :", n_samps_rare_vars)
print("n distinct rare", drug_of_interest, "vars:", n_rare_vars)
print("table of rare", drug_of_interest, "var occurrences:")
print(n_n_rare_vars)
print()

[[('ahpC', 'p.Leu191Arg')], [('kasA', 'c.-39C>T'), ('ahpC', 'c.-52C>T')], [('ahpC', 'c.-74G>A'), ('katG', 'p.Thr394Pro')], [('katG', 'p.Asp735Tyr'), ('ahpC', 'c.-54C>T')], [('ahpC', 'c.-48G>A'), ('katG', 'p.Thr625Lys')], [('katG', 'p.Leu43Arg'), ('ahpC', 'c.-72C>T')], [('katG', 'p.Leu43Arg'), ('ahpC', 'c.-72C>T')], [('ahpC', 'c.-48G>A'), ('katG', 'c.1438_1440delGCG')], [('ahpC', 'c.-51G>A'), ('katG', 'p.Gly182Arg')], [('ahpC', 'c.-51G>A'), ('katG', 'p.Gly182Arg')], [('katG', 'p.Asp329Ala'), ('ahpC', 'c.-48G>A'), ('inhA', 'c.-154G>A')], [('katG', 'p.Pro136Leu'), ('ahpC', 'c.-48G>A'), ('inhA', 'c.-154G>A')], [('katG', 'p.Pro136Leu'), ('ahpC', 'c.-48G>A'), ('inhA', 'c.-154G>A')], [('katG', 'p.Arg385Pro'), ('ahpC', 'c.-52C>T')], [('ahpC', 'c.-81C>T')], [('ahpC', 'c.-51G>A'), ('katG', 'p.Gly186Ser')], [('katG', 'c.984_998delGGACAACAGTTTCCT'), ('ahpC', 'c.-81C>T')], [('katG', 'p.His400Pro'), ('ahpC', 'c.-52C>T')], [('ahpC', 'c.-57C>T'), ('katG', 'p.Leu521Pro')], [('katG', 'p.Ala93Thr'), ('ka

In [None]:
# SUMMARY ^^^
# SUMMARY ^^^
# SUMMARY ^^^
# SUMMARY ^^^
# SUMMARY ^^^

In [None]:
# --------------------------------------------------------------------------------------------------------------

In [None]:
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS

In [None]:
# Gene whose co-occurrence with the potential resistance mutations is examined
co_gene = 'fabG1'

# (gene, mutation) pairs that tbdb lists for the drug of interest in that gene
fabg_dr_mutations = set()
for var in tbdb_dict[drug_of_interest]:
    if var['Gene'] == co_gene:
        fabg_dr_mutations.add((var['Gene'], var['Mutation']))

In [None]:
# ----------------------------------------------------------------------
# Co-occurrence with fabG1 in samples with potential INH res. mutations
# ----------------------------------------------------------------------
# Inputs: PRM_dict (samples carrying a filtered PRM), sample2mutation, fabg_dr_mutations

# fabG1 DR mutations carried by each PRM sample, flattened into one list
co_gene_variants = flat_list([
    [v for v in sample2mutation[samp] if v in fabg_dr_mutations]
    for samp in PRM_dict
])

# fabG1 DR mutation -> number of PRM samples carrying it
fabg_count = dict(Counter(co_gene_variants))

print("n samples having potential new resistance mutations which also have fabG1 DR mutations:")
print(fabg_count)
print("proportions:")
print({mutation: round(n_samps/len(PRM_dict), 3) for mutation, n_samps in fabg_count.items()})

In [None]:
# --------------------------------------------------------------------------------------------
# Compare to p.Ser315Thr - get proportion of samples with p.Ser315Thr that don't have a fabG1
# --------------------------------------------------------------------------------------------

katg_ser315thr = ('katG', 'p.Ser315Thr')
ser315thr_samps = []           # all samples carrying katG p.Ser315Thr
Ser315Thr_fabg_samps = []      # ... that also carry a fabG1 DR mutation
Ser315Thr_no_fabg_samps = []   # ... that carry no fabG1 DR mutation
ser315thr_comp_mut_samps = []  # ... that also carry a compensatory mutation
for samp in sample2mutation:
    mutations = sample2mutation[samp]

    # Everything below only concerns samples with Ser315Thr
    if katg_ser315thr not in mutations:
        continue

    # Total n samps with Ser315Thr
    ser315thr_samps.append(samp)

    if any(var in mutations for var in fabg_dr_mutations):
        Ser315Thr_fabg_samps.append(samp)
    else:
        Ser315Thr_no_fabg_samps.append(samp)

    # How many of the S315 samples have comp mutations?
    # Uses CM (known + filtered potential compensatory mutations), the set the rest of the
    # notebook uses; the previous name `compensatory_mutations` is not defined in any cell here.
    if any(var in mutations for var in CM):
        ser315thr_comp_mut_samps.append(samp)

# Proportions are reported as nan when no sample carries Ser315Thr (avoids ZeroDivisionError)
n_ser315thr = len(ser315thr_samps)
if n_ser315thr:
    Ser315Thr_fabg_prop = round(len(Ser315Thr_fabg_samps) / n_ser315thr, 3)
    ser315thr_comp_mut_prop = round(len(ser315thr_comp_mut_samps) / n_ser315thr, 3)
else:
    Ser315Thr_fabg_prop = float('nan')
    ser315thr_comp_mut_prop = float('nan')

print("total n samps with a Ser315Thr mutation: ", len(ser315thr_samps))
print("n samples with p.Ser315Thr and a fabG1 DR mutation: ", len(Ser315Thr_fabg_samps))
print("n samples with p.Ser315Thr and no fabG1 DR mutations: ", len(Ser315Thr_no_fabg_samps))
print("proportion: ", Ser315Thr_fabg_prop) # 0.158
print("n samps with a p.Ser315Thr mutation and a comp mutation: ", len(ser315thr_comp_mut_samps) )
print("proportion: ", ser315thr_comp_mut_prop)

# 2 measures - fitness cost and levels of res
# fabg1 gives extra resistance
# therefore how 'resistant' are the new ones?

In [None]:
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS
# KATG POST-HOC ANALYSIS

In [None]:
# --------------------------------------------------------------------------------------------------------------

In [None]:
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING

In [None]:
# {
#     (('rpoB','p.Ser450Leu'),): 100,
#     (('rpoB','p.Ser450Leu'),('rpoC','p.Val187Leu')): 10,
#     (('rpoB','p.Ser450Leu'),('rpoC','p.Thr300Ala')): 12,
# }

In [None]:
# Preview a handful of entries instead of echoing the whole sample -> mutations
# mapping (one entry per sample, so the full repr floods the output)
dict(list(sample2mutation.items())[:5])

In [None]:
from collections import Counter

In [None]:
# Frequency table of mutation combinations: how many samples carry each exact set of variants.
# Sets are unhashable, so they cannot be counted directly:
# rif_freq_table = dict(Counter([sample2mutation[samp] for samp in sample2mutation]))

# Each combination is keyed by the string form of its *sorted* tuple. Sorting is needed
# because set iteration order is not canonical: two samples with the same mutations
# could otherwise produce different keys and be counted separately.
# NOTE(review): assumes every (gene, change) pair holds comparable values (strings).
rif_freq_table = dict(Counter(
    str(tuple(sorted(mutations))) for mutations in sample2mutation.values()
))


In [None]:
# # Directly from dictionary
# with open('results/rif_freq_table.json', 'w') as outfile:
#     outfile.write(json.dumps(rif_freq_table))

In [None]:
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING
# RIFAMPICIN TESTING