# Summary
In this notebook the strain mappings file for the data from Crüsemann et al. 2017, named Crusemann dataset, is created.

1. Merge strain_mapping files from PoDP of projects MSV000078836 MSV000078839 MSV000079284. NPLinker (through docker) was run on these three accessions. AS3 output files for all strains and AS6 output for most strains are also provided to include in the merged strain mappings file.

## 1. Getting correct strain mappings file
- Merge the three accessions' strain_mappings.csv
- Sorting out AS files, with merged AS6 and AS3 output
- Writing out a bash file with all the files to be copied

In [28]:
import os
import sys
import glob
sys.path.append('../../prototype')
from nplinker.nplinker import NPLinker

In [197]:
new_data_path = "/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/"
os.path.isdir(new_data_path)

True

### Merge strain mappings from the three accessions

In [180]:
## Merging 3 PoDP strain mappings files
podp_data_dummy = "/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/nplinker_data/pairedomics/extracted/{}/strain_mappings.csv"
names = ["MSV000078836", "MSV000078839", "MSV000079284"]
str_map_files = []
for name in names:
    str_map_files.append(podp_data_dummy.format(name))
str_map_files

['/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/nplinker_data/pairedomics/extracted/MSV000078836/strain_mappings.csv',
 '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/nplinker_data/pairedomics/extracted/MSV000078839/strain_mappings.csv',
 '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/nplinker_data/pairedomics/extracted/MSV000079284/strain_mappings.csv']

In [279]:
read_strain_mappings = []
for str_map_file in str_map_files:
    str_map = {}
    with open(str_map_file) as inf:
        for line in inf:
            line = line.strip().split(',')
            add_mappings = line[1:]
            str_mappings = []
            # make sure both GCA and GCF end up in the mappings as I don't know which is used for AS6
            for str_mapping in add_mappings:
                str_mappings.append(str_mapping)
                if str_mapping.startswith("GCA_"):
                    genbank_refseq = ("GCF" + str_mapping[3:]).split(".")[0]
                    if genbank_refseq not in str_mappings:
                        str_mappings.append(genbank_refseq)
                if str_mapping.startswith("GCF_"):
                    genbank_refseq = ("GCA" + str_mapping[3:]).split(".")[0]
                    if genbank_refseq not in str_mappings:
                        str_mappings.append(genbank_refseq)
            str_map[line[0]] = str_mappings
    read_strain_mappings.append(str_map)
len(read_strain_mappings), list(read_strain_mappings[0].items())[0]

(3,
 ('Salinispora arenicola CNT005',
  ['NZ_AZWU01000030',
   'NZ_AZWU01000054',
   'NZ_AZWU01000052',
   'NZ_AZWU01000039',
   'NZ_KI911492',
   'NZ_AZWU01000029',
   'NZ_AZWU01000023',
   'NZ_KI911494',
   'NZ_AZWU01000018',
   'T005-3-Me.mzXML',
   'NZ_AZWU01000045',
   'NZ_AZWU01000031',
   'NZ_KI911493',
   'T005-3-Bu.mzXML',
   'NZ_AZWU01000036',
   'NZ_AZWU01000038',
   'NZ_AZWU01000021',
   'GCF_000514895',
   'GCA_000514895',
   'GCF_000514895.1',
   'NZ_AZWU01000007',
   'NZ_AZWU01000019',
   'NZ_AZWU01000064',
   'NZ_AZWU01000020',
   'NZ_AZWU01000017',
   'NZ_AZWU01000047',
   'T005-3-EA.mzXML',
   'NZ_AZWU01000046',
   'NZ_AZWU01000022',
   'NZ_KI911490',
   'NZ_AZWU01000037']))

### Merge AS3 files to strains mapping
- file originating from Crusemann study/MolNetEnhancer study of this data

In [282]:
# file contains AS3_id,strain_id -> dict(strain_id: AS3_id)
as3_mappings_file = os.path.join(new_data_path, "strain_ids_as3.csv")
print(os.path.isfile(as3_mappings_file))
as3_mappings = {}
with open(as3_mappings_file) as inf:
    for line in inf:
        line = line.strip()
        if line:
            line = line.split(",")
            add_mappings = [line[0]]
            str_mappings = []
            # make sure both GCA and GCF end up in the mappings as I don't know which is used for AS6
            for str_mapping in add_mappings:
                str_mappings.append(str_mapping)
                if str_mapping.startswith("GCA_"):
                    genbank_refseq = ("GCF" + str_mapping[3:]).split(".")[0]
                    if genbank_refseq not in str_mappings:
                        str_mappings.append(genbank_refseq)
                if str_mapping.startswith("GCF_"):
                    genbank_refseq = ("GCA" + str_mapping[3:]).split(".")[0]
                    if genbank_refseq not in str_mappings:
                        str_mappings.append(genbank_refseq)
            as3_mappings[line[1]] = str_mappings
print(list(as3_mappings.items())[:10])

True
[('CNT148', ['AZWJ01000000']), ('CNY280', ['ARHH01000000']), ('CNB440', ['GCA_000016425', 'GCF_000016425']), ('CNT084', ['AZWA01000000']), ('CNH898', ['AZXH01000000']), ('CNB091', ['ARJI01000000']), ('CNR107', ['ARKT01000000']), ('CNY202', ['AXVR01000001']), ('CNT796', ['AZWM01000000']), ('CNY230', ['35133'])]


In [283]:
# take first strain mapping as ref and add the others
from copy import deepcopy
ref_map = deepcopy(read_strain_mappings[0])
read_strain_mappings.append(as3_mappings)

for add_map in read_strain_mappings[1:]:
    print("Adding new strain mappings")
    for ori_strain, mapping in add_map.items():
        strain = ori_strain
        ref_mapping = ref_map.get(strain, None)
        if not ref_mapping:
            # only strain name present in add_map
            full_strain = [ref_strain for ref_strain in ref_map if ref_strain.endswith(strain)]
            if full_strain:
                strain = full_strain[0]
                ref_mapping = ref_map.get(strain, None)
                print(" mapped inpartial strain names:", strain, ori_strain)
        if not ref_mapping:
            # only strain name present in ref_map
            partial_strain = [ref_strain for ref_strain in ref_map if strain.endswith(ref_strain)]
            if partial_strain:
                strain = partial_strain[0]
                ref_mapping = ref_map.get(strain, None)
                print(" mapped inpartial strain names:", strain, ori_strain)

        # sure that add_map strain not in ref_map: add to ref_map or merge when ref_mapping exists
        if ref_mapping:
            # merge add_map in ref_map
            for map_elem in mapping:
                if map_elem not in ref_mapping:
                    ref_mapping.append(map_elem)
            ref_map[strain] = ref_mapping
        else:
            # add add_map in ref_map
            ref_map[strain] = mapping
print(len(ref_map))

Adding new strain mappings
Adding new strain mappings
Adding new strain mappings
 mapped inpartial strain names: Salinispora pacifica CNT148 CNT148
 mapped inpartial strain names: Salinispora arenicola CNY280 CNY280
 mapped inpartial strain names: Salinispora tropica CNB440 CNB440
 mapped inpartial strain names: Salinispora pacifica CNT084 CNT084
 mapped inpartial strain names: Salinispora tropica CNH898 CNH898
 mapped inpartial strain names: Streptomyces sp. CNB091 CNB091
 mapped inpartial strain names: Salinispora arenicola CNR107 CNR107
 mapped inpartial strain names: Salinispora pacifica CNY202 CNY202
 mapped inpartial strain names: Salinispora pacifica CNT796 CNT796
 mapped inpartial strain names: Salinispora arenicola CNY230 CNY230
 mapped inpartial strain names: Salinispora pacifica CNT584 CNT584
 mapped inpartial strain names: Salinispora tropica CNY012 CNY012
 mapped inpartial strain names: Salinispora arenicola CNH964 CNH964
 mapped inpartial strain names: Salinispora pacific

## Sort out new (as6) or old (as3) files to antismash folder
- I ran AS6 on the RefSeq/GenBank entries in the strain mappings files of the three accessions
- I already had AS3 output for all strains in these accessions - linked through strain_ids_as3.csv
- Alternatively, use the AS-db output as automatically downloaded by NPLinker
- First figure out if there are AS6 runs on additional refseq/genbank accessions that are not yet in the strain mappings file
- New antismash results for any of the strains with RefSeq/GenBank can also be easily added now by just adding an assembly to the AS6 folder and maybe rescuing it to the correct strain mapping.

In [284]:
as6_loc = os.path.join(new_data_path, "AS6_assemblies_MSV000078836_MSV000078839_MSV000079284")
as6_dirs = glob.glob(os.path.join(as6_loc, "*"))

as3_dir = os.path.join(new_data_path, "as3_files")
as3_files = glob.glob(os.path.join(as3_dir, "*.gbk"))

print(os.path.isdir(as6_loc), os.path.isdir(as3_dir))


antismash_dir = os.path.join(new_data_path, "antismash")
if not os.path.isdir(antismash_dir):
    os.mkdir(antismash_dir)

True True


### Are there any RefSeq/GenBank accessions in AS folder that are not in strain mappings yet?


In [285]:
# Are there any RefSeq/GenBank accessions in AS folder that are not in strain mappings yet?
accs = []
for vals in ref_map.values():
    for val in vals:
        if val.startswith("GCF_") or val.startswith("GCA_"):
            split_val = "GCF" + val.split(".1")[0][3:]
            if split_val not in accs:
                accs.append(split_val)
                accs.append("GCA" + split_val[3:])
print(len(accs))  # double as both GCA and GCF
passed = []
for acc in accs:
    passed_as6_dir = [as6_dir for as6_dir in as6_dirs if acc in as6_dir]
    if passed_as6_dir:
        passed.append(passed_as6_dir[0])
not_passed = [as6_dir for as6_dir in as6_dirs if as6_dir not in passed]
len(not_passed)

170


20

### Look in those 20 AS outputs to see which organism they are

In [286]:
# read strain names from 20 AS outputs
# as6_file_dict = {}
as6_dir_strain_dict = {}
for as6_dir in as6_dirs:
    gbk_name = os.path.split(as6_dir)[1] + ".gbk"
    main_gbk = os.path.join(as6_dir, gbk_name)
    org_names = []
    with open(main_gbk) as inf:
        for x in range(500):
            line = inf.readline().strip()
            # get source and organism
            if line.startswith("SOURCE") or line.startswith("ORGANISM"):
                org_name = line.partition(" ")[-1].strip()
                if org_name not in org_names:
                    org_names.append(org_name)
    as6_dir_strain_dict[as6_dir] = org_names
#     for org_name in org_names:
#         as6_file_dict[org_name] = gbk_name

In [287]:
fails = 0
added = 0
for as6_dir in not_passed:
    strain_name = as6_dir_strain_dict[as6_dir][0]  # I know they're all len(1)
    strain_name_mapping = ref_map.get(strain_name)
    
    # manually rescue
    if strain_name == "Streptomyces himastatinicus ATCC 53653":
        strain_name = "Streptomyces hygroscopicus"
        strain_name_mapping = ref_map.get(strain_name)
    elif strain_name == "Streptomyces afghaniensis 772":
        strain_name = "Streptomyces afghanensis"
        strain_name_mapping = ref_map.get(strain_name)
    elif strain_name == "Streptomyces filamentosus NRRL 11379":
        strain_name = "Streptomyces roseosporus"
        strain_name_mapping = ref_map.get(strain_name)
    
    # rescue by chopping off info not in strain_mappings
    if not strain_name_mapping:
        strain_name = " ".join(as6_dir_strain_dict[as6_dir][0].split(" ")[:2])
        strain_name_mapping = ref_map.get(strain_name)
    if not strain_name_mapping:
        strain_name = " ".join(as6_dir_strain_dict[as6_dir][0].split(" ")[:3])
        strain_name_mapping = ref_map.get(strain_name)
    
    # when success
    if strain_name_mapping:
        added += 1
        assert ref_map.get(strain_name), "Something weird with strain_name"
        refseq_genbank = os.path.split(as6_dir)[1][:13]
        if not refseq_genbank in strain_name_mapping:
            strain_name_mapping.append(refseq_genbank)
#         print(ref_map.get(strain_name))
#         print("success", as6_dir, strain_name, refseq_genbank)
        pass
    else:
        fails += 1
        print("failure", as6_dir, strain_name, as6_dir_strain_dict[as6_dir])
print(added, fails)

20 0


### Add all the regions from the GCF folders into strain mappings
- When encountering a folder with GCF/GCA figure out that all the antismash files in that folder should be mapped to the strain

In [312]:
# get names for all AS dirs
as6_file_dict = {}
for as6_dir in as6_dirs:
    as6_files = [os.path.split(as6_file)[1] for as6_file in glob.glob(os.path.join(as6_dir, "*.gbk"))]
    # filter out the main file and turn into file names (without path)
    as6_files = [as6_file for as6_file in as6_files
                 if not as6_file.startswith("GCF_") and not as6_file.startswith("GCA_")]
    as6_names = []
    for as6_file in as6_files:
        as6_name = as6_file.split(".")[0]
        if as6_name not in as6_names:
            as6_names.append(as6_name)
    as6_file_dict[as6_dir] = as6_names
print("Number of AS dirs passed:", len(as6_file_dict), "Example:")
print(list(as6_file_dict.items())[0])

# add names to strain mapping
as6_dirs_done = []
for strain_name, vals in ref_map.items():
    str_done = False
    for val in vals:
        if str_done:
            # strain has encountered a GCF or GCA previously and is done
            break
        if val.startswith("GCF_") or val.startswith("GCA_"):
            acc = val
            as6_dir = [as6_dir for as6_dir in as6_file_dict if acc in as6_dir]
            if as6_dir:
                if len(as6_dir) > 1:
                    print("  multiple AS output for strain", strain_name, as6_dir)
                as6_dir = as6_dir[0]
                if as6_dir not in as6_dirs_done:
                    # lookup the as6_names to add to the strain mapping
                    as6_names = as6_file_dict[as6_dir]
                    for as6_name in as6_names:
                        if as6_name not in vals:
                            vals.append(as6_name)
                    as6_dirs_done.append(as6_dir)
                    str_done = True

failed_as6_dirs = [as6_dirs for as6_dirs in as6_file_dict if as6_dirs not in as6_dirs_done]
print("")
print(len(as6_dirs_done), "AS dirs processed and added to strain mapping.", len(failed_as6_dirs), "AS dirs failed:",
     failed_as6_dirs)

Number of AS dirs passed: 104 Example:
('/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/AS6_assemblies_MSV000078836_MSV000078839_MSV000079284/GCF_000375165.1_ASM37516v1_genomic', ['NZ_KB896530', 'NZ_KB896534', 'NZ_KB896538', 'NZ_KB896536', 'NZ_KB896532', 'NZ_KB896533', 'NZ_KB896540', 'NZ_KB896535', 'NZ_KB896539', 'NZ_KB896529', 'NZ_KB896557', 'NZ_KB896548', 'NZ_KB896572', 'NZ_KB896552', 'NZ_KB896562', 'NZ_KB896543', 'NZ_KB896537', 'NZ_KB896547', 'NZ_KB896546'])

104 AS dirs processed and added to strain mapping. 0 AS dirs failed: []


### Manually correct some spelling errors

In [317]:
# for S.TU6071_R5_B.mzXML change to capital letters in strain mapping
first_strain = "Streptomyces sp. Tu6071"
first_mapping = ref_map[first_strain]
corr_first_mapping = []
for mapping in first_mapping:
    if mapping == "S.Tu6071_R5_B.mzXML":
        mapping = "S.TU6071_R5_B.mzXML"
    if mapping == "S.Tu6071_R5_M.mzXML":
        mapping = "S.TU6071_R5_M.mzXML"
    if mapping == "S.Tu6071_R5_E.mzXML":
        mapping = "S.TU6071_R5_E.mzXML"
    corr_first_mapping.append(mapping)
ref_map[first_strain] = corr_first_mapping
print(ref_map[first_strain])

['S.Tu6071_A1_M.mzXML', 'S.Tu6071_MS_M.mzXML', 'S.Tu6071_MS_B.mzXML', 'S.Tu6071_MS_E.mzXML', 'S.TU6071_R5_B.mzXML', 'S.Tu6071_A1_E.mzXML', 'S.Tu6071_A1_B.mzXML', 'S.TU6071_R5_M.mzXML', 'S.TU6071_R5_E.mzXML', 'GCA_000213055', 'CM001165']


In [320]:
sec_strain = "Streptomyces sp.MG1"
sec_cor_strain = "Streptomyces sp. MG1"
sec_mapping = ref_map.get(sec_strain)
if sec_mapping:
    del ref_map[sec_strain]
    ref_map[sec_cor_strain] = sec_mapping
print(ref_map[sec_cor_strain], ref_map.get(sec_strain))

['S.MG1_A1_E.mzXML', 'NZ_CP011665', 'GCF_000412265', 'GCA_000412265', 'S.MG1_A1_M.mzXML', 'S.MG1_R5_M.mzXML', 'S.MG1_MS_B.mzXML', 'GCF_000412265.2', 'S.MG1_R5_B.mzXML', 'S.MG1_MS_M.mzXML', 'S.MG1_A1_B.mzXML', 'NZ_CP011664', 'S.MG1_R5_E.mzXML', 'NZ_CP011667', 'S.MG1_MS_E.mzXML', 'CP011665', 'CP011664', 'CP011667'] None


In [322]:
th_strain = "Salinispora pacifica CNT851"
th_mapping = ref_map.get(th_strain)
add_to_th_mapping = "ARHL01000000"
if th_mapping:
    if add_to_th_mapping not in th_mapping:
        print('added', add_to_th_mapping)
        th_mapping.append(add_to_th_mapping)
print(ref_map.get(th_strain))

['T851-3-Me.mzXML', 'T851-3-Bu.mzXML', 'T851-3-EA.mzXML', '35a.mzXML', '35b.mzXML', 'KB896839', 'ARHL01000000']


### Write final merged strain mappings

In [323]:
new_data_str_mappings = os.path.join(new_data_path, "strain_mappings.csv")
with open(new_data_str_mappings, 'w') as outf:
    for k_strain, v_mapping in sorted(ref_map.items()):
        outf.write("{},{}\n".format(k_strain, ','.join(sorted(v_mapping))))

### Actually sort out AS files

In [276]:
# loop through all strains and gather the RefSeq/GenBank to look up new AS, otherwise old antismash
# first look if in new antismash folder, then look in old antismash folder
# write all cp commands to bash script to be run in a screen (use e.a. subprocess if you want to do it here)
done = []  # list to keep track of already copied AS files
cp_commands = []
no_antismash = []

for strain, mappings in ref_map.items():
    as_maps = []
    gcfs = []
    for mapping in mappings:
        if not mapping.endswith(".mzXML"):
            if mapping.startswith("GCF") or mapping.startswith("GCA"):
                stripped = mapping.partition('.')[0]
                if stripped not in gcfs:
                    gcfs.append(stripped)
            else:
                as_maps.append(mapping)
    if gcfs:
        # copy AS6
        print(strain)
        if len(gcfs) > 1:
            double_acc = [1 for gcf in gcfs if not all([gcf_check.endswith(gcf[4:]) for gcf_check in gcfs])]
            if double_acc:
                print("refseqs len > 1 for", strain)
        gcf_acc = gcfs[0]
        gca_acc = "GCA" + gcf_acc[3:]
        acc_as6_dir_l = [as6_dir for as6_dir in as6_dirs if gcf_acc in as6_dir or gca_acc in as6_dir]
        if acc_as6_dir_l:
            acc_as6_dir = acc_as6_dir_l[0]
            # print(acc_as6_dir[0])
            # add to done, copy
            if not acc_as6_dir in done:
                cp_cmd = f"cp -r {acc_as6_dir} {antismash_dir}"
#                 print(' ', cp_cmd)
                cp_commands.append(cp_cmd)
                done.append(acc_as6_dir)
        else:
            print("missing for", gcf_acc, strain)
    else:
        # copy AS3
        print(strain, as_maps)
        as3_files_to_copy = None
        for as_map in as_maps:
            as3_files_to_copy = [as3_file for as3_file in as3_files if as_map in as3_file]

        if not as3_files_to_copy:
            print("  no antismash files")
            no_antismash.append(strain)
        else:
            for as3_file_to_copy in as3_files_to_copy:
                if not cp_as3_cmd in done:
                    cp_as3_cmd = f"cp {as3_file_to_copy} {antismash_dir}"
                    cp_commands.append(cp_as3_cmd)
    #                 print(' ', cp_as3_cmd)
                    done.append(as3_file_to_copy)

print(len(no_antismash), "strains do not have any antismash files:")
print(no_antismash)
print(len(cp_commands), "files/dirs will be copied")

Salinispora arenicola CNT005
Salinispora arenicola CNR107
Salinispora pacifica CNS801 ['34966']
Salinispora arenicola CNY011
Salinispora arenicola CNX508
Salinispora arenicola CNY237
Salinispora tropica CNS416 ['ARHQ01000000']
Salinispora arenicola CNH996 ['JNLS01000000']
Salinispora pacifica CNT138 ['KB911579']
  no antismash files
Salinispora pacifica CNT001
Salinispora arenicola CNY685 ['35845']
Salinispora arenicola CNY230 ['35133']
Salinispora arenicola CNQ884 ['34969']
Salinispora arenicola CNT849
Salinispora pacifica CNH732 ['37235']
Salinispora tropica CNH898
Salinispora pacifica CNY498 ['35607']
Salinispora pacifica CNT584
Salinispora pacifica CNY703 ['35561']
Salinispora pacifica CNS237
Salinispora pacifica CNT045
Salinispora pacifica CNY330
Salinispora arenicola CNR921
Salinispora arenicola CNH964 ['JAEY01000000']
Salinispora pacifica CNT403 ['34964']
Salinispora pacifica CNT133
Salinispora pacifica CNQ768
Salinispora arenicola CNS325 ['37242']
Salinispora arenicola CNS296 [

In [263]:
cp_script = os.path.join(new_data_path, "get_antismash_files.sh")
with open(cp_script, 'w') as outf:
    outf.write("#!/usr/bin/env bash\n")
    for cp_command in cp_commands:
        outf.write(f"{cp_command}\n")