# Important links
* https://github.com/ModelSEED/MicrobiomeNotebooks.git
* Workspace with Cliff's MAGs: https://narrative.kbase.us/narrative/186678 (https://narrative.kbase.us/narrative/155805)
* The pangenome-enhanced MAGs, annotated with RAST, GLM4EC, and DRAM, are here: https://narrative.kbase.us/narrative/187040
# * Workspace with additional redundant MAGs: https://narrative.kbase.us/narrative/163264
* FAA of representative sequences: /scratch/fliu/data/cliff/mmseqs_ani_prob_rep_genome.faa  
* Cluster genome mapping: /scratch/fliu/data/cliff/mmseqs_ani_prob_v2.json  |  /scratch1/fliu/hub_scratch/fliu/data/cliff/mmseqs_ani_prob_v2.json

In [None]:
# KBase narrative/workspace numbers: the source MAGs and the pangenome-enhanced MAGs.
mag_ws, pangenome_mag_ws = 186678, 187040

# Local mmseqs outputs; both files sit in the same scratch directory.
cliff_data_dir = "/scratch/fliu/data/cliff"
# JSON mapping that the later cells load as `hit_data`.
prob_mapping = cliff_data_dir + "/mmseqs_ani_prob_v2.json"
# FASTA of protein sequences that the later cells parse into `protein_hash`.
prob_rep_mapping = cliff_data_dir + "/mmseqs_ani_prob_rep_genome_v2.faa"

# Loading hit data from JSON file

In [None]:
%run cliffcommutil.py
with open(prob_mapping, 'r') as f:
    hit_data = json.load(f)

# Parsing hits for close genome list and establishing thresholds

In [17]:
%run cliffcommutil.py
mag_ani_data = {}
mag_thresholds = {}
mag_hit_distribution = {}
for protein in hit_data:
    for mag in hit_data[protein]:
        if mag not in mag_ani_data:
            mag_ani_data[mag] = {}
            mag_hit_distribution[mag] = [0] * 21
        for hit in hit_data[protein][mag]:
            genome = hit.split(":")[0]
            if hit_data[protein][mag][hit] < 100:
                if genome not in mag_ani_data[mag]:
                    mag_hit_distribution[mag][int(hit_data[protein][mag][hit]-80)] += 1
                mag_ani_data[mag][genome] = hit_data[protein][mag][hit]
util.save("mag_ani_data",mag_ani_data)
util.save("mag_hit_distribution",mag_hit_distribution)
for mag in mag_hit_distribution:
    if mag not in mag_thresholds:
        mag_thresholds[mag] = {"threshold":0,"threshold_count":0}
    threshold = 0
    threshold_count = 0
    for i in range(21):
        index = 20-i  
        if mag_hit_distribution[mag][index] > 0 and (threshold == 0 or threshold == 80+index+1):
            threshold_count += mag_hit_distribution[mag][index]
            threshold = 80+index
    mag_thresholds[mag]["threshold"] = threshold
    mag_thresholds[mag]["threshold_count"] = threshold_count
util.save("mag_thresholds",mag_thresholds)

# Identifying supplemental proteins for MAGs based on thresholds and hits

In [20]:
%run cliffcommutil.py
mag_protein_supplements = {}
mag_thresholds = util.load("mag_thresholds")
for protein in hit_data:
    for mag in hit_data[protein]:
        best_hit = None
        for hit in hit_data[protein][mag]:
            (genome,gene) = hit.split(":")
            if genome == "self":
                best_hit = None
                continue
            elif best_hit is None or hit_data[protein][mag][hit] > best_hit[2]:
                best_hit = (genome,gene,hit_data[protein][mag][hit])
        if best_hit is not None and best_hit[2] >= mag_thresholds[mag]["threshold"]:
            if mag not in mag_protein_supplements:
                mag_protein_supplements[mag] = {}
            mag_protein_supplements[mag][protein] = best_hit    
util.save("mag_protein_supplements",mag_protein_supplements)

# Getting workspace IDs for MAGs

In [None]:
%run cliffcommutil.py
mag_wsids = {}
mags = util.msrecon.kbase_api.list_objects(mag_ws, object_type="KBaseGenomes.Genome", include_metadata=False)
for item in mags:
    mag_wsids[item[1]] = item[7]
util.save("mag_wsids",mag_wsids)
util.save("mag_list",mags)

# Loading protein sequence data

In [None]:
%run cliffcommutil.py
from Bio import SeqIO
protein_hash = {}
for record in SeqIO.parse(prob_rep_mapping, 'fasta'):
    protein_hash[record.id] = record.seq

# Building genomes and assemblies

In [None]:
%run cliffcommutil.py
mag_list = util.load("mag_list")
mag_protein_supplements = util.load("mag_protein_supplements")
for item in mag_list:
    genome_obj = util.get_object()
    firstgene = genome_obj["features"][0]
    #print assembly file
    #add additional proteins to genome and assembly
    if item[1] in mag_protein_supplements:
        count = 1
        for protein in mag_protein_supplements[item[1]]:
            ftrid = 'pangenome_'+str(count)
            count += 1
            protseq = protein_hash[protein]
            #TODO why scrub the dictionary through MD5?
            dnaseq = util.translate_protein_to_gene(protseq)
            result = hashlib.md5(protseq.encode())
            md5 = result.hexdigest()
            result = hashlib.md5(dnaseq.encode())
            dnamd5 = result.hexdigest()
            newftr = {
                "aliases": [protein],
                "cdss": [
                    ftrid+".CDS"
                ],
                "functions":["Hypothetical protein"],
                "dna_sequence": dnaseq,
                "dna_sequence_length": 3*len(protseq),
                "id": ftrid,
                "location": [
                    [
                        firstgene["location"][0][0],
                        1,
                        "+",
                        3*len(protseq)
                    ]
                ],
                "md5": dnamd5,
                "ontology_terms": {},
                "protein_md5": md5,
                "protein_translation": protseq,
                "protein_translation_length": len(protseq),
                "warnings": []
            }
            cdsftr = newftr.copy()
            del cdsftr["cdss"]
            cdsftr["id"] = ftrid+".CDS"
            cdsftr["parent_gene"] = ftrid
            genome_obj["features"].append(newftr)
            genome_obj["cdss"].append(cdsftr)
    #Saving MAG
    util.save(item[1],genome_obj)

# Loading genomes to KBase

In [None]:
#Loading clade genome assemblies to KBase using SDK callbacks
import shutil
util.sdk_dir_perms()
afu = util.afu_client()
asvset_genomes = util.load("asvset_genomes")
asvset_names = util.load("asvset_names")
# Each FASTA is copied into the first directory and then referenced through the
# second path in the upload request.
# NOTE(review): presumably both paths are the same directory, seen from the
# host and from inside the SDK container - confirm.
staging_dir = '/scratch/shared/sdkmount/kb_sdk_home/run_local/workdir/tmp/'
module_tmp_dir = '/kb/module/work/tmp/'
for asvset in asvset_genomes:
    asvname = asvset_names[asvset]
    fasta_file = asvname + ".fasta"
    shutil.copy2("Assemblies/" + fasta_file, staging_dir + fasta_file)
    # One assembly object is saved per entry, named "<asvname>.assembly".
    result = afu.save_assembly_from_fasta2({
        'file': {'path': module_tmp_dir + fasta_file},
        'workspace_id': 181152,
        'assembly_name': asvname + ".assembly",
    })

# Building models