# Important links
* https://github.com/ModelSEED/MicrobiomeNotebooks.git
* Workspace with Cliff's MAGs: https://narrative.kbase.us/narrative/155805
* Workspace with additional redundant MAGs: https://narrative.kbase.us/narrative/163264
* FAA of representative sequences: /scratch/fliu/data/cliff/mmseqs_ani_prob_rep_genome.faa  
* Cluster genome mapping: /scratch/fliu/data/cliff/mmseqs_ani_prob.json  

# Loading hit data from json file

In [None]:
%run cliffcommutil.py
# Parse the mmseqs hit table from disk into `hit_data`. The JSON is loaded
# as-is, so `hit_data` has exactly the structure stored in the file.
hit_data_path = "/scratch/fliu/data/cliff/mmseqs_ani_prob.json"
with open(hit_data_path, mode="r") as hit_file:
    hit_data = json.load(hit_file)

# Parsing hits for close genome list and establishing thresholds

In [17]:
%run cliffcommutil.py
# Summarise the hits of every MAG and derive a per-MAG ANI threshold.
#
# `hit_data` is indexed as hit_data[protein][mag][hit] -> numeric score, where
# `hit` is a "genome:..." string. For each MAG this cell produces:
#   mag_ani_data[mag][genome]   last score (< 100) seen for that genome
#   mag_hit_distribution[mag]   histogram of scores in 1-wide bins; bin i
#                               holds scores in [80 + i, 80 + i + 1). A genome
#                               is counted once, at the first score seen.
#   mag_thresholds[mag]         "threshold": lowest bin value of the unbroken
#                               run of non-empty bins that starts at the
#                               highest non-empty bin; "threshold_count": number
#                               of hits inside that run. Both are 0 when the
#                               histogram is empty.
ANI_FLOOR = 80       # score that maps to histogram bin 0
ANI_BIN_COUNT = 21   # bins 0..20 <-> scores 80..100

mag_ani_data = {}
mag_thresholds = {}
mag_hit_distribution = {}
for hits_by_mag in hit_data.values():
    for mag, hits in hits_by_mag.items():
        if mag not in mag_ani_data:
            mag_ani_data[mag] = {}
            mag_hit_distribution[mag] = [0] * ANI_BIN_COUNT
        for hit, ani in hits.items():
            genome = hit.split(":")[0]
            if ani < 100:
                # Only bin scores at or above the floor. A lower score would
                # give a negative (or truncated-to-0) index, and a negative
                # list index silently wraps around into the high-score bins,
                # which would corrupt the threshold computed below.
                if genome not in mag_ani_data[mag] and ani >= ANI_FLOOR:
                    mag_hit_distribution[mag][int(ani - ANI_FLOOR)] += 1
                mag_ani_data[mag][genome] = ani
util.save("mag_ani_data", mag_ani_data)
util.save("mag_hit_distribution", mag_hit_distribution)

for mag, distribution in mag_hit_distribution.items():
    threshold = 0
    threshold_count = 0
    # Walk from the highest bin downwards and absorb the first unbroken run of
    # non-empty bins; the first empty bin after the run has started ends it.
    for index in range(ANI_BIN_COUNT - 1, -1, -1):
        if distribution[index] > 0:
            threshold_count += distribution[index]
            threshold = ANI_FLOOR + index
        elif threshold:
            break
    mag_thresholds[mag] = {
        "threshold": threshold,
        "threshold_count": threshold_count,
    }
util.save("mag_thresholds", mag_thresholds)

# Identifying supplemental proteins for MAGs based on thresholds and hits

In [20]:
%run cliffcommutil.py
# For every (protein, MAG) pair pick the best-scoring hit from another genome
# and keep it as a supplemental protein for that MAG when its score reaches the
# MAG's threshold. Result layout:
#   mag_protein_supplements[mag][protein] = (genome, gene, score)
mag_protein_supplements = {}
mag_thresholds = util.load("mag_thresholds")
for protein, hits_by_mag in hit_data.items():
    for mag, hits in hits_by_mag.items():
        best_hit = None
        for hit, score in hits.items():
            # partition() splits on the first ":" only, so a gene id that
            # itself contains ":" no longer breaks the unpacking.
            genome, _, gene = hit.partition(":")
            if genome == "self":
                # A "self" hit disqualifies this protein for this MAG. Stop
                # scanning: merely resetting best_hit and continuing would let
                # any hit iterated after "self" repopulate it, making the
                # outcome depend on dict ordering.
                best_hit = None
                break
            if best_hit is None or score > best_hit[2]:
                best_hit = (genome, gene, score)
        if best_hit is not None and best_hit[2] >= mag_thresholds[mag]["threshold"]:
            mag_protein_supplements.setdefault(mag, {})[protein] = best_hit
util.save("mag_protein_supplements", mag_protein_supplements)

# Getting workspace IDs for MAGs

In [None]:
%run cliffcommutil.py
# Map every genome object found in the two workspaces to element 7 of its
# listing entry, keyed by element 1 of that entry. `mag_list` keeps the raw
# listing of the first workspace (155805) only; entries from the second
# workspace (163264) overwrite same-keyed entries from the first.
# NOTE(review): in the KBase Workspace object_info tuple, index 6 is the numeric
# workspace id and index 7 the workspace name -- confirm that list_objects
# returns that layout and that index 7 is what `mag_wsids` should hold.
mag_wsids = {}
mag_list = util.msrecon.kbase_api.list_objects(
    155805, object_type="KBaseGenomes.Genome", include_metadata=False
)
mag_wsids.update({info[1]: info[7] for info in mag_list})
mags = util.msrecon.kbase_api.list_objects(
    163264, object_type="KBaseGenomes.Genome", include_metadata=False
)
mag_wsids.update({info[1]: info[7] for info in mags})
util.save("mag_wsids", mag_wsids)
util.save("mag_list", mag_list)

# Loading protein sequence data

In [None]:
%run cliffcommutil.py
from Bio import SeqIO

# Index every record of the representative-sequence FASTA by its record id;
# the stored value is the record's sequence object (not a plain string).
protein_hash = {
    fasta_record.id: fasta_record.seq
    for fasta_record in SeqIO.parse(
        '/scratch/fliu/data/cliff/mmseqs_ani_prob_rep_genome.faa', 'fasta'
    )
}

# Building genomes and assemblies

In [None]:
%run cliffcommutil.py
# Unfinished cell: for each entry of the saved `mag_list`, obtain a genome
# object, append one feature per supplemental protein recorded for that MAG,
# and save the result under the MAG's name (item[1]).
# Reads `protein_hash`, which is built by the "Loading protein sequence data"
# cell and must already be in the notebook namespace.
mag_list = util.load("mag_list")
mag_protein_supplements = util.load("mag_protein_supplements")
for item in mag_list:
    # NOTE(review): get_object() is called with no arguments, so nothing in
    # this call identifies the genome belonging to `item` -- confirm the
    # intended arguments.
    genome_obj = util.get_object()
    #print assembly file
    #add additional proteins to genome and assembly
    # item[1] is the key under which supplements were stored for this MAG.
    if item[1] in mag_protein_supplements:
        # Running counter used to mint ids "pangenome_1", "pangenome_2", ...
        count = 1
        for protein in mag_protein_supplements[item[1]]:
            ftrid = 'pangenome_'+str(count)
            count += 1
            # NOTE(review): `ftr`, `dnamd5` and `md5` are not assigned in any
            # visible notebook cell; unless cliffcommutil.py defines them,
            # evaluating this dict raises NameError. `newftr` is also never
            # used after it is built -- the feature appended further down is a
            # separate dict with a different set of keys.
            newftr = {
                "aliases": [],
                "cdss": [
                    ftrid+".CDS"
                ],
                "functions":["unannotated"],
                "dna_sequence": ftr["dna_sequence"],
                "dna_sequence_length": len(ftr["dna_sequence"]),
                "id": ftrid,
                "location": [
                    [
                        ftrid+".contig",
                        1,
                        "+",
                        len(ftr["dna_sequence"])
                    ]
                ],
                "md5": dnamd5,
                "ontology_terms": {},
                "protein_md5": md5,
                "protein_translation": ftr["protein_translation"],
                "protein_translation_length": len(ftr["protein_translation"]),
                "warnings": []
            }
            
            
            
            
            # Each supplement entry is indexed as [0] = source genome and
            # [1] = gene id; the gene id is the lookup key into protein_hash.
            # NOTE(review): "location": [[0,1]] looks like a placeholder --
            # confirm.
            genome_obj["data"]["features"].append({
                "id": protein+".supplement",
                "type": "CDS",
                "location": [[0,1]],
                "qualifiers": {
                    "translation": str(protein_hash[mag_protein_supplements[item[1]][protein][1]]),
                    "note": "Supplemental protein from "+mag_protein_supplements[item[1]][protein][0]
                }
            })
            #printing the supplemental protein in the fasta file
    #Saving assembly and MAG
    # Only the genome object is saved here; no assembly/FASTA output is
    # written by the code in this cell yet.
    util.save(item[1],genome_obj)

# Loading assemblies to KBase

32


# Loading genomes to KBase

# Building models