This notebook is made to check the presence of CyCOGs within species trees of groups of cyanobacteria. The other notebook makes gene trees, this notebook makes labels to check the gene within premade species trees.

In [1]:
from matplotlib import pyplot as plt
import os
import matplotlib as mpl
import numpy as np
import pandas as pd
import copy
import csv
from collections import Counter

from Bio import Phylo


In [5]:
# Pick CyCOG to generate labels for
CYCOG = 60000046

In [2]:
# Set dependencies
# REFS = '../data/genomes/'
# CYCOGS = 'data/0/serralysin_cycog_references.csv'
# CYCOG_LIST = 'data/0/cycogs.tsv'
# GENOMES = 'data/0/cycogsgenomes.tsv'
# CLADE_MAP = 'data/0/updated-genome-clades.csv'
# SEQ = "faa"

# add these back later

# Prune genomic trees to only include branches in the Berube et al (2018) reference genomes

## Inputs: 
- Newick files of genome phylogenies (from Paul)
- Names of genomes in CyCOGv6
- Clades of genomes in CyCOGv6

## Outputs:
- Pruned newick files of genome phylogenies
- Label files for visualizing pruned phylogenies


In [2]:
# input files

GENOMES = 'data/0/cycogsgenomes.tsv'
CLADE_MAP = 'data/0/updated-genome-clades.csv'

In [3]:
# read in data

genomes_df = pd.read_csv(GENOMES, sep='\t')
clade_df = pd.read_csv(CLADE_MAP)
# merge dataframes matching IMG_ID with IMGGENOMEID
label_df = pd.merge(genomes_df, clade_df, left_on='IMG_ID', right_on='IMGGenomeID', how='left') 

label_df

Unnamed: 0,IID,GROUP,IMG_ID,TYPE,JGI_GENOMEPORTAL_NAME,Completeness,IMGGenomeID,UpdatedIMGGenomeID,Clade
0,AG-311-D23,Prochlorococcus,2716884681,SAG,Uncultured_Prochlorococcus_sp._AG-311-D23,72.96,2.716885e+09,2.716885e+09,LLI
1,AG-311-I02,Prochlorococcus,2716884682,SAG,Uncultured_Prochlorococcus_sp._AG-311-I02,11.16,2.716885e+09,2.716885e+09,LLI
2,AG-311-I09,Prochlorococcus,2716884683,SAG,Uncultured_Prochlorococcus_sp._AG-311-I09,47.41,2.716885e+09,2.716885e+09,HLI
3,AG-311-J05,Prochlorococcus,2716884684,SAG,Uncultured_Prochlorococcus_sp._AG-311-J05,34.64,2.716885e+09,2.716885e+09,HLI
4,AG-311-J23,Prochlorococcus,2716884685,SAG,Uncultured_Prochlorococcus_sp._AG-311-J23,77.90,2.716885e+09,2.716885e+09,LLI
...,...,...,...,...,...,...,...,...,...
765,S-ShM2,Virus,651703106,ISOLATE,Synechococcus_phage_S-ShM2,0.00,,,
766,Syn19,Virus,651703107,ISOLATE,Synechococcus_phage_Syn19,0.00,,,
767,Syn5,Virus,641201056,ISOLATE,Synechococcus_phage_Syn5,0.00,,,
768,metaG-MbCM1,Virus,2595698410,ISOLATE,Synechococcus_phage_metaG-MbCM1,0.00,,,


In [5]:
# read in the genome phylogenies

hl_tree = Phylo.read('data/0/phylogeny/trees/HLprochlorococcus_5PERCto424cycogs_50percgaps_100boot_barnacle.nwk', 'newick')
ll_tree = Phylo.read('data/0/phylogeny/trees/LLprochlorococcus_5PERCto424cycogs_50percgaps_100boot_barnacle.nwk', 'newick')
syn_tree = Phylo.read('data/0/phylogeny/trees/synechococcus_5PERCto424cycogs_50percgaps_100boot_barnacle.nwk', 'newick')


In [6]:
# identify leaves in tree that are in CyCOGv6

Phylo.draw_ascii(syn_tree)

 , WH6501
 |
 | MITS9220
 |
 , AG-676-A21
 |
 , AG-673-D02
 |
 | AG-676-E23
 |
 , GEYO
 |
 , MIT9508
 |
 | , AG-686-D09
 |,|
 ||, BIOS-U3-1
 |||
 ||| AG-679-A04
 ||
 ||, UW179A
 |,|
 ,||, BIOS-E4-1
_||||
 || , MIT9504
 || |
 || | MIT9509
 ||
 ||   , MITS9507
 || __|
 |||  , MITS9501
 |||  |
 |||  | MITS9510
 |||
 ||| __ MITS9503
 ||||
 ||||        _ NOUM97013
 ||||    ___|
 ||||   |   | , A15-60
 ||||   |   |_|
 | ||   |     | A18-25c
 | ||   |
 | ||   |         , WH7803
 | ||   |        _|
 | ||   |       | | BMK-MC-1
 | ||   |  _____|
 | ||   | |     | __ WH7805
 | ||   | |     ||
 | ||   | |      |_ MEDNS5
 | ||   | |      |
 | ||   | |      | PROS-7-1
 |  |   | |
 |  |   | |              , AG-679-C18
 |  |   | |           ___|
 |  |   | |          |   | AG-323-G20
 |  |   | |          |
 |  |   | |          |   , ROS8604
 |  |   | |          |  ,|
 |  |   | |    ______|  |, WH8016
 |  |   | |   |      |  ||
 |  |   | |   |      |  || UW179B
 |  |   | |   |      |  |
 |  |   | |   |

In [7]:
# prune leaves from tree that aren't in CyCOGv6
# Phylo submodule of biopython: https://biopython.org/wiki/Phylo
# Tree.prune() method: https://biopython.org/docs/dev/api/Bio.Phylo.BaseTree.html#Bio.Phylo.BaseTree.TreeMixin.prune
# tutorial that mentions prune(): https://biopython-tutorial.readthedocs.io/en/latest/notebooks/13%20-%20Phylogenetics%20with%20Bio.Phylo.html

# repeat this for each of the three trees

tree_copy = copy.deepcopy(syn_tree)
for leaf in tree_copy.get_terminals():
    if leaf.name in label_df['IID'].tolist():
        continue
    else:
        tree_copy.prune(leaf.name)

Phylo.draw_ascii(tree_copy)

# save the pruned tree copy

 , AG-676-A21
 |
 , AG-673-D02
 |
 | AG-676-E23
 |
 , MIT9508
 |
 |  _ AG-686-D09
 |,|
 ||| AG-679-A04
 ||
 || , MIT9504
 ||_|
 || | MIT9509
 ||
 ||              _ WH7803
 ||         ____|
 ,|        |    |___ WH7805
 ||        |
_||        |              , AG-679-C18
 ||        |           ___|
 ||        |          |   | AG-323-G20
 ||        |          |
 ||        |   _______|  _ WH8016
 ||        |  |       | |
 ||        |  |       | |, AG-686-A03
 ||        |  |       |_,|
 ||        |  |         ||, AG-683-A02
 ||        |  |         |||
 ||        |  |         | | AG-686-A05
 ||        |  |         |
 ||        |  |         | , AG-683-A03
 ||        |  |         |,|
 ||        |  |         ||| AG-686-F08
 ||        |  |         ||
 ||        |  |          , CC9311
 ||        |  |          |
 ||        |  |          |_ WH8020
 ||        |  |
 ||        |  |             ______ KORDI-49
 ||        |  |            |
 ||________|  |            |           , AG-676-F02
 |         | 

# Generate label file for clades

In [11]:
# make lists of clades in each tree

genome_lists = {
    'HL': [], 'LL':[], 'Syn': []
}
for group in ['HL', 'LL', 'Syn']:
    # pull out appropriate tree
    tree = {'HL': hl_tree, 'LL':ll_tree, 'Syn': syn_tree}[group]
    # get list of genome names (leaves)
    genomes = []
    for leaf in tree.get_terminals():
        genomes.append(leaf.name)
    genome_lists[group] = genomes

genome_lists['HL']


['AG-347-I04',
 'AG-347-K19',
 'AG-355-P16',
 'AG-347-E23',
 'AG-422-O15',
 'AG-424-P16',
 'AG-347-L21',
 'AG-907-A17',
 'AG-461-P02',
 'AG-349-D02',
 'AG-470-J19',
 'AG-349-G23',
 'AG-893-P13',
 'AG-422-G21',
 'AG-903-M06',
 'AG-347-K16',
 'AG-469-M13',
 'AG-349-C08',
 'AG-453-M20',
 'AG-355-P15',
 'MIT9301',
 'AG-347-I06',
 'AG-355-J21',
 'AG-422-C16',
 'AG-900-D17',
 'AG-895-I05',
 'AG-347-G22',
 'AG-349-G15',
 'AG-891-O02',
 'AG-893-G11',
 'AG-360-L21',
 'AG-919-J19',
 'AG-912-G18',
 'AG-909-M08',
 'AG-359-F16',
 'AS9601',
 'AG-347-K23',
 'AG-459-D04',
 'AG-895-O22',
 'AG-426-L07',
 'AG-347-L19',
 'AG-455-E15',
 'AG-430-M18',
 'AG-457-D05',
 'AG-355-I04',
 'AG-909-E14',
 'AG-347-J05',
 'AG-453-O09',
 'AG-894-P13',
 'AG-900-C22',
 'AG-895-M05',
 'AG-355-K13',
 'AG-916-I03',
 'scB245a_518I6',
 'AG-898-I07',
 'AG-439-O21',
 'AG-895-C15',
 'AG-347-L17',
 'AG-355-G23',
 'AG-911-F21',
 'AG-901-B16',
 'AG-915-N17',
 'AG-895-D05',
 'AG-915-K06',
 'AG-900-L19',
 'scB241_526B19',
 'AG-914-D0

In [16]:
label_df[label_df.IID.isin(genome_lists['Syn'])].Clade.unique()

array(['5.1B-I', '5.3', '5.1A-II', '5.1A-III', '5.1A-UC-A-EnvC',
       '5.1B-CRD1', '5.1A-CRD2', '5.1A-unclassified', '5.1A-IV', '5.2',
       '5.1A-WPC1', 'LLIV', '5.1B-IX', '5.1B-V', '5.1B-VI', '5.1B-VIII'],
      dtype=object)

# Generate label file for CyCOG of interest

Obsolete code below, delete when finished with phylo code

In [52]:
mapping_df = label_df[['gene_id', 'IMG_ID', 'genome_name', 'Clade']].rename(columns={'IMG_ID': 'genome_id', 'Clade': 'clade'})
mapping_df['leaf_id'] = mapping_df['gene_id']
mapping_df = mapping_df[['leaf_id', 'gene_id', 'genome_id', 'genome_name', 'clade']]

mapping_df

Unnamed: 0,leaf_id,gene_id,genome_id,genome_name,clade
0,2507493065,2507493065,2507262052,WH8016,5.1B-I
1,2717726087,2717726087,2716884782,AG-442-B03,HLII
2,2653032043,2653032043,2651869831,scB245a_521K15,HLII
3,638311613,638311613,638275926,P-SSM2,
4,2667747375,2667747375,2667527274,AG-402-M23,LLI
...,...,...,...,...,...
941,2667895488,2667895488,2667527369,AG-363-I21,LLII.LLIII
942,2717711922,2717711922,2716884770,AG-418-G18,HLII
943,2667707121,2667707121,2667527251,AG-347-M18,HLII
944,2717319812,2717319812,2716884407,AG-418-L19,HLII


In [7]:
clade_lgnd = {
    "5.2": "#003D30", "5.3": "#00E5F8", "5.1A-CRD2": "#5A0A33", "5.1A-II": "#005745", "5.1A-III": "#810D49", "5.1A-IV": "#00735C",
    "5.1A-UC-A-EnvC": "#AB0D61", "5.1A-unclassified": "#009175", "5.1A-WPC1": "#D80D7B", "5.1B-CRD1": "#00AF8E", "5.1B-I": "#FF2E95", 
    "5.1B-IX": "#00CBA7", "5.1B-V": "#FF78AD", "5.1B-VI": "#00EBC1", "5.1B-VIII": "#FFACC6", "HLI": "#86FFDE", "HLII": "#FFD7E1",
    "HLII.HLVI": "#00306F", "HLIII": "#460B70", "HLIII.HLIV.HLV": "#00489E", "HLIV": "#6B069F", "HLVI": "#005FCC", "LLI": "#8E06CD",
    "LLI.LLVIII": "#0079FA", "LLI.LLIII": "#B40AFC", "LLIV": "#009FFA", "LLVII": "#ED0DFD", "LLVIII": "#00C2F9", "Blank": "#FF66FD",
    "": "#FF66FD", "LLII.LLIII": "#CDE494", "Unclassified": "#FF66FD", "AMZ-II": "#AA8F66"
}

In [9]:

clade_count = (Counter(label_df['Clade']))

In [10]:
clade_count

Counter({'HLII': 227,
         'HLI': 114,
         'LLI': 106,
         nan: 89,
         'LLVII': 58,
         'LLII.LLIII': 36,
         'LLIV': 26,
         '5.1B-CRD1': 18,
         'HLVI': 11,
         '5.1B-I': 10,
         '5.1A-IV': 9,
         '5.1A-II': 8,
         '5.1A-CRD2': 8,
         'AMZ-II': 7,
         'HLIII': 6,
         '5.2': 5,
         'HLIV': 5,
         '5.3': 4,
         '5.1A-UC-A-EnvC': 4,
         'HLII.HLVI': 3,
         '5.1A-III': 3,
         '5.1A-unclassified': 2,
         'LLVIII': 2,
         'LLI.LLVIII': 2,
         'Unclassified': 1,
         'HLIII.HLIV.HLV': 1,
         '5.1A-WPC1': 1,
         '5.1B-IX': 1,
         '5.1B-V': 1,
         '5.1B-VI': 1,
         '5.1B-VIII': 1})

In [21]:
filtered_clade_count = Counter({key: value for key, value in clade_count.items() if value >= 4})

print(filtered_clade_count)

clade_keys_list = list(filtered_clade_count.keys())

print(clade_keys_list)

Counter({'LLI': 416, 'HLII': 69, 'HLI': 5, '5.1B-CRD1': 4})
['LLI', 'HLII', 'HLI', '5.1B-CRD1']


In [None]:
if clade.startswith(("L", "A")):
# if clade.startswith(("H")):
# if clade[0].isdigit():
    

In [17]:
output_matches = "cyano_clades.tsv"
template_file = "clade_template_fin.txt"

# Open files to write data
with open("ones.txt", "w") as f, open("clades.txt", "w") as g, open("labels.txt", "w") as h:
    f.write('LEGEND_SHAPES\t')
    for key, value in clade_lgnd.items():
        if key.startswith(("L", "A")):
            f.write('1\t')
            g.write(f'{key}\t')
            h.write(f'{value}\t')

# Append additional legend information to ones.txt
with open("ones.txt", "a") as f, open("clades.txt", "r") as g, open("labels.txt", "r") as h:
    f.write(f'\nLEGEND_LABELS\t{g.read()}')
    f.write(f'\nLEGEND_COLORS\t{h.read()}')

# Clean up temporary files
os.remove('clades.txt')
os.remove('labels.txt')

# Read contents of the template files
with open('clade-template1.txt', 'r') as clade1:
    content1 = clade1.read()
with open('ones.txt', 'r') as clade2:
    content2 = clade2.read()
with open('clade-template3.txt', 'r') as clade3:
    content3 = clade3.read()

# Concatenate the contents with newline characters
cladestogether = content1 + "\n" + content2 + '\n' + content3

# Write the combined content to a new file
with open('clade_template_fin.txt', 'w') as template_clade:
    template_clade.write(cladestogether)

In [13]:
with open(output_matches, 'w', newline='') as outfile:  # Use 'w' mode to create a new file
    writer = csv.DictWriter(outfile, fieldnames=['ID', 'LABEL'], delimiter='\t')
    writer.writeheader()  # Write the header once at the beginning

    for index, row in label_df.iterrows():
        leaf_id = row['IID']
        clade = row['Clade']
        clade = str(clade)
        if clade.startswith(("L", "A")):
            value = clade_lgnd.get(clade)  # Use default color if clade not found
        
            writer.writerow({'ID': leaf_id, 'LABEL': value})

print(output_matches, "generated successfully.")

60000046-clade.tsv generated successfully.


In [18]:
file_list = ['clade-template1.txt', 'ones.txt', 'clade-template3.txt', 'cyano_clades.tsv']
output_file = os.path.join('data/0/', 'LL_clades.txt')
with open(output_file, 'w') as outfile:
    for fname in file_list:
        with open(fname, 'r') as infile:
            outfile.write(infile.read())
            outfile.write('\n')
os.remove('ones.txt')
os.remove(output_matches)
# os.remove(cycog_check)

In [16]:
# Join previously made tsv with template file to complete annotation file for iTOL
# Read the content of the output file
# output_template_file = os.path.join(f'data/{CYCOG}/', f"{CYCOG}-gene_clades.txt")
output_template_file = os.path.join(f'data/0/', "LL_clades.txt")

with open(output_matches, 'r') as output_f:
    output_content = output_f.read()

# Read the content of the template file and append the output content
with open(template_file, 'r') as template_f:
    template_content = template_f.read()
    # Add a newline character between the template and output content
    combined_content = template_content + "\n" + output_content

# Write the combined content to a new file
with open(output_template_file, 'w') as combined_f:
    combined_f.write(combined_content)


os.remove('ones.txt')
# os.remove('clade_template_fin.txt')

print(CYCOG, "annotation file generated successfully:", output_template_file, '\n')
os.remove(output_matches)
print ("Old outputs removed to clean directory. To run this block again generate a new output_matches file.") 
# output_matches file used to produce this block's output is removed for directory cleaning.

FileNotFoundError: [Errno 2] No such file or directory: 'clade_template_fin.txt'