## Generate annotation file tagging genome name with presence of specified CyCOG

This notebook checks for the presence of a CyCOG within the species trees of groups of cyanobacteria. The companion notebook builds gene trees; this notebook builds label files for marking the gene on pre-made species trees.

In [2]:
from matplotlib import pyplot as plt
import os
import matplotlib as mpl
import numpy as np
import pandas as pd
import copy
import csv
from collections import Counter
from Bio import Phylo


In [2]:
# Set dependencies: input paths and settings used by the cells below.
# NOTE(review): REFS, CYCOGS and SEQ are not referenced by any cell visible in
# this notebook -- confirm whether they are still needed.
REFS = '../data/genomes/'
CYCOGS = 'data/0/serralysin_cycog_references.csv'
# Tab-separated CyCOG table; its 'cycog_iid' and 'cycog_genes' columns are read below.
CYCOG_LIST = 'data/0/cycogs.tsv'
# Tab-separated genome table; joined on its 'IID' column below.
GENOMES = 'data/0/cycogsgenomes.tsv'
# Comma-separated genome-to-clade map; joined on its 'IMGGenomeID' column below.
CLADE_MAP = 'data/0/updated-genome-clades.csv'
SEQ = "faa"


In [5]:
# Pick CyCOG to generate labels for.
# This integer is compared with the number that follows the underscore in each
# 'cycog_iid' value, and it is embedded in the names of the files written below.
CYCOG = 60000046

In [None]:
# 1 & 2: extract protein IDs and associated genome names for all proteins in the CyCOG
cycog_df = pd.read_csv(CYCOG_LIST, sep="\t")

# Each 'cycog_iid' carries its numeric identifier after an underscore; compare that number with CYCOG.
cycog_numbers = cycog_df['cycog_iid'].str.split('_').str[1].astype(int)
matching_rows = cycog_df.loc[cycog_numbers == CYCOG]
if matching_rows.empty:
    # Without this guard a missing CyCOG would surface later as a NameError on
    # protein_list (or silently reuse a stale protein_list from an earlier run).
    raise ValueError(f"CyCOG {CYCOG} was not found in {CYCOG_LIST}")

# 'cycog_genes' is a comma-separated list of protein IDs. If several rows match, the last one is used.
protein_list = matching_rows['cycog_genes'].iloc[-1].split(',')
label_df = pd.DataFrame(protein_list, columns=['protein_id'])  # one row per protein ID

# Split each protein ID at its LAST underscore into genome name and gene ID.
# rsplit documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.rsplit.html#pandas.Series.str.rsplit
label_df[['genome_name', 'gene_id']] = label_df['protein_id'].str.rsplit(pat='_', n=1, expand=True)

# 3: join genome id on genome name
# Left join: every protein row is kept even when its genome_name has no matching IID.
genomes_df = pd.read_csv(GENOMES, sep='\t')
label_df = pd.merge(label_df, genomes_df, left_on='genome_name', right_on='IID', how='left')

# 4: join clade on genome id
# Left join again: rows without a matching IMGGenomeID get missing clade columns.
clade_df = pd.read_csv(CLADE_MAP)
label_df = pd.merge(label_df, clade_df, left_on='IMG_ID', right_on='IMGGenomeID', how='left')

label_df

In [None]:
# Write a tab-separated ID/LABEL table: one row per entry of label_df['genome_name'],
# each carrying the same label '1'.
cycog_check = f"{CYCOG}-check.tsv"
with open(cycog_check, 'w', newline='') as tsv_handle:  # 'w' starts a fresh file on every run
    tsv_writer = csv.DictWriter(tsv_handle, fieldnames=['ID', 'LABEL'], delimiter='\t')
    tsv_writer.writeheader()
    tsv_writer.writerows(
        {'ID': genome_name, 'LABEL': '1'} for genome_name in label_df['genome_name']
    )

In [None]:
# Write the two one-line snippets (dataset label and legend title) that are
# spliced into the final output file; both carry the CyCOG number.
for snippet_path, keyword in (("lgnd.txt", "DATASET_LABEL"), ("title.txt", "LEGEND_TITLE")):
    with open(snippet_path, "w") as snippet:
        snippet.write(f'{keyword}\t{CYCOG}')

In [None]:
# Concatenate the template pieces, the generated snippets and the presence table
# (in this order) into data/<CYCOG>/<CYCOG>_labels.txt, one newline after each piece.
file_list = ['genelabel_temp1.txt', 'lgnd.txt', 'genelabel_temp3.txt', 'title.txt', 'genelabel_temp5.txt', cycog_check]
output_file = os.path.join(f'data/{CYCOG}/', f'{CYCOG}_labels.txt')

# A newly chosen CyCOG may not have an output directory yet; create it so open() does not fail.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w') as outfile:
    for fname in file_list:
        with open(fname, 'r') as infile:
            outfile.write(infile.read())
            outfile.write('\n')

# Remove only the intermediates generated by this notebook; the genelabel_temp*.txt templates are kept.
for generated in ('lgnd.txt', 'title.txt', cycog_check):
    os.remove(generated)

# Prune genomic trees to only include branches in the Berube et al (2018) reference genomes
The code below this point is not part of the routine workflow. It is meant to be run once, or a few times with small edits, to prune the given trees or to generate clade labels for species trees.

## Inputs: 
- Newick files of genome phylogenies (from Paul)
- Names of genomes in CyCOGv6
- Clades of genomes in CyCOGv6

## Outputs:
- Pruned newick files of genome phylogenies
- Label files for visualizing pruned phylogenies


In [3]:
# input files
# Both names are also assigned, with the same values, in the configuration cell near the top of the notebook.

GENOMES = 'data/0/cycogsgenomes.tsv'  # tab-separated genome table
CLADE_MAP = 'data/0/updated-genome-clades.csv'  # comma-separated genome-to-clade map

In [4]:
# Load the genome table and the clade map.
clade_df = pd.read_csv(CLADE_MAP)
genomes_df = pd.read_csv(GENOMES, sep='\t')

# Attach clade columns to the genome table by matching IMG_ID against IMGGenomeID.
# Left join: genomes without a clade entry are kept with missing clade values.
# Note that label_df is rebound here and is now built from the genome table.
label_df = genomes_df.merge(
    clade_df,
    how='left',
    left_on='IMG_ID',
    right_on='IMGGenomeID',
)

label_df

Unnamed: 0,IID,GROUP,IMG_ID,TYPE,JGI_GENOMEPORTAL_NAME,Completeness,IMGGenomeID,UpdatedIMGGenomeID,Clade
0,AG-311-D23,Prochlorococcus,2716884681,SAG,Uncultured_Prochlorococcus_sp._AG-311-D23,72.96,2.716885e+09,2.716885e+09,LLI
1,AG-311-I02,Prochlorococcus,2716884682,SAG,Uncultured_Prochlorococcus_sp._AG-311-I02,11.16,2.716885e+09,2.716885e+09,LLI
2,AG-311-I09,Prochlorococcus,2716884683,SAG,Uncultured_Prochlorococcus_sp._AG-311-I09,47.41,2.716885e+09,2.716885e+09,HLI
3,AG-311-J05,Prochlorococcus,2716884684,SAG,Uncultured_Prochlorococcus_sp._AG-311-J05,34.64,2.716885e+09,2.716885e+09,HLI
4,AG-311-J23,Prochlorococcus,2716884685,SAG,Uncultured_Prochlorococcus_sp._AG-311-J23,77.90,2.716885e+09,2.716885e+09,LLI
...,...,...,...,...,...,...,...,...,...
765,S-ShM2,Virus,651703106,ISOLATE,Synechococcus_phage_S-ShM2,0.00,,,
766,Syn19,Virus,651703107,ISOLATE,Synechococcus_phage_Syn19,0.00,,,
767,Syn5,Virus,641201056,ISOLATE,Synechococcus_phage_Syn5,0.00,,,
768,metaG-MbCM1,Virus,2595698410,ISOLATE,Synechococcus_phage_metaG-MbCM1,0.00,,,


In [5]:
# Load the three genome phylogenies (Newick format); the file names differ only in their prefix.
_tree_path_template = 'data/0/phylogeny/trees/{}_5PERCto424cycogs_50percgaps_100boot_barnacle.nwk'

hl_tree, ll_tree, syn_tree = (
    Phylo.read(_tree_path_template.format(prefix), 'newick')
    for prefix in ('HLprochlorococcus', 'LLprochlorococcus', 'synechococcus')
)


In [9]:
# prune leaves from tree that aren't in CyCOGv6
# Phylo submodule of biopython: https://biopython.org/wiki/Phylo
# Tree.prune() method: https://biopython.org/docs/dev/api/Bio.Phylo.BaseTree.html#Bio.Phylo.BaseTree.TreeMixin.prune
# tutorial that mentions prune(): https://biopython-tutorial.readthedocs.io/en/latest/notebooks/13%20-%20Phylogenetics%20with%20Bio.Phylo.html

# Work on a deep copy so the tree loaded from disk is left unmodified.
tree_copy = copy.deepcopy(ll_tree) # could be ll_tree, hl_tree or syn_tree

# Build the lookup once instead of once per leaf; a set gives constant-time membership tests.
known_genomes = set(label_df['IID'])

# get_terminals() returns a list, so pruning while looping over it is safe.
for leaf in tree_copy.get_terminals():
    if leaf.name not in known_genomes:
        # Prune the leaf object itself rather than searching by name, so duplicate
        # or missing names cannot make prune() pick a different node.
        tree_copy.prune(leaf)

# Define the output file path
output_file = 'data/0/phylogeny/trees/ll_pruned.newick' # change basename if needed

# Write the pruned tree to the file in Newick format
Phylo.write(tree_copy, output_file, 'newick')

print(f'Tree saved to {output_file}')


Tree saved to data/0/phylogeny/trees/ll_pruned.newick


# Generate label file for clades

In [None]:
# Collect the leaf (genome) names of each tree, keyed by group.
trees_by_group = {'HL': hl_tree, 'LL': ll_tree, 'Syn': syn_tree}
genome_lists = {}
for group, tree in trees_by_group.items():
    genome_lists[group] = [leaf.name for leaf in tree.get_terminals()]


def _clades_for(group_name):
    """Return the unique Clade values of the label_df rows whose IID is a leaf of that group's tree."""
    in_tree = label_df['IID'].isin(genome_lists[group_name])
    return label_df.loc[in_tree, 'Clade'].unique()


syn_clades = _clades_for('Syn')
ll_clades = _clades_for('LL')
hl_clades = _clades_for('HL')

In [49]:
# Inspect the clades found for the HL tree.
# If a clade does not belong in the taxa (e.g. it is only present as an outgroup),
# uncomment the line below to drop it by position; adjust the index to the entry to remove.
# hl_clades = np.delete(hl_clades, 5)

hl_clades

array(['HLI', 'HLIII', 'HLII', 'HLVI', 'HLII.HLVI', 'HLIV'], dtype=object)

In [12]:
# Universal clade -> hex colour legend, shared by every tree so that a clade
# keeps the same colour everywhere. "Blank", "" and "Unclassified" share one colour.
clade_lgnd = {
    # entries starting with 5.x
    "5.2": "#003D30",
    "5.3": "#00E5F8",
    "5.1A-CRD2": "#5A0A33",
    "5.1A-II": "#005745",
    "5.1A-III": "#810D49",
    "5.1A-IV": "#00735C",
    "5.1A-UC-A-EnvC": "#AB0D61",
    "5.1A-unclassified": "#009175",
    "5.1A-WPC1": "#D80D7B",
    "5.1B-CRD1": "#00AF8E",
    "5.1B-I": "#FF2E95",
    "5.1B-IX": "#00CBA7",
    "5.1B-V": "#FF78AD",
    "5.1B-VI": "#00EBC1",
    "5.1B-VIII": "#FFACC6",
    # entries starting with HL
    "HLI": "#86FFDE",
    "HLII": "#FFD7E1",
    "HLII.HLVI": "#00306F",
    "HLIII": "#460B70",
    "HLIII.HLIV.HLV": "#00489E",
    "HLIV": "#6B069F",
    "HLVI": "#005FCC",
    # entries starting with LL
    "LLI": "#8E06CD",
    "LLI.LLVIII": "#0079FA",
    "LLI.LLIII": "#B40AFC",
    "LLIV": "#009FFA",
    "LLVII": "#ED0DFD",
    "LLVIII": "#00C2F9",
    # remaining entries
    "Blank": "#FF66FD",
    "": "#FF66FD",
    "LLII.LLIII": "#CDE494",
    "Unclassified": "#FF66FD",
    "AMZ-II": "#AA8F66",
}

##### To generate clade labels, the code below can be re-run for each of the three groups of cyanobacteria.

In [51]:
# Restrict the universal legend to the clades present in hl_clades, keeping their order.
# Clades without an entry in clade_lgnd are skipped. Re-running overwrites the previous result.
filtered_clade_lgnd = {}
for clade_name in hl_clades:
    if clade_name in clade_lgnd:
        filtered_clade_lgnd[clade_name] = clade_lgnd[clade_name]

print(filtered_clade_lgnd)

{'HLI': '#86FFDE', 'HLIII': '#460B70', 'HLII': '#FFD7E1', 'HLVI': '#005FCC', 'HLII.HLVI': '#00306F', 'HLIV': '#6B069F'}


In [52]:
# Build the legend section from filtered_clade_lgnd and splice it between the two
# static template pieces to produce the template file used by the following cells.
output_matches = "species_clade.tsv"
template_file = "clade_template_fin.txt"
marker = "clades"

# Three tab-separated legend rows with one column per clade. Every cell is followed
# by a tab (including the last one), matching the previous file layout.
shapes_row = 'LEGEND_SHAPES\t' + ''.join('1\t' for _ in filtered_clade_lgnd)
labels_row = 'LEGEND_LABELS\t' + ''.join(f'{key}\t' for key in filtered_clade_lgnd)
colors_row = 'LEGEND_COLORS\t' + ''.join(f'{value}\t' for value in filtered_clade_lgnd.values())
legend_block = shapes_row + '\n' + labels_row + '\n' + colors_row

# ones.txt is still written because the final cleanup cell removes it. The former
# clades.txt / labels.txt scratch files are no longer needed: the rows are built in memory.
with open("ones.txt", "w") as legend_file:
    legend_file.write(legend_block)

with open('clade-template1.txt', 'r') as clade1:
    content1 = clade1.read()
with open('clade-template3.txt', 'r') as clade3:
    content3 = clade3.read()

# Concatenate the contents with newline characters
cladestogether = content1 + "\n" + legend_block + '\n' + content3

# Write the combined content to the template file
with open(template_file, 'w') as template_clade:
    template_clade.write(cladestogether)

In [53]:
# Use label_df to write a tab-separated ID/LABEL table: one row per genome whose
# clade is in hl_clades, labelled with that clade's colour.
with open(output_matches, 'w', newline='') as outfile:  # 'w' starts a fresh file on every run
    writer = csv.DictWriter(outfile, fieldnames=['ID', 'LABEL'], delimiter='\t')
    writer.writeheader()

    for _, genome_row in label_df.iterrows():
        leaf_id = genome_row['IID']
        clade = genome_row['Clade']

        # Genomes whose clade is not part of this tree's clade list are left out.
        if clade not in hl_clades:
            continue

        if pd.isna(clade):
            colour = "#CA6702"  # colour used for a missing (NaN) clade
        else:
            colour = clade_lgnd.get(clade, '#FF66FD')  # fallback colour when the clade has no legend entry

        writer.writerow({'ID': leaf_id, 'LABEL': colour})

print(output_matches, "generated successfully.")


syn_clade.tsv generated successfully.


In [54]:
# Complete the iTOL annotation file: template first, then the leaf/colour table,
# separated by a single newline.
output_template_file = 'hl_species_clades.txt'

with open(output_matches, 'r') as output_f:
    output_content = output_f.read()
with open(template_file, 'r') as template_f:
    template_content = template_f.read()

combined_content = "\n".join((template_content, output_content))

with open(output_template_file, 'w') as combined_f:
    combined_f.write(combined_content)

# Clean the working directory of the intermediate legend/template files.
for leftover in ('ones.txt', 'clade_template_fin.txt'):
    os.remove(leftover)

print("annotation file generated successfully:", output_template_file, '\n')

# The leaf/colour table is deleted as well, so it has to be regenerated before
# this cell can be run again.
os.remove(output_matches)
print ("Old outputs removed to clean directory. To run this block again generate a new output_matches file.")



annotation file generated successfully: hl_species_clades.txt 

Old outputs removed to clean directory. To run this block again generate a new output_matches file.
