This notebook checks for the presence of a CyCOG within species trees of groups of cyanobacteria. A companion notebook builds gene trees; this notebook instead generates label files that mark, on premade species trees, the genomes in which the gene is found.

In [1]:
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
import csv

In [38]:
# Pick CyCOG to generate labels for.
# This is the integer that follows the underscore in the `cycog_iid` column
# (e.g. 60001883 for "CyCOG_60001883"); it is also used to build the names of
# the files this notebook writes.
CYCOG = 60001883

In [39]:
# Set dependencies: relative paths to the input files, resolved against the
# directory the notebook is run from.
REFS = '../data/genomes/'  # NOTE(review): not referenced by the cells shown here - confirm it is still needed
CYCOGS = 'data/0/serralysin_cycog_references.csv'  # NOTE(review): not referenced by the cells shown here - confirm it is still needed
CYCOG_LIST = 'data/0/cycogs.tsv'  # tab-separated; read for its `cycog_iid` and `cycog_genes` columns
GENOMES = 'data/0/cycogsgenomes.tsv'  # tab-separated; joined on `IID`, supplies `IMG_ID`
CLADE_MAP = 'data/0/updated-genome-clades.csv'  # comma-separated; joined on `IMGGenomeID`
SEQ = "faa"  # NOTE(review): not referenced by the cells shown here - confirm it is still needed

In [4]:
# Build `label_df`: one row per gene (protein) in the selected CyCOG, annotated
# with the genome it belongs to and that genome's clade.

# 1 & 2: extract protein IDs and associated genome names for all proteins in the CyCOG
cycog_df = pd.read_csv(CYCOG_LIST, sep="\t")

# `cycog_iid` looks like "CyCOG_60001883"; compare on the integer after the underscore.
cycog_numbers = cycog_df['cycog_iid'].str.split('_').str[1].astype(int)
cycog_matches = cycog_df[cycog_numbers == CYCOG]
if cycog_matches.empty:
    # Fail loudly: otherwise a `protein_list` left over from a previous run in
    # the same kernel would silently be reused for the wrong CyCOG.
    raise ValueError(f"CyCOG {CYCOG} was not found in {CYCOG_LIST}")

cycog_row = cycog_matches.iloc[-1]  # if several rows match, the last one wins
print(cycog_row)
protein_list = cycog_row['cycog_genes'].split(',')  # comma-separated protein IDs

label_df = pd.DataFrame(protein_list, columns=['protein_id'])  # one row per protein ID
# Protein IDs have the form "<genome_name>_<gene_id>". Genome names can contain
# other separators (e.g. "AG-402-K04"), so split once, from the right.
# rsplit documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.rsplit.html#pandas.Series.str.rsplit
label_df[['genome_name', 'gene_id']] = label_df['protein_id'].str.rsplit(pat='_', n=1, expand=True)

# 3: join genome id on genome name
# Left joins keep every protein row even when no genome/clade record matches.
genomes_df = pd.read_csv(GENOMES, sep='\t')  # genome metadata, keyed by `IID`
label_df = pd.merge(label_df, genomes_df, left_on='genome_name', right_on='IID', how='left')

# 4: join clade on genome id
clade_df = pd.read_csv(CLADE_MAP)  # genome IDs and clade information
label_df = pd.merge(label_df, clade_df, left_on='IMG_ID', right_on='IMGGenomeID', how='left')

label_df

cycog_iid                                                    CyCOG_60001883
cycog_num_taxa                                                          127
cycog_num_genes                                                         507
cycog_num_duplications                                                  380
cycog_num_pro                                                           495
cycog_num_syn                                                            12
cycog_num_phage                                                           0
cycog_cns_product                     protein of unknown function (DUF4214)
cycog_genes               MIT1342_2682247553,MIT1214_2684587420,AG-402-K...
Name: 1882, dtype: object


Unnamed: 0,protein_id,genome_name,gene_id,IID,GROUP,IMG_ID,TYPE,JGI_GENOMEPORTAL_NAME,Completeness,IMGGenomeID,UpdatedIMGGenomeID,Clade
0,MIT1342_2682247553,MIT1342,2682247553,MIT1342,Prochlorococcus,2681812950,ISOLATE,Prochlorococcus_sp._MIT1342,100.00,2681812950,2846840728,LLIV
1,MIT1214_2684587420,MIT1214,2684587420,MIT1214,Prochlorococcus,2681813567,ISOLATE,Prochlorococcus_sp._MIT1214,99.18,2681813567,2681813567,LLI
2,AG-402-K04_2717283690,AG-402-K04,2717283690,AG-402-K04,Prochlorococcus,2716884382,SAG,Uncultured_Prochlorococcus_sp._AG-402-K04,73.97,2716884382,2716884382,LLI
3,AG-402-I21_2667725162,AG-402-I21,2667725162,AG-402-I21,Prochlorococcus,2667527261,SAG,Prochlorococcus_sp._AG-402-I21,92.53,2667527261,2667527261,LLI
4,AG-683-C23_2717562322,AG-683-C23,2717562322,AG-683-C23,Synechococcus,2716884628,SAG,Uncultured_Synechococcus_sp._AG-683-C23,74.77,2716884628,2716884628,5.1A-IV
...,...,...,...,...,...,...,...,...,...,...,...,...
502,AG-402-M18_2667745417,AG-402-M18,2667745417,AG-402-M18,Prochlorococcus,2667527273,SAG,Prochlorococcus_sp._AG-402-M18,89.43,2667527273,2667527273,LLI
503,NATL1A_2624155641,NATL1A,2624155641,NATL1A,Prochlorococcus,2623620348,ISOLATE,Prochlorococcus_sp._NATL1A,99.73,2623620348,640069325,LLI
504,AG-402-I20_2667722529,AG-402-I20,2667722529,AG-402-I20,Prochlorococcus,2667527260,SAG,Prochlorococcus_sp._AG-402-I20,90.81,2667527260,2667527260,LLI
505,AS9601_2626308727,AS9601,2626308727,AS9601,Prochlorococcus,2623620959,ISOLATE,Prochlorococcus_sp._AS9601,99.64,2623620959,640069321,HLII


In [34]:
# Write a tab-separated table with an ID/LABEL header, followed by one row per
# entry of label_df['genome_name'] (the leaf ID), each carrying the same colour
# in the LABEL column.
cycog_check = f"{CYCOG}-check.tsv"
label_rows = ({'ID': leaf_id, 'LABEL': '#61C9A8'} for leaf_id in label_df['genome_name'])
with open(cycog_check, 'w', newline='') as tsv_handle:  # 'w' creates/overwrites the file
    tsv_writer = csv.DictWriter(tsv_handle, fieldnames=['ID', 'LABEL'], delimiter='\t')
    tsv_writer.writeheader()  # header goes first, exactly once
    tsv_writer.writerows(label_rows)

In [35]:
# Write the two single-line snippets (no trailing newline) that are spliced
# into the final output file: file name -> text to store in it.
header_snippets = {
    "lgnd.txt": f'DATASET_LABEL\t{CYCOG}',
    "title.txt": f'LEGEND_TITLE\t{CYCOG}',
}
for snippet_name, snippet_text in header_snippets.items():
    with open(snippet_name, "w") as snippet_file:
        snippet_file.write(snippet_text)

In [37]:
# Concatenate the template pieces and the files generated above into the final
# labels file, then delete the intermediate files to keep the directory clean.
# The genelabel_temp*.txt files are not created by the cells shown here and
# must already exist in the working directory.
file_list = ['genelabel_temp1.txt', 'lgnd.txt', 'genelabel_temp3.txt', 'title.txt', 'genelabel_temp5.txt', cycog_check]

output_dir = f'data/{CYCOG}/'
# The per-CyCOG folder may not exist yet (e.g. first run for a new CyCOG);
# without this, opening the output file raises FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f'{CYCOG}_labels.txt')

with open(output_file, 'w') as outfile:
    for fname in file_list:
        with open(fname, 'r') as infile:
            outfile.write(infile.read())
            outfile.write('\n')  # each piece is followed by a newline in the output

# Only the files generated by this notebook are removed, and only after the
# output was written successfully; the templates are kept.
for generated_file in ('lgnd.txt', 'title.txt', cycog_check):
    os.remove(generated_file)