# Targeted Cluster Analysis using cBLASTER

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import glob
from Bio import Phylo
from Bio import SeqIO
import math

## Proejct initiation stage
Define project name, rest will be created automatically when code is exicuted in the working directory

In [2]:
'''Define project name'''
project_name="Streptomyces_Azoxy_Distribution"
'''Define location for cblaster database'''
dir_cblaster_db = "custom_db/"+"NCBI_P8A2_20230216/"

'''Defult directory layout'''
input_dir="input/"+project_name+"/"
output_dir="output/"+project_name+"/"
dir_clinker = output_dir+"clinker/"
dir_cblaster = output_dir+"cblaster/"
dir_iTOL = output_dir+"iTOL/"

'''Automated creation of directories'''
directories = (input_dir,output_dir, dir_clinker, dir_cblaster, dir_iTOL)
'''Check whether the specified path exists or not. If dir doesnt exist, it is created'''
for path in directories:
   isExist = os.path.exists(path)
   if not isExist:
      # Create a new directory because it does not exist
      os.makedirs(path)
      print("Directory is created - "+path)

### Insert relevent files into the project input directory
Following files are accepted:
- Files containing complete gene cluster which will be passed through clinker to find gene pairs: ***.gbk** and/or ***.gb** format
- any protein fasta file that should be included in analysis without gene grouping, must contain only cluster specific genes as they will be latter added to the list of core/tailoring genes as singletons: protein ***.fasta** format.
- List of protein IDs that have been characterised, list will be created automatically from fasta files in the folder, or can be done manually. The list will be checked against clinker created gene groups and cluster specific gene ids will be added to the list of groups --> **characterised_protein_IDs.csv** file


## Definitions used inside the project, generated by chatGPT

In [3]:
def export_protein_ids(gbk_file):
    protein_ids = []

    # Read the GenBank file and extract protein IDs
    with open(gbk_file, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            for feature in record.features:
                if feature.type == "CDS" and "protein_id" in feature.qualifiers:
                    protein_ids.append(feature.qualifiers["protein_id"][0])

    return protein_ids

def genbank_to_fasta(input_file, output_file):
    # Open the input GenBank file
    with open(input_file, "r") as handle:
        # Parse the GenBank file and store the records
        records = list(SeqIO.parse(handle, "genbank"))

    # Open the output FASTA file
    with open(output_file, "w") as handle:
        # Iterate over the records and write protein sequences to the FASTA file
        for record in records:
            for feature in record.features:
                if feature.type == "CDS":
                    if "translation" in feature.qualifiers:
                        protein_sequence = feature.qualifiers["translation"][0]
                        handle.write(f">{feature.qualifiers['protein_id'][0]}\n")
                        handle.write(f"{protein_sequence}\n")

def rgb_to_hex(rgb):
    # Validate RGB values
    if len(rgb) != 3:
        raise ValueError("RGB values must be a list or tuple of length 3.")
    # Convert RGB to hexadecimal
    hex_color = '#{:02x}{:02x}{:02x}'.format(*rgb)
    return hex_color


## Creation of gene groups using clinker and populating it using list of cluster specific genes eather as protein.fasta file or list of protein IDs in provided .csv file

In [4]:
'''Running clinker on *.gb and *.gbk files in the project input directory. Outputs are generated in clinker folder located in output project directory.'''
if not os.listdir(dir_clinker):
    os.environ["dir_clinker"] = dir_clinker
    os.environ["input_dir"] = input_dir
    !bash shell_script_clinker.sh
else:
    print("Directory is not empty. Skipping the step.")

Directory is not empty. Skipping the step.


In [5]:
# Code written together with thom to create the gene matrix automatically
df_genes_list = pd.read_csv(dir_clinker +'alignments.csv',header=None,usecols=[0,1]).dropna()
df_genes_list.rename( columns={0 :'Pair_gene_1'}, inplace=True )
df_genes_list.rename( columns={1 :'Pair_gene_2'}, inplace=True )
# Create an empty list
genes_list =[]
# Iterate over each row
for rows in df_genes_list.itertuples():
    # Create list for the current row
    each_pair =[rows.Pair_gene_1, rows.Pair_gene_2]
    # append the list to the final list
    genes_list.append(each_pair)

gene_group_list = [{genes_list[0][0]}]
for genes in genes_list:
    genes_added = False
    for gene_group in gene_group_list:
        if genes[0] in gene_group or genes[1] in gene_group:
            gene_group.add(genes[1])
            gene_group.add(genes[0])
            genes_added = True
    if genes_added == False:
        gene_group_list.extend([{genes[0]}])
        for gene_group in gene_group_list:
            if genes[0] in gene_group or genes[1] in gene_group:
                gene_group.add(genes[1])
                gene_group.add(genes[0])
sorted(gene_group_list)
gene_group_list.sort(key = len, reverse=True)

gene_group_list

[{'AAN10236.1', 'PRJNA935111:PU648_09280', 'WP_037654301.1'},
 {'AAN10237.1', 'PRJNA935111:PU648_09235', 'WP_037654320.1'},
 {'AAN10238.1', 'PRJNA935111:PU648_09285', 'WP_037654315.1'},
 {'AAN10239.1', 'PRJNA935111:PU648_09255', 'WP_154822324.1'},
 {'AAN10240.1', 'PRJNA935111:PU648_09275', 'WP_037654313.1'},
 {'AAN10241.1', 'WP_037654308.1'},
 {'AAN10246.1', 'WP_037654300.1'},
 {'AAN10247.1', 'WP_037654338.1'},
 {'AAN10248.1', 'WP_125207584.1'},
 {'AAN10249.1', 'WP_037654307.1'},
 {'PRJNA935111:PU648_09245', 'WP_037654318.1'},
 {'PRJNA935111:PU648_09250', 'WP_037654316.1'},
 {'PRJNA935111:PU648_09270', 'WP_051850081.1'}]

### Adding singletons to the generated pairs from fasta file and custom .csv list

In [6]:
'''Addition of known cluster specific tailoring genes to the gene list is done here, these genes are essential for specific cluster that are not shared between the clusters'''
'''The protein IDs are taken from protein fasta file of characterized enzymes for each BGC, list must be provided manually as input'''

'''firstly we translate the protein_fasta_file into csv list of protein IDs'''
'''Note: this can be skipped if file is manually populated with IDs'''
# Create an empty list to store sequence names from fasta file
protein_sequence_ids = []

# Loop through each fasta file in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".fasta"):
        # Open and read the current FASTA file
        with open(os.path.join(input_dir, filename), "r") as fasta_file:
            for record in SeqIO.parse(fasta_file, "fasta"):
                # Extract sequence name and add to the list
                protein_sequence_id = record.id
                protein_sequence_ids.append(protein_sequence_id)

'''Add protein IDs from the fasta file to predefined core genes'''
df_fasta_protein_IDs = pd.DataFrame({"protein_id": protein_sequence_ids})
df__manual_protein_IDs = pd.read_csv(input_dir+"characterised_protein_IDs.csv")

df_protein_ids_merged = pd.merge(df_fasta_protein_IDs, df__manual_protein_IDs, on='protein_id', how='outer')

In [7]:
'''Filtering of singelton sets from clinker detected pairs and adding unique ones from df_protein_ids_merged to the gene list'''
# Initialize an empty list to store the singleton set
gene_singleton_list_of_sets = []
# Iterate through the sequence names in the DataFrame and create singleton set
for protein_id in df_protein_ids_merged["protein_id"]:
    singleton_set = {protein_id}
    gene_singleton_list_of_sets.append(singleton_set)

'''make clinker detected gene pair list of sets into single set list to filter out singleton set from existing values'''
clinker_protein_id_set = set()
for group in gene_group_list:
    for gene in group:
        clinker_protein_id_set.add(gene)

'''filtering of singelton set and joining them to clinker pairs'''
for singelton_set in gene_singleton_list_of_sets:
    filtered_set = singelton_set.difference(clinker_protein_id_set)
    if filtered_set:
        gene_group_list.append(filtered_set)


print("Overview of protein pairs")
gene_group_list

Overview of protein pairs


[{'AAN10236.1', 'PRJNA935111:PU648_09280', 'WP_037654301.1'},
 {'AAN10237.1', 'PRJNA935111:PU648_09235', 'WP_037654320.1'},
 {'AAN10238.1', 'PRJNA935111:PU648_09285', 'WP_037654315.1'},
 {'AAN10239.1', 'PRJNA935111:PU648_09255', 'WP_154822324.1'},
 {'AAN10240.1', 'PRJNA935111:PU648_09275', 'WP_037654313.1'},
 {'AAN10241.1', 'WP_037654308.1'},
 {'AAN10246.1', 'WP_037654300.1'},
 {'AAN10247.1', 'WP_037654338.1'},
 {'AAN10248.1', 'WP_125207584.1'},
 {'AAN10249.1', 'WP_037654307.1'},
 {'PRJNA935111:PU648_09245', 'WP_037654318.1'},
 {'PRJNA935111:PU648_09250', 'WP_037654316.1'},
 {'PRJNA935111:PU648_09270', 'WP_051850081.1'},
 {'KP687747'},
 {'KP687746'},
 {'KP687745'},
 {'KP687744'},
 {'KP687743'},
 {'KP687742'},
 {'KP687741'},
 {'KP687740'},
 {'KP687739'},
 {'KP687738'},
 {'KP687737'},
 {'KP687736'},
 {'KP687735'},
 {'KP687734'},
 {'KP687733'},
 {'AAN10242.1'},
 {'AAN10243.1'},
 {'AAN10244.1'},
 {'PRJNA935111:PU648_09290'},
 {'PRJNA935111:PU648_09240'},
 {'PRJNA935111:PU648_09260'},
 {'PR

In [8]:
'''To have better overview, for parameter estimate, the number of genes contained in each group is calculated below'''
from collections import defaultdict
counts = defaultdict(int)
for main_group in gene_group_list:
    counts[sum(gene in group for group in gene_group_list for gene in main_group)] += 1
counts = dict(counts)

print("Number of genes (which are contained by) number of groups")
for i in range(1, max(counts)+1):
    print(counts.get(i, 0), i)

Number of genes (which are contained by) number of groups
23 1
8 2
5 3


### Parameter selection for filtering of cblaster results

In [9]:
# dependencies:
'''These variables are used for filtering of cblaster table based on list of core genes and list of tailoring genes shared between input clusters'''
min_genes_shared_to_be_core = 3         # Largest number of genes in a group
minimum_core_genes = 4                  # Set to 4 as max is 5, we are not sure if all 5 are required
minimum_gene_hits = 5           # set to 5

In [10]:
'''Genes are classified in three groups, one group contains all of the rarget genes, the secound group contains only core genes and the final group contains all genes that are not core.'''
target_gene_list = []
core_genes_list = []
tailoring_genes_list = []
for group in gene_group_list:
    for gene in group:
            target_gene_list.append(gene)
    if len(group) == min_genes_shared_to_be_core:
        for core_gene in group:
            core_genes_list.append(core_gene)
    else:
        for tailoring_gene in group:
            tailoring_genes_list.append(tailoring_gene)
target_gene_list.sort()
core_genes_list.sort()
tailoring_genes_list.sort()
print("Target gene list:")
print(target_gene_list)
print("Core gene list:")
print(core_genes_list)
print("Tailoring gene list:")
print(tailoring_genes_list)

Target gene list:
['AAN10236.1', 'AAN10237.1', 'AAN10238.1', 'AAN10239.1', 'AAN10240.1', 'AAN10241.1', 'AAN10242.1', 'AAN10243.1', 'AAN10244.1', 'AAN10246.1', 'AAN10247.1', 'AAN10248.1', 'AAN10249.1', 'KP687733', 'KP687734', 'KP687735', 'KP687736', 'KP687737', 'KP687738', 'KP687739', 'KP687740', 'KP687741', 'KP687742', 'KP687743', 'KP687744', 'KP687745', 'KP687746', 'KP687747', 'PRJNA935111:PU648_09235', 'PRJNA935111:PU648_09240', 'PRJNA935111:PU648_09245', 'PRJNA935111:PU648_09250', 'PRJNA935111:PU648_09255', 'PRJNA935111:PU648_09260', 'PRJNA935111:PU648_09270', 'PRJNA935111:PU648_09275', 'PRJNA935111:PU648_09280', 'PRJNA935111:PU648_09285', 'PRJNA935111:PU648_09290', 'PRJNA935111:PU648_09325', 'PRJNA935111:PU648_09360', 'WP_037654300.1', 'WP_037654301.1', 'WP_037654307.1', 'WP_037654308.1', 'WP_037654313.1', 'WP_037654315.1', 'WP_037654316.1', 'WP_037654318.1', 'WP_037654320.1', 'WP_037654338.1', 'WP_051850081.1', 'WP_125207584.1', 'WP_154822324.1']
Core gene list:
['AAN10236.1', 'AA

In [11]:
'''define prephase code of protein_IDs for each imput BGC manually'''

BGC_protein_ID_codes_dict = {
    'Azodyrecin': 'PRJNA935111:PU648_',
    'Azoxymycin': 'KP',
    'KA57A': 'WP_',
    'Valanimycin': 'AAN'
}

# Printing the dictionary
print(BGC_protein_ID_codes_dict)

{'Azodyrecin': 'PRJNA935111:PU648_', 'Azoxymycin': 'KP', 'KA57A': 'WP_', 'Valanimycin': 'AAN'}


## cblaster

In [12]:
'''all of the input files are combined in one protein fasta file. Following code translates each of the GenBank files to fasta files and merge them into single input file for cblaster'''
'''Translation of gbk to fasta files'''
list_of_GenBank_files = []
for file in os.listdir(input_dir):
    if file.endswith(".gb") or file.endswith(".gbk"):
        list_of_GenBank_files.append(os.path.join(input_dir,file))

list_of_fasta_files = []
for file in os.listdir(input_dir):
    if file.endswith(".gb") or file.endswith(".gbk"):
        file=file.replace('.gbk','.fasta')
        file=file.replace('.gb','.fasta')
        list_of_fasta_files.append(os.path.join(dir_cblaster,file))


for genbank_input, fasta_output in zip(list_of_GenBank_files,list_of_fasta_files):
    genbank_to_fasta(genbank_input, fasta_output)

'''Merge fasta files into one input fasta file for cblaster'''
for file in os.listdir(input_dir):
    if file.endswith(".fasta"):
        list_of_fasta_files.append(os.path.join(input_dir,file))
list_of_fasta_files

fasta_sequences = []
for file in list_of_fasta_files:
    with open(file, 'r') as fasta_file:
        fasta_sequences.extend(SeqIO.parse(fasta_file, 'fasta'))

merged_fasta_file = dir_cblaster+"merged_fasta_file_cblaster_input.fasta"
with open(merged_fasta_file, 'w') as output_fasta:
    SeqIO.write(fasta_sequences, output_fasta, 'fasta')

In [13]:
'''running cblaster'''
if not os.path.exists(dir_cblaster+"cblaster_output.csv"):
    '''parameter import from notebook to bash enviroment'''
    os.environ["input_fasta"] = merged_fasta_file
    os.environ["output"] = dir_cblaster+"cblaster_output"
    os.environ["minimum_hits"] = str(minimum_gene_hits)
    os.environ["cblaster_db"] = dir_cblaster_db
    '''cblaster run'''
    !bash shell_script_cblaster.sh
else:
    print('Directory contains cblaster cblaster_output.csv file. Skipping the step.')

Directory contains cblaster cblaster_output.csv file. Skipping the step.


### Filtering of cblaster output
Firstly we reduce table to contain only marked genes in the **gene_group_list** and we keep only the top result for homologous gene hits from different BGCs, keeping only the cloesest identity match.

Then we assign which BGC the cluster belongs to based on previously manually defined **BGC_protein_ID_codes_dict**

Finally we do filtering of the table by firstly counting up the gene and core gene hits and then filtering it based on the definitions entered in **minimum_core_genes** and **minimum_gene_hits**.

In [14]:
'''Define BGCs that are to be excluded from core gene requirement filtering, meaning they dont have hits in clinker'''
excluded_from_core_req_dict = {
    'Azodyrecin': 'include',
    'Azoxymycin': 'exclude',
    'KA57A': 'include',
    'Valanimycin': 'include'
}

'''define list of genes that must be contained in BGCs excluded from core filtering'''
custom_required_genes_for_non_core_filt = [
    'KP687735'
]
number_of_req_custom_genes = 1

In [15]:
'''load in cblaster output'''
cblaster_output = pd.read_csv(dir_cblaster+"cblaster_output.csv")

'''make blank pandas dataframe containing only gene group lists'''
columns_to_keep = list(cblaster_output)[:5] + target_gene_list
columns_to_drop = [col for col in cblaster_output.columns if col not in columns_to_keep]
df_cblaster = cblaster_output.drop(columns=columns_to_drop)
df_cblaster

'''remove columns that do not have sufficient number of gene hits from target_gene_list'''
df_cblaster['gene hits'] = df_cblaster[target_gene_list].count(axis=1)
df_cblaster = df_cblaster[df_cblaster['gene hits'] >= minimum_gene_hits]

for group in gene_group_list:
    if len(group) >1:
        group_list = list(group)
        max_values = df_cblaster[group_list].max(axis=1)
        for protein_id in group:
            df_cblaster.loc[df_cblaster[protein_id] != max_values, protein_id] = np.nan
df_cblaster = df_cblaster.replace(0, np.nan)

for bgc, protein_ID_key in BGC_protein_ID_codes_dict.items():
    relevant_columns = [column for column in df_cblaster.columns if column.startswith(protein_ID_key)]
    df_cblaster[bgc+' gene hits'] = df_cblaster[relevant_columns].count(axis=1)
    df_cblaster[bgc+' %input'] = df_cblaster[bgc+' gene hits']/len(relevant_columns)

bgc_list = []
for bgc in BGC_protein_ID_codes_dict:
    bgc_list.append(bgc+' %input')
df_cblaster['bgc'] = df_cblaster[bgc_list].idxmax(axis=1)
df_cblaster['bgc'] = df_cblaster['bgc'].str.replace(' %input', '')

'''count number of core genes'''
df_cblaster['core gene hits'] = df_cblaster[core_genes_list].count(axis=1)
'''Removing genomes with insuficient core gene hits from list of genomes that should be filrered, except if bgc is excepted and alternative rule paplies'''
for bgc, dont_perform_core_filtering in excluded_from_core_req_dict.items():
    if dont_perform_core_filtering == 'exclude':
        df_tmp = df_cblaster[df_cblaster['bgc'] == bgc]
        df_tmp['count'] = df_tmp[custom_required_genes_for_non_core_filt].count(axis=1)
        df_tmp = df_tmp[df_tmp['count'] < number_of_req_custom_genes]
        df_cblaster = df_cblaster.drop(df_tmp.index)
    else:
        df_tmp = df_cblaster[df_cblaster['bgc'] == bgc]
        df_tmp = df_tmp[df_tmp['core gene hits'] < minimum_core_genes]
        df_cblaster = df_cblaster.drop(df_tmp.index)


'''Sorting of table'''
df_cblaster = df_cblaster.sort_values(by ='Organism', ascending = True)
df_cblaster = df_cblaster.sort_values(by ='Scaffold', ascending = True)
df_cblaster = df_cblaster.sort_values(by ='gene hits', ascending = False)
df_cblaster = df_cblaster.sort_values(by ='core gene hits', ascending = False)
df_cblaster = df_cblaster.reset_index(drop=True)

'''As for iTOL visualization only 1 cluster can be showed for single node,
we exclude worst hit for genomes with two or more hits and save it in seperate file'''
duplicated_mask = df_cblaster.duplicated('Organism', keep='first')
df_cblaster_excluded = df_cblaster[duplicated_mask]
df_cblaster = df_cblaster[~duplicated_mask]
'''saving tables to *.csv files'''
df_cblaster_excluded.to_csv(dir_cblaster+"df_cblaster_excluded.csv",index=False)
df_cblaster.to_csv(dir_cblaster+"df_cblaster.csv",index=False)

'''Creating additional table where gene rows are grouped in specific groups that will be used later in iTOL visualizations'''
#not done for now
df_cblaster

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['count'] = df_tmp[custom_required_genes_for_non_core_filt].count(axis=1)


Unnamed: 0,Organism,Scaffold,Start,End,Score,AAN10236.1,AAN10237.1,AAN10238.1,AAN10239.1,AAN10240.1,...,Azodyrecin gene hits,Azodyrecin %input,Azoxymycin gene hits,Azoxymycin %input,KA57A gene hits,KA57A %input,Valanimycin gene hits,Valanimycin %input,bgc,core gene hits
0,GCF_012922115.1,JABAQG010000001,10106302,10179481,74.1437,,,,,,...,13,1.0,0,0.000000,0,0.000000,0,0.000000,Azodyrecin,5
1,GCF_008704445.1,NZ_CP023697.1,2399106,2443830,37.0286,,,,,,...,0,0.0,0,0.000000,11,0.846154,1,0.076923,KA57A,5
2,GCF_008705135.1,NZ_CP023694.1,6954730,6992504,57.7660,,,,,,...,0,0.0,0,0.000000,12,0.923077,0,0.000000,KA57A,5
3,GCF_014650675.1,NZ_BMVK01000003,343098,384521,40.0731,,,,,,...,0,0.0,1,0.066667,10,0.769231,1,0.076923,KA57A,5
4,GCF_003112515.1,NZ_BEVZ01000002,430776,447432,27.6654,,,,,,...,0,0.0,0,0.000000,10,0.769231,1,0.076923,KA57A,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,GCF_016876065.1,c00001_NZ_JABZ..,1034874,1047113,24.6673,,,,,,...,0,0.0,14,0.933333,0,0.000000,0,0.000000,Azoxymycin,0
394,GCF_014655715.1,NZ_BNBM01000001,1175745,1203475,12.3384,,,,,,...,0,0.0,2,0.133333,0,0.000000,0,0.000000,Azoxymycin,0
395,GCF_002797835.1,NZ_PGEO01000001,6294577,6307152,26.6909,,,,,,...,0,0.0,15,1.000000,0,0.000000,0,0.000000,Azoxymycin,0
396,GCF_003935055.1,NZ_RHGC01000006,12415,75424,12.3326,,,,,,...,0,0.0,4,0.266667,1,0.076923,0,0.000000,Azoxymycin,0


# iTOL visualization and splitting data into sub-tables
- To visalize the data, create a table where Each cluster is given its own color
    - Group the most similar clusters with XX cutoff to be responsible for production of best mach cBLASTER cluster
        - Probably best to use max cluster hits / with core gene hits to get estimated ratio.
            - This can create problems if input cluster is not part of the diamond database and better would be to use clinker identifited gene coutn. Requires to make table with cblaster_input and clinker hits
    - Low similarity clusters are offcolored to be something similar to target input
    - very-low similarity clusters are graphed as other related clusters.

In [16]:
BGC_RBG_color_dict = {
    'Azodyrecin': '204, 121, 167',
    'Azoxymycin': '0, 158, 115',
    'KA57A': '86, 180, 233',
    'Valanimycin': '213, 94, 0'
}
BGC_RBG_color_dict

{'Azodyrecin': '204, 121, 167',
 'Azoxymycin': '0, 158, 115',
 'KA57A': '86, 180, 233',
 'Valanimycin': '213, 94, 0'}

## color_gradient/color_ranges
The resulting table can be used in iTOL to copy-paste data from excel file into **iTOL color gradiant class**, or **color ranges** which can only be eddited automatically using **iTOL annotation edditor** requiering extra subscription in iTOL.


Create iTOL annotation table

In [17]:
'''Creation of df to be populated with data'''
df_iTOL_cg = pd.DataFrame(columns=['Tree node (bgcflow input file name)','Color (rgba(255, 255, 255, 1.000))','Label'])
'''Populate the table with df_cblaster results'''
df_iTOL_cg['Tree node (bgcflow input file name)'] = df_cblaster['Organism'] 
df_iTOL_cg['Label'] = df_cblaster['bgc']
'''color intensity is the % of genes hits belonging to flagged bgc which is used to change intensity of rgb color'''
color_intensity = df_cblaster[bgc_list].max(axis=1)
#for bgc, rgb in BGC_RBG_color_dict.items():
#    if df_iTOL_cg['Label'] == bgc:
df_iTOL_cg['Color (rgba(255, 255, 255, 1.000))'] = df_iTOL_cg['Label'].map(BGC_RBG_color_dict)
df_iTOL_cg['Color (rgba(255, 255, 255, 1.000))'] = 'rgba(' + df_iTOL_cg['Color (rgba(255, 255, 255, 1.000))'] + ', ' + color_intensity.astype(str) + ')'

'''save file'''
df_iTOL_cg.to_csv(dir_iTOL+"color_gradient_and_color_ranges_iTOL_table.csv",index=False)

## Shape plot
Visualizes the gene group hits of core, tailoring and cluster specific fenes. The size reflext the size reflects the hgihest protein identity score from the gene group.

In [19]:
'''from cblaster table we load in  '''
df_iTOL_sp = pd.DataFrame()
df_iTOL_sp['Tree node ID'] = df_cblaster['Organism']
df_iTOL_sp['Tree node label'] = df_cblaster['Organism']

'''Now we want to assign IDs to the previously defined gene groups and join annotations into these groups'''
gene_groups = {str(i+1).zfill(3)+"-"+str(len(group)).zfill(2): group for i, group in enumerate(gene_group_list)}
'''Add groups as column names to the iTOL df'''
for column_name in list(gene_groups):
    df_iTOL_sp[column_name] = ''

for i, group in gene_groups.items():
    df_iTOL_sp[i] = df_cblaster[list(group)].max(axis=1)
df_iTOL_sp = df_iTOL_sp.fillna(0)

df_iTOL_sp.to_csv(dir_iTOL+"shape_plot_data_iTOL.csv",index=False)

'''upload of this data in itol requires special header that to my knowladge cannot be made in one table, therefore sperate file is created:'''
df_iTOL_cg_header = pd.DataFrame(list(df_iTOL_sp.columns[2:]), columns=['Field labels'])
df_iTOL_cg_header[['Gene group', 'number of genes']] = df_iTOL_cg_header['Field labels'].str.split("-", expand = True)
df_iTOL_cg_header['number of genes'] = df_iTOL_cg_header['number of genes'].astype('int')
'''color coding is done below based on nr of genes in each group'''
hex_codes = []
for gene_number in df_iTOL_cg_header['number of genes']:
    color_strength = gene_number/max(df_iTOL_cg_header['number of genes'])
    R_color = 255 - int(204*color_strength)
    G_color = 255 - int(204*color_strength)
    B_color = 255 - int(204*color_strength)
    color_code = '#' + "".join(hex(val).split("0x")[1].zfill(2) for val in [R_color, G_color, B_color])
    hex_codes.append(color_code)

df_iTOL_cg_header['Field colors'] = hex_codes
df_iTOL_cg_header=df_iTOL_cg_header.drop(columns=['Gene group', 'number of genes']).transpose()

df_iTOL_cg_header.to_csv(dir_iTOL+"shape_plot_header_iTOL.csv",index=False)