In [1]:
import os
import numpy as np
import pandas as pd
import csv

In [10]:
genome = 'EQPAC1' # select genome / strain of cyanobacteria to check GIs of

# establish datapaths
datapath99 = '../data/cycogs/cycogsgenomes.tsv' # table with one row per genome ('IID' and 'IMG_ID' columns are used here)
genomes_df = pd.read_csv(datapath99, sep='\t')

# Look up the IMG_ID of the selected genome with a vectorised mask instead of a
# row-by-row scan. Fail loudly when the genome is missing: otherwise `filetarget`
# would be undefined (or, in a long-lived kernel, silently keep the value of a
# previously selected genome).
genome_matches = genomes_df.loc[genomes_df['IID'] == genome, 'IMG_ID']
if genome_matches.empty:
    raise ValueError(f"Genome {genome!r} not found in the 'IID' column of {datapath99}")
filetarget = genome_matches.iloc[-1] # the last matching row wins, as in the original scan

datapath = 'data/0/1-s2.0-S0092867422015197-mmc2.csv' # genome island coordinates
datapath2 = f'../data/img_data_cycog6/{filetarget}/{filetarget}.gff' # dependent on genome being searched, location subject to change
datapath3 = '../data/cycogs/ortholog-metadata.csv' # matches geneID to cycog number
datapath4 = 'data/0/pro2_genes.csv' # alternative dataset for putative island genes

# create dataframes from datapaths
cycogs_df = pd.read_csv(datapath3)
pro2_genes_df = pd.read_csv(datapath4)
# datapath 2 has a file that can't be cleanly converted into a dataframe, so it is read in manually later in the notebook
gi_locations_df = pd.read_csv(datapath)

# keep only the island coordinates that belong to the selected genome
genome_islands_df = gi_locations_df[gi_locations_df['genome_id'] == genome]
display(genome_islands_df)

Unnamed: 0,genome_id,contig_id,start,end
861,EQPAC1,EQPAC1_c,324011,368141
862,EQPAC1,EQPAC1_c,1306421,1367066
863,EQPAC1,EQPAC1_c,620007,628294
864,EQPAC1,EQPAC1_c,653156,702361
865,EQPAC1,EQPAC1_c,762683,774491
866,EQPAC1,EQPAC1_c,804400,826315
867,EQPAC1,EQPAC1_c,914549,973394
868,EQPAC1,EQPAC1_c,979511,984734
869,EQPAC1,EQPAC1_c,1060081,1087936
870,EQPAC1,EQPAC1_c,1152146,1214874


In [None]:
# down-select the cluster genes to only those with median weight greater than the threshold

threshold = 0.0

# The original line assigned the frame to itself, so no filtering ever happened.
# Apply the filter described above. Rows whose MedianWeight is missing are dropped
# as well, because a NaN comparison evaluates to False.
# NOTE: after lowering `threshold`, re-run the data-loading cell first to restore
# the rows that an earlier, stricter run removed.
pro2_genes_df = pro2_genes_df[pro2_genes_df['MedianWeight'] > threshold]

This notebook is derived from methods used in [Hackl et al., 2023](https://www.cell.com/cell/pdf/S0092-8674(22)01519-7.pdf). 

Predicted genomic island coordinates in cyanobacterial genomes: [File 2 of supplemental materials](https://ars.els-cdn.com/content/image/1-s2.0-S0092867422015197-mmc2.xlsx)

Using these predicted genomic island coordinates, we aim to cross-reference the genes that fall within them with another dataset of genes that are overexpressed under high exposure to cyanophages. If a gene appears in both datasets, this provides additional evidence that it is a 'defense island' gene.

In [11]:

# Collect the IDs of all genes in the GFF file that overlap one of the genomic
# islands of the selected genome.
start_points = genome_islands_df.start.to_list()
end_points = genome_islands_df.end.to_list()
island_ranges = list(zip(start_points, end_points))

island_ids = []

# Open and read the file line by line
with open(datapath2, 'r') as file:
    for line in file:
        # skip GFF header / comment lines
        if line.startswith('#'):
            continue

        parts = line.strip().split()
        # Fields 3, 4 and 8 are read below, so a usable record needs at least nine
        # whitespace-separated fields; shorter lines are skipped instead of
        # raising an IndexError.
        if len(parts) < 9:
            continue

        # Normalise so the overlap test also works if start/end are swapped.
        gene_start, gene_end = sorted((int(parts[3]), int(parts[4])))

        # Two closed intervals overlap when each one starts before the other ends.
        # Unlike checking only whether an endpoint lies inside the island, this
        # also catches a gene that spans an entire island.
        overlaps_island = any(
            gene_start <= island_end and gene_end >= island_start
            for island_start, island_end in island_ranges
        )
        if overlaps_island:
            # The value of the first attribute in field 9 is taken as the gene ID
            # (presumably `ID=<number>;...` -- TODO confirm against the GFF files).
            island_ids.append(int(parts[8].split('=')[1].split(';')[0]))

print ("Done :)")

Done :)


In [12]:
# Translate the island gene IDs into the ortholog IDs they belong to,
# restricted to rows of the selected genome.
from_selected_genome = cycogs_df.GenomeName == genome
is_island_gene = cycogs_df.GeneID.isin(island_ids)
island_cycogs = cycogs_df.loc[from_selected_genome & is_island_gene, 'OrthologID'].to_list()

print(island_cycogs)

[60000005, 60000012, 60000013, 60000014, 60000022, 60000029, 60000041, 60000043, 60000058, 60000067, 60000077, 60000133, 60000170, 60000185, 60000193, 60000194, 60000208, 60000213, 60000215, 60000220, 60000228, 60000243, 60000264, 60000268, 60000312, 60000392, 60000400, 60000426, 60000453, 60000460, 60000472, 60000478, 60000487, 60000488, 60000531, 60000542, 60000555, 60000558, 60000622, 60000640, 60000650, 60000668, 60000675, 60000791, 60000821, 60000850, 60000905, 60000947, 60000971, 60001206, 60001333, 60001339, 60001542, 60002384]


In [13]:
# Orthologs of the pro2 cluster that were also found among the island orthologs,
# listed in ascending ortholog order.
pro2_genes_df = pro2_genes_df.sort_values(by=['Ortholog'])
pro2_orthologs = pro2_genes_df['Ortholog']
semi_confirmed = [candidate for candidate in pro2_orthologs if candidate in island_cycogs]
print (semi_confirmed)

[]


In [14]:
# Show the pro2 rows whose ortholog was also found in a genomic island,
# ordered by ortholog ID.
found_in_island = pro2_genes_df.Ortholog.isin(semi_confirmed)
pro2_genes_df[found_in_island].sort_values(by='Ortholog')

Unnamed: 0,Ortholog,MedianWeight,MeanWeight,BootstrapSupport,Annotation


In [15]:
# pro2 cluster orthologs (positive median weight) that occur in the selected genome

has_positive_weight = pro2_genes_df.MedianWeight.gt(0)
cluster_genes = pro2_genes_df[has_positive_weight].Ortholog.to_list()

in_selected_genome = cycogs_df.GenomeName == genome
in_cluster = cycogs_df.OrthologID.isin(cluster_genes)
genome_cluster_cycogs = cycogs_df[in_selected_genome & in_cluster]

# number of distinct cluster orthologs present in this genome, then the matching rows
print(len(genome_cluster_cycogs.OrthologID.unique()))
genome_cluster_cycogs

32


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
16182,EQPAC1_2608230155,60000015,EQPAC1,2608230155,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16263,EQPAC1_2608230211,60000015,EQPAC1,2608230211,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16493,EQPAC1_2608230154,60000015,EQPAC1,2608230154,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16714,EQPAC1_2608231287,60000015,EQPAC1,2608231287,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
28477,EQPAC1_2608230783,60000028,EQPAC1,2608230783,Protein of unknown function (DUF3764)
28618,EQPAC1_2608230322,60000028,EQPAC1,2608230322,Protein of unknown function (DUF3764)
41038,EQPAC1_2608230213,60000046,EQPAC1,2608230213,UDP-glucuronate 4-epimerase
41064,EQPAC1_2608230215,60000046,EQPAC1,2608230215,UDP-glucuronate 4-epimerase
41505,EQPAC1_2608230206,60000046,EQPAC1,2608230206,UDP-glucuronate 4-epimerase
41701,EQPAC1_2608230156,60000046,EQPAC1,2608230156,UDP-glucuronate 4-epimerase


From here, the code is intended to work with the number of hits each "cluster cycog" has in each genome in our dataset. The cluster cycogs are the genes or orthologs that have been overexpressed in regions of high cyanophage abundance.

In [None]:
# Count, for every genome in the dataset, how many of its genes map to each
# cluster ortholog (pro2 orthologs with a positive median weight).
genome_ortholog_dict = {}
genome_list = genomes_df.IID.tolist()
cluster_genes = pro2_genes_df[pro2_genes_df.MedianWeight.gt(0)].Ortholog.to_list()

# The ortholog filter does not depend on the genome, so apply it once up front.
cluster_hits_df = cycogs_df[cycogs_df.OrthologID.isin(cluster_genes)]

# Use dedicated loop names: the original loop reused `genome` and
# `genome_cluster_cycogs`, silently overwriting the genome selected at the top of
# the notebook and the table built for it.
for genome_name in genome_list:
    hits_in_genome = cluster_hits_df[cluster_hits_df.GenomeName == genome_name]
    ortholog_counts = hits_in_genome['OrthologID'].value_counts().to_dict()
    genome_ortholog_dict[genome_name] = ortholog_counts

# Convert the dictionary of dictionaries to a DataFrame; combinations that never
# occur are filled with 0.
df1 = pd.DataFrame.from_dict(genome_ortholog_dict, orient='index').fillna(0)
df1 = df1.transpose()
# After the transpose, 'df1' has ortholog IDs as the index, genomes as columns, and the counts as the data points
display(df1)

df1.to_csv('../data/out.csv', index=True)