In [2]:
import ast
import csv
import os

import numpy as np
import pandas as pd

In [46]:
genome = 'MED4'  # genome / strain of cyanobacteria whose genomic islands (GIs) are checked

# Look up the IMG_ID of the selected genome; it names the per-genome GFF directory and file below.
datapath99 = '../data/cycogs/cycogsgenomes.tsv'
genomes_df = pd.read_csv(datapath99, sep='\t')
matching_img_ids = genomes_df.loc[genomes_df['IID'] == genome, 'IMG_ID']
if matching_img_ids.empty:
    # Fail here with a clear message instead of a NameError further down, or
    # (worse) silently reusing a stale `filetarget` left over in the kernel.
    raise ValueError(f"Genome {genome!r} not found in the 'IID' column of {datapath99}")
filetarget = matching_img_ids.iloc[-1]  # if the IID is listed more than once, the last row wins

# establish datapaths
datapath = ('data/0/1-s2.0-S0092867422015197-mmc2.csv')  # genomic island coordinates
datapath2 = (f'../data/tool_test_data/genome-data-for-jonah/{filetarget}/{filetarget}.gff')  # depends on the genome being searched; location subject to change
datapath3 = ('../data/cycogs/ortholog-metadata.csv')  # matches GeneID to CyCOG number
datapath4 = ('data/0/pro2_genes.csv')  # alternative dataset for putative island genes

# create dataframes from datapaths
# (the GFF at datapath2 cannot be cleanly converted into a dataframe, so it is
# read line by line later in the notebook)
cycogs_df = pd.read_csv(datapath3)
pro2_genes_df = pd.read_csv(datapath4)
gi_locations_df = pd.read_csv(datapath)

# keep only the island coordinates that belong to the selected genome
genome_islands_df = gi_locations_df[gi_locations_df['genome_id'] == genome]
display(genome_islands_df)

Unnamed: 0,genome_id,contig_id,start,end
871,MED4,MED4_1,323821,368220
872,MED4,MED4_1,1306399,1367064
873,MED4,MED4_1,620091,628378
874,MED4,MED4_1,653240,702447
875,MED4,MED4_1,762769,774459
876,MED4,MED4_1,804368,826283
877,MED4,MED4_1,914516,973394
878,MED4,MED4_1,979511,984734
879,MED4,MED4_1,1060080,1087933
880,MED4,MED4_1,1152143,1214874


This notebook is derived from methods used in [Hackl et al., 2023](https://www.cell.com/cell/pdf/S0092-8674(22)01519-7.pdf). 

Predicted genomic island coordinates in cyanobacterial genomes: [File 2 of supplemental materials](https://ars.els-cdn.com/content/image/1-s2.0-S0092867422015197-mmc2.xlsx)

Using these predicted genomic island coordinates, we aim to cross-reference the genes that fall within them against another dataset of genes that are overexpressed under high exposure to cyanophages. A gene that appears in both datasets gains additional evidence of being a 'defense island' gene.

In [47]:
# Island boundaries for the selected genome, kept as lists of single-element
# lists ([[start], ...] and [[end], ...]).
start_points = genome_islands_df[['start']].values.tolist()
end_points = genome_islands_df[['end']].values.tolist()

# Convert the boundaries to ints once, instead of once per GFF line.
island_ranges = [(int(s[0]), int(e[0])) for s, e in zip(start_points, end_points)]

island_ids = []

# Scan the GFF line by line and collect the ID of every feature that lies
# entirely inside one of the islands.
with open(datapath2, 'r') as file:
    for line in file:
        if line.startswith('#'):
            continue  # header / directive line, not a feature row
        parts = line.strip().split()
        if len(parts) < 9:
            continue  # indices 3, 4 and 8 are read below, so shorter rows cannot be features
        start = int(parts[3])
        end = int(parts[4])

        # A feature counts only if it is fully contained in an island; each
        # feature is recorded at most once even if several islands contain it.
        if any(lo <= start and end <= hi for lo, hi in island_ranges):
            # take the value of the first key=value pair of the attributes field
            island_ids.append(parts[8].split('=')[1].split(';')[0])

# The IDs are literal text read from the file. Parse them as Python literals
# with ast.literal_eval rather than eval(), which would execute arbitrary
# expressions found in the file.
island_ids = [ast.literal_eval(i) for i in island_ids]
print("Done :)")

Done :)


In [48]:
# Map the island gene IDs to their ortholog (CyCOG) IDs for the selected genome.
# Boolean masks replace the row-by-row iterrows() loop and its repeated list
# membership test; row order and duplicate OrthologIDs are preserved.
in_genome = cycogs_df['GenomeName'] == genome
in_island = cycogs_df['GeneID'].isin(island_ids)
island_cycogs = cycogs_df.loc[in_genome & in_island, 'OrthologID'].tolist()
print(island_cycogs)

[60000001, 60000002, 60000009, 60000011, 60000015, 60000015, 60000015, 60000016, 60000016, 60000018, 60000024, 60000027, 60000028, 60000030, 60000030, 60000032, 60000034, 60000039, 60000040, 60000042, 60000045, 60000045, 60000046, 60000049, 60000050, 60000053, 60000057, 60000059, 60000059, 60000068, 60000075, 60000080, 60000081, 60000084, 60000085, 60000087, 60000088, 60000095, 60000100, 60000103, 60000104, 60000107, 60000110, 60000111, 60000112, 60000115, 60000119, 60000124, 60000125, 60000126, 60000127, 60000135, 60000136, 60000137, 60000144, 60000149, 60000150, 60000155, 60000159, 60000160, 60000161, 60000163, 60000166, 60000167, 60000180, 60000181, 60000184, 60000186, 60000187, 60000196, 60000203, 60000210, 60000216, 60000218, 60000221, 60000226, 60000231, 60000240, 60000241, 60000245, 60000248, 60000251, 60000254, 60000269, 60000273, 60000275, 60000279, 60000284, 60000285, 60000290, 60000301, 60000305, 60000313, 60000314, 60000315, 60000319, 60000320, 60000326, 60000329, 60000330,

In [49]:
# Orthologs from the pro2 dataset that are also found inside a genomic island
# of the selected genome, listed in ascending Ortholog order.
pro2_genes_df = pro2_genes_df.sort_values(by=['Ortholog'])
pro2_orthologs = pro2_genes_df['Ortholog']

# A set gives constant-time membership checks; island_cycogs is a list, so
# testing against it directly rescans the list for every ortholog.
island_cycog_set = set(island_cycogs)
semi_confirmed = [ortholog for ortholog in pro2_orthologs if ortholog in island_cycog_set]
print(semi_confirmed)

[60000015, 60000028, 60000030, 60000034, 60000046, 60000053, 60000112, 60000126, 60000218, 60000279, 60000459, 60000467, 60000653, 60000689, 60000845, 60000878, 60001113, 60001223, 60001269, 60001332, 60001342, 60001425, 60001593, 60001636, 60001942]
