In [1]:
import os
import numpy as np
import pandas as pd
import csv

In [15]:
genome = 'MIT0602' # select genome / strain of cyanobacteria to check GIs of

# establish datapaths
datapath99 = '../data/cycogs/cycogsgenomes.tsv'
genomes_df = pd.read_csv(datapath99, sep='\t')
for index, row in genomes_df.iterrows():
    if row['IID'] == genome:
        filetarget = row['IMG_ID']

datapath = ('data/0/1-s2.0-S0092867422015197-mmc2.csv') # genome island coordinates
datapath2 = (f'../data/img_data_cycog6/{filetarget}/{filetarget}.gff') # dependent on genome being searched, location subject to change
datapath3 = ('../data/cycogs/ortholog-metadata.csv') # matches geneID to cycog number
datapath4 = ('data/0/pro2_genes.csv') # alternative dataset for putative island genes

# create dataframes from datapaths
cycogs_df = pd.read_csv(datapath3)
pro2_genes_df = pd.read_csv(datapath4)
# datapath 2 has a file that can't be cleanly converted into a dataframe, so it is read in manually later in the notebook
gi_locations_df = pd.read_csv(datapath)
genome_islands_df = gi_locations_df[gi_locations_df['genome_id'] == f'{genome}']
display(genome_islands_df)

Unnamed: 0,genome_id,contig_id,start,end
4913,MIT0602,MIT0602_c,552796,669355
4914,MIT0602,MIT0602_c,822481,826987
4915,MIT0602,MIT0602_c,877973,918268
4916,MIT0602,MIT0602_c,1084346,1119789
4917,MIT0602,MIT0602_c,1143812,1169496
4918,MIT0602,MIT0602_c,1185359,1235232
4919,MIT0602,MIT0602_c,1336129,1452824


This notebook is derived from methods used in [Hackl et al., 2023](https://www.cell.com/cell/pdf/S0092-8674(22)01519-7.pdf). 

Predicted genomic island coordinates in cyanobacterial genomes: [File 2 of supplemental materials](https://ars.els-cdn.com/content/image/1-s2.0-S0092867422015197-mmc2.xlsx)

Using these hypothetical genomic island coordinates, we aim to cross-reference genes that fall within these coordinates with another dataset categorizing genes overexpressed in high exposure to cyanophages. If the genes are in both datasets, it provides additional evidence that these are 'defense island' genes.

In [16]:
with open(datapath2, 'r') as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) > 3: # skip first row
            start = int(parts[3])
            end = int(parts[4])
            print (start, end)

2 136
73 843
900 1781
2022 2180
2239 3264
3219 3575
3942 5012
5209 5340
5430 5609
5954 7786
7822 8910
8913 9869
9923 10732
10756 11130
11224 11808
11859 11996
12024 12410
12407 12808
12898 14040
14125 15603
15612 16889
16895 17977
18035 18337
18533 20701
20722 21636
21638 22534
22582 22812
22819 23571
23906 23979
24092 25156
25294 26715
26784 27806
27859 28812
28904 30187
30314 30829
30868 31806
31812 32843
32843 34156
34153 35160
35161 36150
36156 37928
37929 39068
39050 40444
40534 41943
41969 42796
42862 43629
43611 44708
44851 46053
46100 46963
47095 47550
47618 48322
48362 49057
49126 50175
50314 51285
51288 52241
52243 52827
52842 53780
54166 55083
55080 55997
55999 57039
57188 58399
58451 59419
59508 61463
61563 62504
62903 63037
63040 64065
64256 65737
65913 66950
66959 67600
67710 68624
68615 69349
69343 70095
70092 70235
70170 71897
72336 72408
73018 74175
74194 75822
75846 77732
77807 78415
78482 79441
79520 81379
81489 82580
82570 83100
83063 83329
83391 84695
84715 85623
8

In [3]:

start_points = genome_islands_df.start.to_list()
end_points = genome_islands_df.end.to_list()

island_ids = []

# Open and read the file line by line
with open(datapath2, 'r') as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) > 3: # skip first row
            start = int(parts[3])
            end = int(parts[4])
            
            # Check if the start and end fall within any of the specified ranges
            for start_point, end_point in zip(start_points, end_points):
                # print ((start, type(start), end_point, type(end_point)))
                if start_point <= start <= end_point or start_point <= end <= end_point:
                # if start >= start_point
                    island_ids.append(parts[8].split('=')[1].split(';')[0])
                    break  # Break the loop once a match is found

island_ids = [int(i) for i in island_ids]
print ("Done :)")
print (island_ids)

Done :)
[]


In [4]:
# get ortholog IDs of island genes

island_cycogs = cycogs_df[(cycogs_df.GenomeName == genome) & (cycogs_df.GeneID.isin(island_ids))].OrthologID.to_list()

print(island_cycogs)

[60000001, 60000002, 60000009, 60000011, 60000015, 60000015, 60000015, 60000016, 60000016, 60000018, 60000024, 60000025, 60000027, 60000028, 60000030, 60000030, 60000032, 60000034, 60000039, 60000040, 60000042, 60000045, 60000045, 60000046, 60000046, 60000046, 60000049, 60000050, 60000053, 60000057, 60000059, 60000059, 60000068, 60000075, 60000080, 60000081, 60000084, 60000085, 60000087, 60000088, 60000095, 60000100, 60000103, 60000104, 60000107, 60000110, 60000111, 60000112, 60000115, 60000119, 60000124, 60000125, 60000126, 60000127, 60000135, 60000136, 60000137, 60000142, 60000144, 60000149, 60000150, 60000155, 60000159, 60000160, 60000161, 60000163, 60000165, 60000166, 60000167, 60000180, 60000181, 60000184, 60000186, 60000187, 60000196, 60000203, 60000205, 60000210, 60000216, 60000218, 60000221, 60000226, 60000231, 60000240, 60000241, 60000245, 60000248, 60000251, 60000254, 60000261, 60000263, 60000269, 60000273, 60000275, 60000279, 60000284, 60000285, 60000290, 60000291, 60000301,

In [7]:
# look at copies of pro2 orthologs in highlighted genomic island(s)

semi_confirmed = []
pro2_genes_df = pro2_genes_df.sort_values(by=['Ortholog'])
pro2_orthologs = pro2_genes_df['Ortholog']
for ortholog in pro2_orthologs:
    if ortholog in island_cycogs:
        semi_confirmed.append(ortholog)
pro2_genes_df[pro2_genes_df.Ortholog.isin(semi_confirmed)].sort_values('Ortholog', ascending=True)

Unnamed: 0,Ortholog,MedianWeight,MeanWeight,BootstrapSupport,Annotation
96,60000015,0.001686,0.015031,0.52,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
53,60000028,0.055528,0.053814,0.913333,Protein of unknown function (DUF3764)
116,60000030,0.0,0.006368,0.366667,Protein of unknown function (DUF3303)
132,60000034,0.0,0.00615,0.256667,hypothetical protein
26,60000046,0.108792,0.10569,0.98,UDP-glucuronate 4-epimerase
156,60000053,0.0,0.007705,0.18,dolichol-phosphate mannosyltransferase
103,60000112,0.0,0.011971,0.453333,primary replicative DNA helicase
7,60000126,0.164572,0.156027,0.99,UDPglucose 6-dehydrogenase
217,60000218,0.0,0.000342,0.023333,hypothetical protein
136,60000279,0.0,0.001926,0.213333,tRNA (Guanine37-N(1)-) methyltransferase


In [8]:
# look at pro2 genes in the genome of interest

cluster_genes = pro2_genes_df[pro2_genes_df.MedianWeight.gt(0)].Ortholog.to_list()

genome_cluster_cycogs = cycogs_df[(cycogs_df.GenomeName == genome) & (cycogs_df.OrthologID.isin(cluster_genes))]

print(len(genome_cluster_cycogs.OrthologID.unique()))
genome_cluster_cycogs

32


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
16224,MED4_2606840673,60000015,MED4,2606840673,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16244,MED4_2606840729,60000015,MED4,2606840729,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16479,MED4_2606841530,60000015,MED4,2606841530,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
16644,MED4_2606840728,60000015,MED4,2606840728,"dTDP-4-amino-4,6-dideoxygalactose transaminase"
28335,MED4_2606840092,60000028,MED4,2606840092,Protein of unknown function (DUF3764)
28490,MED4_2606840904,60000028,MED4,2606840904,Protein of unknown function (DUF3764)
41151,MED4_2606840671,60000046,MED4,2606840671,UDP-glucuronate 4-epimerase
41443,MED4_2606840678,60000046,MED4,2606840678,UDP-glucuronate 4-epimerase
41555,MED4_2606840669,60000046,MED4,2606840669,UDP-glucuronate 4-epimerase
41578,MED4_2606840727,60000046,MED4,2606840727,UDP-glucuronate 4-epimerase
