The purpose of this notebook is to assess, for each CyCOG, what proportion of its genes fall within genomic islands, as defined by [Hackl et al. (2023)](https://doi.org/10.1016/j.cell.2022.12.006).


In [1]:
import os
import pandas as pd

In [2]:
# import data

genome_df = pd.read_csv('../data/metadata/genome-metadata.csv') # basic information on each genome
cycogs_df = pd.read_csv('../data/metadata/ortholog-metadata.csv') # matches geneID to cycog number
# table read from the hackl-2023 data folder; as previewed below, each row carries
# genome_id, contig_id, start and end columns
# NOTE(review): presumably one genomic island interval per row -- confirm against the paper's supplement
gi_locations_df = pd.read_excel('../data/hackl-2023/mmc2.xlsx')
# 'pro2-gene-weights' sheet of the Pro clusters workbook
# NOTE(review): presumably per-gene weights for a "pro2" cluster -- confirm against Data_S1
pro2_genes_df = pd.read_excel('../data/clusters/Data_S1-Pro-Clusters.xlsx', sheet_name='pro2-gene-weights')

# preview the table loaded from mmc2.xlsx
gi_locations_df

Unnamed: 0,genome_id,contig_id,start,end
0,AG-311-I09,AG-311-I09_c,278697,291980
1,AG-311-I09,AG-311-I09_c,357521,402907
2,AG-311-I09,AG-311-I09_c,658277,702729
3,AG-311-I09,AG-311-I09_c,765104,871496
4,AG-311-I09,AG-311-I09_c,1088878,1101236
...,...,...,...,...
5593,TMED223,TMED223_c,96553,152800
5594,TMED223,TMED223_c,1654905,1703279
5595,TMED223,TMED223_c,2045478,2080212
5596,TMED223,TMED223_c,2125181,2155920


In [6]:
# select only closed complete genomes to analyze first
#
# A genome is kept when it is flagged in both the CyCOGGenome and GenomicIslands
# columns and is recorded as a single contig in both NContigsCyCOG and NContigsHackl.

in_cycog_set = genome_df['CyCOGGenome']
has_island_calls = genome_df['GenomicIslands']
single_contig_cycog = genome_df['NContigsCyCOG'].eq(1)
single_contig_hackl = genome_df['NContigsHackl'].eq(1)

complete_genomes_df = genome_df.loc[
    in_cycog_set & has_island_calls & single_contig_cycog & single_contig_hackl
]
complete_genomes_df


Unnamed: 0,GenomeID,GenomeName,Type,CladeCyCOG,CompletenessCyCOG,CyCOGGenome,NContigsCyCOG,CladeHackl,CompletenessHackl,GenomicIslands,NContigsHackl
118,2623620345,MIT9515,ISOLATE,HLI,100.0,True,1,HLI,100.0,True,1
120,2606217259,MED4,ISOLATE,HLI,99.46,True,1,HLI,100.0,True,1
329,2681813573,MIT1314,ISOLATE,HLII,100.0,True,1,HLII,100.0,True,1
332,2606217688,MIT0604,ISOLATE,HLII,99.73,True,1,HLII,99.85,True,1
333,2606217559,MIT9215,ISOLATE,HLII,99.73,True,1,HLII,100.0,True,1
335,2606217708,MIT9312,ISOLATE,HLII,99.73,True,1,HLII,100.0,True,1
340,2623620959,AS9601,ISOLATE,HLII,99.64,True,1,HLII,99.89,True,1
343,2623620961,MIT9301,ISOLATE,HLII,99.46,True,1,HLII,100.0,True,1
470,2623620348,NATL1A,ISOLATE,LLI,99.73,True,1,LLI,100.0,True,1
471,2606217240,NATL2A,ISOLATE,LLI,99.45,True,1,LLI,100.0,True,1


In [12]:
# make a dataframe quantifying copies of CyCOGs within selected isolate genomes
#
# Result: one row per (genome, CyCOG) pair with columns
# GenomeID, GenomeName, CyCOGID, GeneCopies. Within each genome the rows keep the
# order produced by value_counts(), and genomes appear in complete_genomes_df order.

stats_columns = ['GenomeID', 'GenomeName', 'CyCOGID', 'GeneCopies']

# Collect one counts table per genome and concatenate once at the end, rather than
# re-copying the growing frame with pd.concat on every iteration.
per_genome_counts = []
for genome in complete_genomes_df[['GenomeID', 'GenomeName']].itertuples(index=False):
    genome_cycogs = cycogs_df.loc[cycogs_df['GenomeName'] == genome.GenomeName, 'OrthologID']
    cycog_counts_df = (
        genome_cycogs
        .value_counts()
        # Name the index and the counts column explicitly: the default column names
        # produced by value_counts().reset_index() differ between pandas 1.x and 2.x.
        .rename_axis('CyCOGID')
        .reset_index(name='GeneCopies')
        .assign(GenomeID=genome.GenomeID, GenomeName=genome.GenomeName)
    )
    # Genomes without any CyCOG assignments contribute no rows.
    if not cycog_counts_df.empty:
        per_genome_counts.append(cycog_counts_df)

# clean up compiled DataFrame
if per_genome_counts:
    gi_stats_df = pd.concat(per_genome_counts, ignore_index=True)[stats_columns]
else:
    # pd.concat raises on an empty list; keep the expected schema instead.
    gi_stats_df = pd.DataFrame(columns=stats_columns)
gi_stats_df


Unnamed: 0,GenomeID,GenomeName,CyCOGID,GeneCopies
0,2623620345,MIT9515,60000015,4
1,2623620345,MIT9515,60000027,4
2,2623620345,MIT9515,60001426,4
3,2623620345,MIT9515,60000252,4
4,2623620345,MIT9515,60000059,4
...,...,...,...,...
50019,2681813568,MIT1223,60000632,1
50020,2681813568,MIT1223,60000631,1
50021,2681813568,MIT1223,60000630,1
50022,2681813568,MIT1223,60000629,1


In [27]:
# preview the table read from ortholog-metadata.csv (the geneID -> CyCOG mapping)
cycogs_df

Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit
...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein


In [30]:
# parse gff file
#
# Reads the annotation file stored under genome ID 2623620345 (the first genome of
# the filtered table above) into the nine standard GFF columns, then adds an
# integer GeneID column taken from the "ID=" attribute.

gff_df = pd.read_csv('../data/img_data_cycog6/2623620345/2623620345.gff', sep='\t', skiprows=1, header=None, 
                     names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])

# Pull the numeric ID out of the attributes string by key rather than by fixed
# character positions, so IDs of any length (and an ID= that is not the first
# attribute) are parsed correctly instead of being silently truncated.
gene_ids = gff_df['attributes'].str.extract(r'(?:^|;)ID=(\d+)', expand=False)
if gene_ids.isna().any():
    raise ValueError(
        f'{int(gene_ids.isna().sum())} GFF row(s) have no numeric ID= attribute'
    )
# Explicit int64: these IDs exceed the 32-bit range that plain `int` maps to on some platforms.
gff_df['GeneID'] = gene_ids.astype('int64')

gff_df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,GeneID
0,Ga0067215_11,img_core_v400,CDS,81,1472,.,+,0,ID=2624147302;locus_tag=Ga0067215_111;product=...,2624147302
1,Ga0067215_11,img_core_v400,CDS,1467,2705,.,-,0,ID=2624147303;locus_tag=Ga0067215_112;product=...,2624147303
2,Ga0067215_11,img_core_v400,CDS,2758,4122,.,+,0,ID=2624147304;locus_tag=Ga0067215_113;product=...,2624147304
3,Ga0067215_11,img_core_v400,CDS,4128,5207,.,-,0,ID=2624147305;locus_tag=Ga0067215_114;product=...,2624147305
4,Ga0067215_11,img_core_v400,CDS,5315,6364,.,+,0,ID=2624147306;locus_tag=Ga0067215_115;product=...,2624147306
...,...,...,...,...,...,...,...,...,...,...
1944,Ga0067215_11,img_core_v400,CDS,1699778,1700416,.,+,0,ID=2624149246;locus_tag=Ga0067215_111947;produ...,2624149246
1945,Ga0067215_11,img_core_v400,CDS,1700430,1702226,.,+,0,ID=2624149247;locus_tag=Ga0067215_111948;produ...,2624149247
1946,Ga0067215_11,img_core_v400,CDS,1702226,1702756,.,+,0,ID=2624149248;locus_tag=Ga0067215_111949;produ...,2624149248
1947,Ga0067215_11,img_core_v400,CDS,1702753,1703442,.,-,0,ID=2624149249;locus_tag=Ga0067215_111950;produ...,2624149249


In [38]:
# count the distinct genome_id values present in the table loaded from mmc2.xlsx
gi_locations_df['genome_id'].nunique()

623