This notebook uses the product of continuous_filtering.ipynb (which is now in this notebook for ease of access) to create CSVs and dataframes used for plotting in plotting_orth.ipynb.

In [15]:
import os
import numpy as np
import pandas as pd
import csv
from collections import defaultdict

In [2]:
datapath99 = ('../data/cycogs/cycogsgenomes.tsv') # basic information on each genome
datapath = ('data/0/1-s2.0-S0092867422015197-mmc2.csv') # genome island coordinates
datapath3 = ('../data/cycogs/ortholog-metadata.csv') # matches geneID to cycog number
datapath4 = ('data/0/pro2_genes.csv') # alternative dataset for putative island genes

# create dataframes from datapaths
cycogs_df = pd.read_csv(datapath3)
pro2_genes_df = pd.read_csv(datapath4)
genomes_df = pd.read_csv(datapath99, sep='\t')
gi_locations_df = pd.read_csv(datapath)

Make "dataframe 1" and friends. Concerns presence in the entire genome. If you save these dataframes to a CSV, make sure index is set to 'True'.

In [3]:
# make a dataframe quantifying copies of pro2 genes within isolate genomes

genome_ortholog_dict = {}
isolates_df = genomes_df[genomes_df.TYPE == 'ISOLATE']
isolates_list = isolates_df.IID.tolist()
cluster_genes = pro2_genes_df[pro2_genes_df.MedianWeight.gt(0)].Ortholog.to_list()

for genome in isolates_list:
    genome_cycogs = cycogs_df[(cycogs_df.GenomeName == genome)]
    ortholog_counts = genome_cycogs['OrthologID'].value_counts().to_dict()
    genome_ortholog_dict[genome] = ortholog_counts

# Step 2: Convert the dictionary of dictionaries to a DataFrame
df1 = pd.DataFrame.from_dict(genome_ortholog_dict, orient='index').fillna(0)
df1 = df1.transpose()
# The DataFrame 'df' now has orthologIDs as the index, genomes as columns, and the counts as the data points
df2 = df1[df1.index.isin(cluster_genes)]
display (df2)

Unnamed: 0,AS9601,EQPAC1,GP2,LG,MED4,MIT0601,MIT0602,MIT0603,MIT0604,MIT0701,...,SS120-1,P-SSP6,KBS-S-2A,S-CBS4,PSS2,S-CBS1,S-CBS3,P-RSP2,MED4-117,MED4-184
60000028,3.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60000046,3.0,4.0,1.0,1.0,4.0,2.0,2.0,2.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001223,2.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60000878,2.0,3.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001376,2.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,3.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60002595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60005023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# this dataframe turns all nonzero values into 1, indicating only the presence or absence of a gene in a genome/island
coreness_df1 = df10.where(df10 == 0, 1)
display (coreness_df1)

Unnamed: 0,AS9601,EQPAC1,GP2,LG,MED4,MIT0601,MIT0603,MIT0604,MIT0701,MIT0702,...,MIT9515,NATL1A,NATL2A,PAC1,SB,SS120,SS2,SS35,SS51,SS52
60000028,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60000046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60001223,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
60000878,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60001376,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60002595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
60005023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
60003952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# single_contigs was produced from continuous_filtering.ipynb, and was downscaled from the 'matched' list to isolate continuous genomes
single_contigs = ['MIT1223', 'NATL1A', 'MED4', 'MIT1214', 'MIT0913', 'MIT0915', 'MIT9215', 'MIT0801', 'MIT9312', 'MIT1314', 'MIT1341', 'MIT0917', 'MIT1307', 'MIT0604', 'MIT9303', 'MIT9515', 'MIT1013', 'MIT0919', 'MIT0918', 'MIT9313', 'NATL2A', 'MIT9211', 'MIT1300', 'MIT0912', 'MIT9301', 'SS120', 'AS9601']
contig_df1 = df2[df2.columns[df2.columns.isin(single_contigs)]]
contig_df1.to_csv('../data/contigs1.csv', index=True)
contig_df1

Unnamed: 0,AS9601,MED4,MIT0604,MIT0801,MIT0912,MIT0913,MIT0915,MIT0917,MIT0918,MIT0919,...,MIT9211,MIT9215,MIT9301,MIT9303,MIT9312,MIT9313,MIT9515,NATL1A,NATL2A,SS120
60000028,3.0,2.0,2.0,4.0,5.0,5.0,4.0,4.0,1.0,1.0,...,1.0,3.0,3.0,2.0,1.0,1.0,1.0,6.0,3.0,1.0
60000046,3.0,4.0,3.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,...,3.0,4.0,2.0,3.0,1.0,2.0,3.0,2.0,2.0,1.0
60001223,2.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,2.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
60000878,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60001376,2.0,0.0,3.0,1.0,2.0,3.0,1.0,0.0,0.0,5.0,...,3.0,1.0,1.0,13.0,0.0,5.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60002595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
60003326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60005023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# this dataframe is also made in plotting_orth.ipynb
transposed_contig_df = contig_df1.transpose()
transposed_contig_df

Unnamed: 0,60000028,60000046,60001223,60000878,60001376,60001259,60001471,60001893,60000015,60001176,...,60005558,60004359,60003810,60007386,60002868,60002595,60003326,60005023,60003952,60008290
AS9601,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MED4,2.0,4.0,1.0,3.0,0.0,1.0,2.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0604,2.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0801,4.0,3.0,1.0,1.0,1.0,22.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0912,5.0,1.0,1.0,1.0,2.0,5.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0913,5.0,1.0,1.0,1.0,3.0,4.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0915,4.0,2.0,5.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0917,4.0,2.0,1.0,1.0,0.0,45.0,0.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0918,1.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0919,1.0,2.0,1.0,1.0,5.0,1.0,0.0,0.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Indicate presence/absence of genes in the continuous isolates.
cont_core_df1 = contig_df1.where(contig_df1 == 0, 1)
display (cont_core_df1)

Unnamed: 0,AS9601,MED4,MIT0604,MIT0801,MIT0912,MIT0913,MIT0915,MIT0917,MIT0918,MIT0919,...,MIT9211,MIT9215,MIT9301,MIT9303,MIT9312,MIT9313,MIT9515,NATL1A,NATL2A,SS120
60000028,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60000046,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60001223,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60000878,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60001376,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60002595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
60003326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60005023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Make "dataframe 2" and friends. Dataframe 2 concerns only the genes within genomic islands, which are identified for each genome in the next code block. The "friends" are made in the same way as with dataframe 1, filtering down from the initial large dataframe.

In [8]:
# make a dictionary with any geneIDs within predicted genomic islands of the isolate genomes

REFS = '../data/img_data_cycog6/'
genome_island_dict = {}
isolates_list = isolates_df.IMG_ID.tolist()
# Loop through each genome in the directory
for genome in os.listdir(REFS):
    # Skip non-directory entries if any
    if not os.path.isdir(os.path.join(REFS, genome)):
        continue
        # Path to the GFF file for the current genome
    gff_path = os.path.join(REFS, genome, f'{genome}.gff')
    
    genome = int(genome)
    if genome in isolates_list:

        # Change genome variable to be compatible with gi_locations_df
        genome = genomes_df[genomes_df.IMG_ID == genome].IID
        genome = genome.tail(1).item()
        # print (type(genome))
        
        # Filter the dataframe for the current genome
        genome_islands_df = gi_locations_df[gi_locations_df['genome_id'] == genome]
        start_points = genome_islands_df.start.to_list()
        end_points = genome_islands_df.end.to_list()
    
        # Initialize the island_ids list for the current genome
        island_ids = []
    
        # Open and read the GFF file line by line
        with open(gff_path, 'r') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) > 3:  # Ensure there are enough parts to parse start and end
                    try:
                        start = int(parts[3])
                        end = int(parts[4])
                    except ValueError:
                        continue  # Skip lines that do not have valid integers in the expected columns
                    
                    # Check if the start and end fall within any of the specified ranges
                    for start_point, end_point in zip(start_points, end_points):
                        if start_point <= start <= end_point or start_point <= end <= end_point:
                            try:
                                island_id = int(parts[8].split('=')[1].split(';')[0])
                                island_ids.append(island_id)
                            except (IndexError, ValueError):
                                continue  # Skip lines with parsing issues in the 9th column
                            break  # Break the loop once a match is found
    
        # Store the island_ids list in the dictionary
        genome_island_dict[genome] = island_ids

print ("Genomes searched:", len(genome_island_dict))

non_empty_count = sum(1 for value in genome_island_dict.values() if len(value) > 0)
print(f"Non-empty genomic islands: {non_empty_count}")

# Optionally, print the dictionary to check the results
# for genome, ids in genome_island_dict.items():
#     print(f'Genome: {genome}, Island IDs: {ids}')

Genomes searched: 173
Non-empty genomic islands: 63


In [9]:
# make a dataframe quantifying pro2 gene copies within predicted genomic islands of isolate genomes

# Initialize the dictionary to store the results
genome_ortholog_dict = {}

# Iterate through genome_island_dict
for genome, gene_ids in genome_island_dict.items():
    ortholog_counts = defaultdict(int)  # Initialize a dictionary to count ortholog IDs
    
    # Iterate through each gene ID for the current genome
    for gene_id in gene_ids:
        # Filter cycogs_df for rows where GeneID matches gene_id
        matching_rows = cycogs_df[cycogs_df.GeneID == gene_id]
        
        # Count occurrences of each OrthologID in the filtered rows
        for ortholog_id in matching_rows.OrthologID:
            ortholog_counts[ortholog_id] += 1
    
    # Store the ortholog_counts dictionary in genome_ortholog_dict under the current genome key
    genome_ortholog_dict[genome] = dict(ortholog_counts)

# Print or use genome_ortholog_dict as needed
# print(genome_ortholog_dict)

df3 = pd.DataFrame.from_dict(genome_ortholog_dict, orient='index').fillna(0)
df3 = df3.transpose()
# The DataFrame 'df' now has genomes as the index, orthologIDs as columns, and the counts as the data points

df4 = df3[df3.index.isin(cluster_genes)]
display(df4)

Unnamed: 0,MIT1205,MIT0915,MIT0801,MIT1013,MIT0918,NATL2A,MIT0912,SS120,MIT9313,MED4,...,MIT9302,MIT9322,MIT0701,MIT9107,MIT0603,MIT9311,MIT9401,GP2,MIT9314,MIT9321
60001176,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001027,2.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001499,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60000015,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001490,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60001512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
coreness_df2 = df4.where(df4 == 0, 1)
display (coreness_df2)

Unnamed: 0,MIT1205,MIT0915,MIT0801,MIT1013,MIT0918,NATL2A,MIT0912,SS120,MIT9313,MED4,...,MIT9302,MIT9322,MIT0701,MIT9107,MIT0603,MIT9311,MIT9401,GP2,MIT9314,MIT9321
60001176,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001027,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001499,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60000015,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001490,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60001512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
contig_df2 = df4[df4.columns[df4.columns.isin(single_contigs)]]
contig_df2.to_csv('../data/contigs2.csv', index=True)
contig_df2

Unnamed: 0,MIT0915,MIT0801,MIT1013,MIT0918,NATL2A,MIT0912,SS120,MIT9313,MED4,MIT1214,...,MIT9303,MIT1223,MIT9215,MIT9312,MIT1314,MIT1341,MIT9515,MIT1300,AS9601,MIT9301
60001176,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60001027,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
60001499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
60000015,0.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,3.0,2.0,...,0.0,4.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
60001490,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60001512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# this dataframe is also constructed in the plotting_orth.ipynb
transposed_contig_df2 = contig_df2.transpose()
transposed_contig_df2

Unnamed: 0,60001176,60001027,60001499,60000015,60001490,60001518,60000046,60001269,60001223,60001229,...,60003917,60003326,60004148,60004028,60002590,60001512,60002260,60002335,60002070,60003952
MIT0915,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0801,1.0,1.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT1013,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0918,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NATL2A,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT0912,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SS120,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT9313,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MED4,0.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIT1214,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
cont_core_df2 = contig_df2.where(contig_df2 == 0, 1)
display (cont_core_df2)

Unnamed: 0,MIT0915,MIT0801,MIT1013,MIT0918,NATL2A,MIT0912,SS120,MIT9313,MED4,MIT1214,...,MIT9303,MIT1223,MIT9215,MIT9312,MIT1314,MIT1341,MIT9515,MIT1300,AS9601,MIT9301
60001176,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60001027,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
60001499,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
60000015,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
60001490,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60001512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# average and combine two dataframes for plotting
# this block is used to create CSVs that can be easily plotted with in the plotting_orth.ipynb

# average_overlaps represents an iteration of dataframe 1
average_overlaps = cont_core_df1.mean(axis=1)
average_overlaps = average_overlaps.sort_index()

# average_overlaps2 represents an iteration of dataframe 2
average_overlaps2 = cont_core_df2.mean(axis=1)
average_overlaps2 = average_overlaps2.sort_index()
average_overlaps2 = average_overlaps2.reindex(average_overlaps.index)

combined_df = pd.DataFrame({
    'OrthologID': average_overlaps.index,
    'Genome_copies': average_overlaps.values,
    'Island_copies': average_overlaps2.values
})
display (combined_df)
combined_df.to_csv('../data/complete_coreness.csv', index=True)

Unnamed: 0,OrthologID,Genome_copies,Island_copies
0,60000015,1.000000,0.592593
1,60000028,1.000000,0.407407
2,60000046,1.000000,0.407407
3,60000126,1.000000,0.444444
4,60000600,1.000000,0.222222
...,...,...,...
87,60005258,0.111111,
88,60005558,0.037037,
89,60007386,0.000000,0.000000
90,60008213,0.037037,0.037037
