This notebook identifies the continuous (single-contig) genomes among the isolates we are working with. For each genome of interest, it reads the genome's assembly FASTA file and counts the '>' header markers, each of which introduces a new contig, i.e. a contiguous stretch of assembled DNA sequence belonging to the genome. A continuous genome has exactly one contig.
The 'matched' list names all of the genomes previously determined to be in both data sources used in this analysis. It may be subject to change.

In [1]:
import os
import numpy as np
import pandas as pd
import csv

In [2]:
# Directory that is scanned for one sub-directory per genome
REFS = '../data/img_data_cycog6/'

# Isolates previously determined to be present in both data sources
matched = [
    'AS9601', 'EQPAC1', 'GP2', 'LG', 'MED4', 'MIT0601', 'MIT0603',
    'MIT0604', 'MIT0701', 'MIT0702', 'MIT0703', 'MIT0801', 'MIT0912',
    'MIT0913', 'MIT0915', 'MIT0917', 'MIT0918', 'MIT0919', 'MIT1013',
    'MIT1205', 'MIT1214', 'MIT1223', 'MIT1300', 'MIT1303', 'MIT1304',
    'MIT1306', 'MIT1307', 'MIT1312', 'MIT1313', 'MIT1314', 'MIT1318',
    'MIT1320', 'MIT1323', 'MIT1327', 'MIT1341', 'MIT1342', 'MIT1418',
    'MIT9107', 'MIT9116', 'MIT9123', 'MIT9202', 'MIT9211', 'MIT9215',
    'MIT9301', 'MIT9302', 'MIT9303', 'MIT9311', 'MIT9312', 'MIT9313',
    'MIT9314', 'MIT9321', 'MIT9322', 'MIT9401', 'MIT9515', 'NATL1A',
    'NATL2A', 'PAC1', 'SB', 'SS120', 'SS2', 'SS35', 'SS51', 'SS52',
]

# Tab-separated table with basic information on each genome
datapath99 = '../data/cycogs/cycogsgenomes.tsv'
genomes_df = pd.read_csv(datapath99, sep='\t')

# IMG IDs of the matched isolates, first as stored in the table and then as
# strings so they can be compared against directory names
matched_IMGs = genomes_df.loc[genomes_df['IID'].isin(matched), 'IMG_ID'].tolist()
matched_IMGs_str = list(map(str, matched_IMGs))


In [3]:
# make a dataframe matching genome ID to contig count for the matched isolates

def count_fasta_records(fasta_path):
    """Return the number of sequence records (contigs) in a FASTA file.

    A record is counted for every line that begins with '>'. Counting header
    lines, rather than every '>' character in the file, keeps a header whose
    description itself contains '>' (for example '->') from being counted
    more than once. The file is streamed line by line so a large assembly is
    never held in memory as a single string.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA (.fna) file to scan.

    Returns
    -------
    int
        Number of lines in the file that start with '>'.
    """
    with open(fasta_path, 'r') as fna_file:
        return sum(1 for line in fna_file if line.startswith('>'))


# '.fna' files with these endings are excluded from the count
EXCLUDED_FNA_SUFFIXES = ('genes.fna', 'intergenic.fna')

# Set membership is constant-time; the directory names are compared as strings
matched_dirs = set(matched_IMGs_str)

data = []

# Iterate over the directories within REFS
for genome in os.listdir(REFS):
    genome_path = os.path.join(REFS, genome)

    # Skip non-directory entries and genomes that are not in the matched set
    if not os.path.isdir(genome_path) or genome not in matched_dirs:
        continue

    # Iterate over files within each genome directory
    for file_name in os.listdir(genome_path):
        # Keep '.fna' files that do not end with 'genes.fna' or 'intergenic.fna'
        if file_name.endswith('.fna') and not file_name.endswith(EXCLUDED_FNA_SUFFIXES):
            file_path = os.path.join(genome_path, file_name)
            # One row per qualifying file: [genome directory name, record count]
            data.append([genome, count_fasta_records(file_path)])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Genome', 'Count'])
display(df)

Unnamed: 0,Genome,Count
0,2681813566,45
1,2606217692,13
2,2681813568,1
3,2623620348,1
4,2623620984,6
...,...,...
58,2623620959,1
59,2606217419,20
60,2606217685,14
61,2681813571,8


In [4]:
# converts genome ID to integers, necessary for further analysis
# int() replaces eval(): the IDs come from directory names on disk, and eval()
# would execute any expression found there. int() also accepts values that are
# already integers, so this cell can be re-run after df.Genome has been cast.
imgs = [int(i) for i in df.Genome.tolist()]

In [6]:
# adds the genome name to the dataframe
#
# The genome IDs in df come from directory names (strings), whereas IMG_ID in
# genomes_df is compared here as an integer. Casting first means the lookup
# and the merge see matching types, so a single run of this cell is enough;
# the cast is a no-op when the column is already integer, so re-running is safe.
df['Genome'] = df['Genome'].astype(int)
imgs = df.Genome.tolist()

# Rows of genomes_df whose IMG_ID has a contig count, reduced to name and ID
result_df = (
    genomes_df.loc[genomes_df['IMG_ID'].isin(imgs), ['IID', 'IMG_ID']]
    .reset_index(drop=True)
)

# Merge result_df with df; the inner join keeps only IDs present in both
merged_df = df.merge(result_df, left_on='Genome', right_on='IMG_ID')

# 'Genome' duplicates 'IMG_ID' after the merge, so drop it and fix column order
merged_df = merged_df.drop(['Genome'], axis=1)[['IID', 'IMG_ID', 'Count']]
merged_df

Unnamed: 0,IID,IMG_ID,Count
0,MIT1205,2681813566,45
1,MIT9107,2606217692,13
2,MIT1223,2681813568,1
3,NATL1A,2623620348,1
4,MIT9202,2623620984,6
...,...,...,...
58,AS9601,2623620959,1
59,PAC1,2606217419,20
60,LG,2606217685,14
61,MIT1304,2681813571,8


In [8]:
# Names (IID) of the genomes whose contig count is exactly 1. The printed list
# is meant to be copied to ortholog_census.ipynb to isolate continuous genomes.
single_contig_df = merged_df.loc[merged_df['Count'].eq(1)]
single_contigs = single_contig_df['IID'].to_list()
print(single_contigs)

['MIT1223', 'NATL1A', 'MED4', 'MIT1214', 'MIT0913', 'MIT0915', 'MIT9215', 'MIT0801', 'MIT9312', 'MIT1314', 'MIT1341', 'MIT0917', 'MIT1307', 'MIT0604', 'MIT9303', 'MIT9515', 'MIT1013', 'MIT0919', 'MIT0918', 'MIT9313', 'NATL2A', 'MIT9211', 'MIT1300', 'MIT0912', 'MIT9301', 'SS120', 'AS9601']
