In [1]:
#--------Note: "Some file paths are hardcoded. Update the paths to run seamlessly"

In [2]:
#-----------------Prediction of Operon for Escherichia coli K12------------------#

In [3]:
import pandas as pd

def parse_ptt_file(ptt_file_path, header_lines=3):
    """Read gene coordinates and strands from a tab-separated .ptt table.

    Args:
        ptt_file_path: Path of the table to read.
        header_lines: Number of leading lines to discard before the gene rows
            begin. Defaults to 3, the value this notebook has always used.

    Returns:
        A list of ``(start, end, strand)`` tuples in file order. ``start`` and
        ``end`` are ints taken from the ``start..end`` text of the first
        column; ``strand`` is the raw text of the second column.

    Raises:
        ValueError: If the first column of a non-blank row is not ``int..int``.
        IndexError: If a non-blank row has no second column.
    """
    genes = []
    with open(ptt_file_path, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line_number, line in enumerate(file):
            if line_number < header_lines:
                continue
            row = line.strip()
            # A blank row (e.g. a trailing newline at the end of the file)
            # carries no gene; parsing it used to fail with ValueError.
            if not row:
                continue
            parts = row.split('\t')
            start, end = map(int, parts[0].split('..'))
            genes.append((start, end, parts[1]))
    return genes

def predict_operons(genes, max_gap=50):
    """Group consecutive same-strand genes that lie close together into operons.

    Each gene is compared only with the gene immediately before it in
    ``genes``, so the input is assumed to be ordered by position
    (TODO confirm the input tables are sorted).

    Args:
        genes: Iterable of ``(start, end, strand)`` tuples.
        max_gap: A gene joins the current operon when its start minus the
            previous gene's end is strictly less than this value. Defaults to
            50, the threshold this notebook has always used.

    Returns:
        A list of operons; each operon is a list of two or more gene tuples.
        Genes that end up alone are not reported. An empty input yields an
        empty list (previously this raised IndexError).
    """
    operons = []
    current_operon = []

    for gene in genes:
        if current_operon:
            prev_gene = current_operon[-1]
            if gene[2] == prev_gene[2] and gene[0] - prev_gene[1] < max_gap:
                current_operon.append(gene)
                continue
            # The run is broken: keep it only if it holds more than one gene.
            if len(current_operon) > 1:
                operons.append(current_operon)
        current_operon = [gene]

    # Flush the run that was still open when the input ended.
    if len(current_operon) > 1:
        operons.append(current_operon)

    return operons

def save_operons_to_file(operons, file_path):
    """Write the predicted operons to ``file_path``, one operon per line.

    Every gene is rendered as ``start..end(strand)`` and the genes of one
    operon are joined with ", ". The file is opened in write mode, so any
    existing content is replaced.
    """
    def render(gene):
        return f"{gene[0]}..{gene[1]}({gene[2]})"

    with open(file_path, 'w') as handle:
        handle.writelines(
            ', '.join(map(render, members)) + '\n' for members in operons
        )
            
# E. coli K12 run: parse the .ptt table, predict operons from it, save them, report the count.
# The input path is hardcoded relative to the working directory; update it to match your layout.
genes_E_coli = parse_ptt_file("E_coli_K12_MG1655.ptt/E_coli_K12_MG1655.ptt")  # Update the path
operons_E_coli = predict_operons(genes_E_coli)

# Write one operon per line to a text file in the working directory.
output_file_path = "predicted_operons_E_coli.txt"
save_operons_to_file(operons_E_coli, output_file_path)

# Only operons with at least two genes are returned by predict_operons, so this counts multi-gene operons.
print(f"Total number of predicted operons for E. coli are: {len(operons_E_coli)}")
print("Predicted operons with multiple genes have been successfully saved to a text file.")


Total number of predicted operons for E. coli are: 780
Predicted operons with multiple genes have been successfully saved to a text file.


In [4]:
#-----------------Prediction of Operon for Bacillus Subtilis------------------#

In [5]:
import pandas as pd

def parse_ptt_file(ptt_file_path, header_lines=3):
    """Read gene coordinates and strands from a tab-separated .ptt table.

    Args:
        ptt_file_path: Path of the table to read.
        header_lines: Number of leading lines to discard before the gene rows
            begin. Defaults to 3, the value this notebook has always used.

    Returns:
        A list of ``(start, end, strand)`` tuples in file order. ``start`` and
        ``end`` are ints taken from the ``start..end`` text of the first
        column; ``strand`` is the raw text of the second column.

    Raises:
        ValueError: If the first column of a non-blank row is not ``int..int``.
        IndexError: If a non-blank row has no second column.
    """
    genes = []
    with open(ptt_file_path, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line_number, line in enumerate(file):
            if line_number < header_lines:
                continue
            row = line.strip()
            # A blank row (e.g. a trailing newline at the end of the file)
            # carries no gene; parsing it used to fail with ValueError.
            if not row:
                continue
            parts = row.split('\t')
            start, end = map(int, parts[0].split('..'))
            genes.append((start, end, parts[1]))
    return genes

def predict_operons(genes, max_gap=50):
    """Group consecutive same-strand genes that lie close together into operons.

    Each gene is compared only with the gene immediately before it in
    ``genes``, so the input is assumed to be ordered by position
    (TODO confirm the input tables are sorted).

    Args:
        genes: Iterable of ``(start, end, strand)`` tuples.
        max_gap: A gene joins the current operon when its start minus the
            previous gene's end is strictly less than this value. Defaults to
            50, the threshold this notebook has always used.

    Returns:
        A list of operons; each operon is a list of two or more gene tuples.
        Genes that end up alone are not reported. An empty input yields an
        empty list (previously this raised IndexError).
    """
    operons = []
    current_operon = []

    for gene in genes:
        if current_operon:
            prev_gene = current_operon[-1]
            if gene[2] == prev_gene[2] and gene[0] - prev_gene[1] < max_gap:
                current_operon.append(gene)
                continue
            # The run is broken: keep it only if it holds more than one gene.
            if len(current_operon) > 1:
                operons.append(current_operon)
        current_operon = [gene]

    # Flush the run that was still open when the input ended.
    if len(current_operon) > 1:
        operons.append(current_operon)

    return operons

def save_operons_to_file(operons, file_path):
    """Write the predicted operons to ``file_path``, one operon per line.

    Every gene is rendered as ``start..end(strand)`` and the genes of one
    operon are joined with ", ". The file is opened in write mode, so any
    existing content is replaced.
    """
    def render(gene):
        return f"{gene[0]}..{gene[1]}({gene[2]})"

    with open(file_path, 'w') as handle:
        handle.writelines(
            ', '.join(map(render, members)) + '\n' for members in operons
        )
            
# B. subtilis 168 run. The input path is hardcoded relative to the working directory.
ptt_file_path_b_subtilis = "B_subtilis_168.ptt/B_subtilis_168.ptt"  # Update the path

# Parse the gene table, then group adjacent same-strand genes into operons
genes_b_subtilis = parse_ptt_file(ptt_file_path_b_subtilis)
operons_b_subtilis = predict_operons(genes_b_subtilis)

# Save the predicted operons to a file (one operon per line)
output_file_path = "predicted_operons_B_subtilis.txt"
save_operons_to_file(operons_b_subtilis, output_file_path)

# predict_operons only returns operons with at least two genes, so this counts multi-gene operons.
print(f"Total number of predicted operons for B_subtilis are: {len(operons_b_subtilis)}")
print("Predicted operons have been successfully saved to txt file")


Total number of predicted operons for B_subtilis are: 771
Predicted operons have been successfully saved to txt file


In [6]:
#-----------------Prediction of Operon for Halobacterium------------------#

In [7]:
import pandas as pd

def parse_ptt_file(ptt_file_path, header_lines=3):
    """Read gene coordinates and strands from a tab-separated .ptt table.

    Args:
        ptt_file_path: Path of the table to read.
        header_lines: Number of leading lines to discard before the gene rows
            begin. Defaults to 3, the value this notebook has always used.

    Returns:
        A list of ``(start, end, strand)`` tuples in file order. ``start`` and
        ``end`` are ints taken from the ``start..end`` text of the first
        column; ``strand`` is the raw text of the second column.

    Raises:
        ValueError: If the first column of a non-blank row is not ``int..int``.
        IndexError: If a non-blank row has no second column.
    """
    genes = []
    with open(ptt_file_path, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line_number, line in enumerate(file):
            if line_number < header_lines:
                continue
            row = line.strip()
            # A blank row (e.g. a trailing newline at the end of the file)
            # carries no gene; parsing it used to fail with ValueError.
            if not row:
                continue
            parts = row.split('\t')
            start, end = map(int, parts[0].split('..'))
            genes.append((start, end, parts[1]))
    return genes

def predict_operons(genes, max_gap=50):
    """Group consecutive same-strand genes that lie close together into operons.

    Each gene is compared only with the gene immediately before it in
    ``genes``, so the input is assumed to be ordered by position
    (TODO confirm the input tables are sorted).

    Args:
        genes: Iterable of ``(start, end, strand)`` tuples.
        max_gap: A gene joins the current operon when its start minus the
            previous gene's end is strictly less than this value. Defaults to
            50, the threshold this notebook has always used.

    Returns:
        A list of operons; each operon is a list of two or more gene tuples.
        Genes that end up alone are not reported. An empty input yields an
        empty list (previously this raised IndexError).
    """
    operons = []
    current_operon = []

    for gene in genes:
        if current_operon:
            prev_gene = current_operon[-1]
            if gene[2] == prev_gene[2] and gene[0] - prev_gene[1] < max_gap:
                current_operon.append(gene)
                continue
            # The run is broken: keep it only if it holds more than one gene.
            if len(current_operon) > 1:
                operons.append(current_operon)
        current_operon = [gene]

    # Flush the run that was still open when the input ended.
    if len(current_operon) > 1:
        operons.append(current_operon)

    return operons

def save_operons_to_file(operons, file_path):
    """Write the predicted operons to ``file_path``, one operon per line.

    Every gene is rendered as ``start..end(strand)`` and the genes of one
    operon are joined with ", ". The file is opened in write mode, so any
    existing content is replaced.
    """
    def render(gene):
        return f"{gene[0]}..{gene[1]}({gene[2]})"

    with open(file_path, 'w') as handle:
        handle.writelines(
            ', '.join(map(render, members)) + '\n' for members in operons
        )
            
# Using functions to parse and predict operons for Halobacterium
# The input path is hardcoded relative to the working directory.
genes_Halobacterium = parse_ptt_file("Halobacterium_NRC1.ptt/Halobacterium_NRC1.ptt")  # Update the path
operons_Halobacterium = predict_operons(genes_Halobacterium)

# Save the predicted operons to a file (one operon per line)
output_file_path = "predicted_operons_Halobacterium.txt"
save_operons_to_file(operons_Halobacterium, output_file_path)

# predict_operons only returns operons with at least two genes, so this counts multi-gene operons.
print(f"Total number of predicted operons for Halobacterium are: {len(operons_Halobacterium)}")
print("Predicted operons have been successfully saved to txt file")


Total number of predicted operons for Halobacterium are: 390
Predicted operons have been successfully saved to txt file


In [8]:
#-----------------Prediction of Operon for Synechocystis------------------#

In [9]:
import pandas as pd

def parse_ptt_file(ptt_file_path, header_lines=3):
    """Read gene coordinates and strands from a tab-separated .ptt table.

    Args:
        ptt_file_path: Path of the table to read.
        header_lines: Number of leading lines to discard before the gene rows
            begin. Defaults to 3, the value this notebook has always used.

    Returns:
        A list of ``(start, end, strand)`` tuples in file order. ``start`` and
        ``end`` are ints taken from the ``start..end`` text of the first
        column; ``strand`` is the raw text of the second column.

    Raises:
        ValueError: If the first column of a non-blank row is not ``int..int``.
        IndexError: If a non-blank row has no second column.
    """
    genes = []
    with open(ptt_file_path, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line_number, line in enumerate(file):
            if line_number < header_lines:
                continue
            row = line.strip()
            # A blank row (e.g. a trailing newline at the end of the file)
            # carries no gene; parsing it used to fail with ValueError.
            if not row:
                continue
            parts = row.split('\t')
            start, end = map(int, parts[0].split('..'))
            genes.append((start, end, parts[1]))
    return genes

def predict_operons(genes, max_gap=50):
    """Group consecutive same-strand genes that lie close together into operons.

    Each gene is compared only with the gene immediately before it in
    ``genes``, so the input is assumed to be ordered by position
    (TODO confirm the input tables are sorted).

    Args:
        genes: Iterable of ``(start, end, strand)`` tuples.
        max_gap: A gene joins the current operon when its start minus the
            previous gene's end is strictly less than this value. Defaults to
            50, the threshold this notebook has always used.

    Returns:
        A list of operons; each operon is a list of two or more gene tuples.
        Genes that end up alone are not reported. An empty input yields an
        empty list (previously this raised IndexError).
    """
    operons = []
    current_operon = []

    for gene in genes:
        if current_operon:
            prev_gene = current_operon[-1]
            if gene[2] == prev_gene[2] and gene[0] - prev_gene[1] < max_gap:
                current_operon.append(gene)
                continue
            # The run is broken: keep it only if it holds more than one gene.
            if len(current_operon) > 1:
                operons.append(current_operon)
        current_operon = [gene]

    # Flush the run that was still open when the input ended.
    if len(current_operon) > 1:
        operons.append(current_operon)

    return operons

def save_operons_to_file(operons, file_path):
    """Write the predicted operons to ``file_path``, one operon per line.

    Every gene is rendered as ``start..end(strand)`` and the genes of one
    operon are joined with ", ". The file is opened in write mode, so any
    existing content is replaced.
    """
    def render(gene):
        return f"{gene[0]}..{gene[1]}({gene[2]})"

    with open(file_path, 'w') as handle:
        handle.writelines(
            ', '.join(map(render, members)) + '\n' for members in operons
        )
            
# Using functions to parse and predict operons for Synechocystis
# The input path is hardcoded relative to the working directory.
genes_synechocystis = parse_ptt_file("Synechocystis_PCC6803_uid159873.ptt/Synechocystis_PCC6803_uid159873.ptt") # Update the path
operons_synechocystis = predict_operons(genes_synechocystis)

# Save the predicted operons to a file (one operon per line)
output_file_path = "predicted_operons_synechocystis.txt"
save_operons_to_file(operons_synechocystis, output_file_path)

# predict_operons only returns operons with at least two genes, so this counts multi-gene operons.
print(f"Total number of predicted operons for Synechocystis are: {len(operons_synechocystis)}")
print("Predicted operons have been successfully saved to txt file")


Total number of predicted operons for Synechocystis are: 484
Predicted operons have been successfully saved to txt file


In [10]:
#-----------------Prediction of Operon for Hoatzin (IMG id 2088090036)------------------#

In [11]:
# For the Hoatzin dataset, operons are predicted within each contig: adjacent co-directional genes
# on the same contig with an intervening distance of less than 50 bp are grouped into one operon

import pandas as pd 

def parse_gff_for_operons(gff_file_path, max_gap=50):
    """Predict operons from the CDS rows of a tab-separated GFF file.

    Consecutive CDS rows are grouped into one operon when they share the same
    contig (column 1) and strand (column 7) and the start of the later row
    minus the end of the earlier row is strictly less than ``max_gap``.
    Non-CDS rows are ignored and do not break a run of CDS rows. Rows are
    compared in file order, so the file is assumed to be sorted by position
    within each contig (TODO confirm for the input in use).

    Args:
        gff_file_path: Path of the GFF file to read.
        max_gap: Exclusive upper bound on the start-minus-previous-end
            distance. Defaults to 50, the threshold this notebook has always
            used.

    Returns:
        A list of operons; each operon is a list of two or more
        ``(contig, start, end, strand)`` tuples with integer coordinates.

    Raises:
        IndexError: If a feature row has fewer than seven tab-separated columns.
        ValueError: If the start or end column of a feature row is not an int.
    """
    operons = []
    current_operon = []
    with open(gff_file_path, 'r') as file:
        for line in file:
            # GFF3 may embed sequences after a "##FASTA" directive. Those lines
            # are not feature rows and used to crash the column unpacking
            # below, so stop reading features here.
            if line.startswith('##FASTA'):
                break
            if line.startswith('#') or not line.strip():
                continue
            parts = line.strip().split('\t')
            contig, feature_type, start, end, strand = parts[0], parts[2], int(parts[3]), int(parts[4]), parts[6]
            # Only consider 'CDS' for operon prediction
            if feature_type.lower() != 'cds':
                continue
            gene = (contig, start, end, strand)
            if current_operon:
                # The last member of the open run is always the previous CDS.
                prev_contig, _, prev_end, prev_strand = current_operon[-1]
                if contig == prev_contig and strand == prev_strand and start - prev_end < max_gap:
                    current_operon.append(gene)
                    continue
                # Run broken: keep it only if it holds more than one gene.
                if len(current_operon) > 1:
                    operons.append(current_operon)
            # Start a new operon list with the current CDS
            current_operon = [gene]
    # Add the final operon if it exists with more than one gene
    if len(current_operon) > 1:
        operons.append(current_operon)
    return operons

def save_operons_to_file(operons, file_path):
    """Write contig-aware operons to ``file_path``, each followed by a blank line.

    Every gene is rendered as ``contig:start..end(strand)`` and the genes of
    one operon are joined with ", ". The file is opened in write mode, so any
    existing content is replaced.
    """
    with open(file_path, 'w') as handle:
        for members in operons:
            # Render each gene of this operon before joining them into one record.
            rendered = []
            for contig, start, end, strand in members:
                rendered.append(f"{contig}:{start}..{end}({strand})")
            record = ', '.join(rendered)
            handle.write(record + '\n\n')
            

# Path to the Hoatzin GFF annotation file (hardcoded relative to the working directory)
gff_file_path = 'Hoatzin.gff'      # Update the path to run seamlessly

# Path to save the operon predictions
output_file_path = 'predicted_operons_hoatzin.txt'

# Parse the GFF file and predict operons
operons = parse_gff_for_operons(gff_file_path)

# Save the operons to a file
save_operons_to_file(operons, output_file_path)

# Print the total number of operons (only operons with at least two CDS are returned by the parser)
print(f"Total number of identified operons in similar contig: {len(operons)}")
if operons:
    # Show at most the first five operons as a quick sanity check of the output format
    print("First few operons:")
    for operon in operons[:5]:
        print(', '.join([f"{contig}:{start}..{end}({strand})" for contig, start, end, strand in operon]))
else:
    print("No operons found based on the specified criteria.")

print("Predicted operons have been successfully saved to txt file")

Total number of identified operons in similar contig: 753
First few operons:
HCP21_2940:1..159(-), HCP21_2940:192..371(-)
HCP21_118_:42..788(+), HCP21_118_:793..2529(+), HCP21_118_:2540..3247(+)
HCP21_6018:201..350(+), HCP21_6018:366..584(+)
HCP21_2399:1123..1893(-), HCP21_2399:1915..2241(-)
HCP21_2957:62..496(-), HCP21_2957:477..554(-)
Predicted operons have been successfully saved to txt file
