In [1]:
def combine_fasta_files(input_files, output_file):
    """Concatenate several FASTA files into a single output file.

    Files are appended in the order given. Each input is copied line by
    line, and a newline is added after any file whose last line is not
    newline-terminated, so the first header of the next file always starts
    on its own line instead of being glued onto the previous sequence.

    Args:
        input_files: Iterable of paths to the FASTA files to combine.
        output_file: Path of the combined file; it is overwritten if it
            already exists.

    Inputs that do not exist are reported with a printed message and
    skipped; the remaining files are still combined.
    """
    with open(output_file, "w") as outfile:
        for file in input_files:
            try:
                with open(file) as infile:
                    # Stays True for an empty file, so nothing is written for it.
                    ends_with_newline = True
                    for line in infile:
                        outfile.write(line)
                        ends_with_newline = line.endswith("\n")
                    if not ends_with_newline:
                        # Terminate the last record before the next file starts.
                        outfile.write("\n")
            except FileNotFoundError:
                print(f"File not found: {file}")
    print(f"Combined nucleotide .ffn file saved to: {output_file}")


In [2]:
# Per-species nucleotide FASTA (.ffn) inputs, one path per species name.
nucleotide_ffn_files = [
    f"D:/AWAIS/Final/{species}.ffn"
    for species in (
        "Pseudomonas_syringae",
        "Erwinia_amylovora",
        "Xanthomonas_oryzae",
        "Klebsiella_aerogenes",
        "Yersinia_pestis",
        "Rickettsia_conorii",
        "Mycobacterium_avium",
        "Escherichia_coli",
        "Streptococcus_pneumoniae",
    )
]


In [3]:
# Concatenate every file listed in nucleotide_ffn_files into one combined FASTA.
# NOTE(review): the output name ends in ".fnn" while the inputs end in ".ffn";
# the later de-duplication cell reads this exact path, so rename both together
# if the extension is a typo.
combined_ffn = "D:/AWAIS/Final/Combined_nucleotide_sequences.fnn"
combine_fasta_files(nucleotide_ffn_files, combined_ffn)

Combined nucleotide .ffn file saved to: D:/AWAIS/Final/Combined_nucleotide_sequences.fnn


In [4]:
from Bio import SeqIO

def remove_duplicates(input_fasta, output_fasta):
    """Copy a FASTA file, keeping only the first record seen for each ID.

    Records are compared by their ID alone (not by sequence content); later
    records that repeat an ID already seen are dropped. Surviving records
    are written in the order in which their IDs first appeared.

    Args:
        input_fasta: Path of the FASTA file to read.
        output_fasta: Path of the FASTA file to write (overwritten).
    """
    first_by_id = {}
    for rec in SeqIO.parse(input_fasta, "fasta"):
        # setdefault leaves an existing entry untouched, so the first record wins.
        first_by_id.setdefault(rec.id, rec)

    with open(output_fasta, "w") as handle:
        SeqIO.write(first_by_id.values(), handle, "fasta")

    print(f"Duplicates removed. Output saved to: {output_fasta}")
# De-duplicate the combined FASTA; input_fasta is the same path that the
# combine step writes to (combined_ffn).
input_fasta = "D:/AWAIS/Final/Combined_nucleotide_sequences.fnn"
output_fasta = "D:/AWAIS/Final/Combine_no_duplicates.fnn"
remove_duplicates(input_fasta, output_fasta)


Duplicates removed. Output saved to: D:/AWAIS/Final/Combine_no_duplicates.fnn


In [9]:
import subprocess

def run_blast(query_fasta, blast_db, output_file):
    """Run the external ``blastn`` program for a query FASTA against a database.

    The search is launched with ``-outfmt 6``, ``-evalue 1e-5`` and
    ``-perc_identity 70``, and the results are written by blastn itself to
    ``output_file``.

    Args:
        query_fasta: Path passed to blastn as ``-query``.
        blast_db: Database name/path passed to blastn as ``-db``.
        output_file: Path passed to blastn as ``-out``.

    Raises:
        subprocess.CalledProcessError: If blastn exits with a non-zero status
            (because the process is run with ``check=True``).
    """
    # Flag -> value pairs, in the order they are placed on the command line.
    options = {
        "-query": query_fasta,
        "-db": blast_db,
        "-out": output_file,
        "-outfmt": "6",
        "-evalue": "1e-5",
        "-perc_identity": "70",
    }
    command = ["blastn"]
    for flag, value in options.items():
        command.extend((flag, value))

    subprocess.run(command, check=True)
    print(f"BLAST results saved to: {output_file}")
# BLAST the query gene set against the reference database.
# NOTE(review): the later analysis cell reads "clas_vs_others.txt" and a query
# FASTA under "D:/AWAIS/Final", not the "blast_results.txt" / "D:/AWAIS/New"
# paths used here — confirm both cells refer to the intended files.
# NOTE(review): no cell shown here builds the database named by blast_db;
# presumably it was created outside this notebook — confirm.
query_fasta = "D:/AWAIS/New/Candidatus_Liberibacter_asiaticus.ffn"
blast_db = "D:/AWAIS/Final/reference_db"
blast_output = "D:/AWAIS/Final/blast_results.txt"
run_blast(query_fasta, blast_db, blast_output)


BLAST results saved to: D:/AWAIS/Final/blast_results.txt


In [1]:
from Bio import SeqIO
import pandas as pd
# Input and output locations for the unique-gene analysis in this cell.
# NOTE(review): blast_result_file ("clas_vs_others.txt") is not the file the
# run_blast cell writes ("blast_results.txt"), and that cell took its query
# FASTA from "D:/AWAIS/New" rather than "D:\AWAIS\Final" — confirm these are
# the intended files.
blast_result_file = r"D:\AWAIS\Final\clas_vs_others.txt"
query_fasta_file = r"D:\AWAIS\Final\Candidatus_Liberibacter_asiaticus.ffn"
unique_genes_output_file = r"D:\AWAIS\Final\unique_genes.fnn"
summary_output_file = r"D:\AWAIS\Final\summary_report.txt"
def parse_blast_results(blast_file):
    """Load a headerless, tab-separated 12-column BLAST result file.

    Args:
        blast_file: Path to the tab-separated BLAST output.

    Returns:
        pandas.DataFrame with the columns query_id, subject_id,
        perc_identity, alignment_length, mismatches, gap_opens, q_start,
        q_end, s_start, s_end, e_value and bit_score. ``query_id`` and
        ``subject_id`` are always read as text. If the file is empty (for
        example a search that produced no hits), an empty DataFrame with
        those columns is returned instead of raising.
    """
    columns = [
        "query_id", "subject_id", "perc_identity", "alignment_length",
        "mismatches", "gap_opens", "q_start", "q_end",
        "s_start", "s_end", "e_value", "bit_score"
    ]
    try:
        blast_df = pd.read_csv(
            blast_file,
            sep="\t",
            header=None,
            names=columns,
            # Keep IDs as text: without this, numeric-looking IDs are inferred
            # as integers and never compare equal to string FASTA record IDs.
            dtype={"query_id": str, "subject_id": str},
        )
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to parse; treat it as "no hits".
        blast_df = pd.DataFrame(columns=columns)
    return blast_df
def extract_unique_genes(blast_df, query_fasta, output_file):
    """Write the query sequences that have no BLAST hit to a FASTA file.

    A query record counts as "unique" when its ID does not occur in the
    ``query_id`` column of ``blast_df``.

    Args:
        blast_df: DataFrame of BLAST hits with a ``query_id`` column.
        query_fasta: Path of the FASTA file containing all query sequences.
        output_file: Path of the FASTA file to write the unique records to.

    Returns:
        Tuple ``(total_queries, unique_count)``: the number of records read
        from ``query_fasta`` and the number of records written.
    """
    # IDs of every query that produced at least one hit.
    hit_ids = set(blast_df["query_id"].unique())

    no_hit_records = []
    records_seen = 0
    with open(query_fasta, "r") as handle:
        for records_seen, rec in enumerate(SeqIO.parse(handle, "fasta"), start=1):
            if rec.id in hit_ids:
                continue
            no_hit_records.append(rec)

    with open(output_file, "w") as out_handle:
        SeqIO.write(no_hit_records, out_handle, "fasta")

    return records_seen, len(no_hit_records)
def generate_summary_report(total_queries, matched_count, unique_count, output_file):
    """Write a short plain-text summary of the BLAST comparison.

    Args:
        total_queries: Number of query sequences examined.
        matched_count: Number of queries with at least one hit.
        unique_count: Number of queries written out as having no hit.
        output_file: Path of the report file (overwritten).

    The "Unmatched Sequences" line is computed here as
    ``total_queries - matched_count``; ``unique_count`` is reported as given.
    """
    report_lines = [
        "BLAST Results Summary Report\n",
        "=============================\n",
        f"Total Query Sequences: {total_queries}\n",
        f"Matched Sequences: {matched_count}\n",
        f"Unique Sequences (No Match): {unique_count}\n",
        f"Unmatched Sequences: {total_queries - matched_count}\n",
    ]
    with open(output_file, "w") as report:
        report.writelines(report_lines)
    print(f"Summary report saved to {output_file}")
# Driver for the unique-gene analysis: parse hits, export no-hit queries, write the report.
if __name__ == "__main__":
    # Load the BLAST hit table from disk.
    blast_df = parse_blast_results(blast_result_file)
    matched_count = blast_df["query_id"].nunique()  # Number of distinct query IDs that appear in the hit table
    # Write the queries without any hit to FASTA and get (total, unique) counts back.
    total_queries, unique_count = extract_unique_genes(blast_df, query_fasta_file, unique_genes_output_file)
    print(f"Unique genes saved to {unique_genes_output_file}")
    # NOTE(review): matched_count comes from the hit table while total_queries comes
    # from the FASTA; the report's "Unmatched" figure only equals unique_count if
    # every query_id in the hit table is present in the FASTA — confirm.
    generate_summary_report(total_queries, matched_count, unique_count, summary_output_file)


Unique genes saved to D:\AWAIS\Final\unique_genes.fnn
Summary report saved to D:\AWAIS\Final\summary_report.txt
