diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..441e291 --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Test outputs +tests/test_output/ diff --git a/cdsselector.py b/cdsselector.py index 8341434..98e5042 100644 --- a/cdsselector.py +++ b/cdsselector.py @@ -1,58 +1,237 @@ -# -*- coding:utf-8 -*- -""" - Propouse: - This script removes CDS that arent on the allowed list, - with annoted GBK files you can create a GBK that only - contains CDS that are ´allowed´ by the list. - - Usage: - python cdsselector.py $GBK_FOLDER $OUTDIR #LIST_FILE +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +CDS Selector - $GBK_FOLDER = Folder containing all gbk files that will be used as input +This script filters CDS (Coding Sequences) from annotated GenBank files, +keeping only those that are in an allowed gene list. The output is a new +GenBank file containing only the selected features. - $OUTDIR = Name of the output dir +Usage: + python cdsselector.py --input-folder --genes-list + --output-folder - $LIST_FILE = A file containt a gene on every line following this example: - dnaA - dnaK - rpoB +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where filtered GenBank files will be saved - OBS: the list file shouldn't contain and special character. +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com """ -from Bio import SeqIO -import sys + +import argparse +import logging import os +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Filter CDS features from GenBank files based on a gene list.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python cdsselector.py --input-folder gbk_files --genes-list genes.txt + --output-folder filtered_gbk + python cdsselector.py -i input_gbk -g allowed_genes.txt -o output_gbk + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for filtered GenBank files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def filter_features(seq_record, gene_list): + """ + Filter features from a sequence record based on the gene list. + + Args: + seq_record (SeqRecord): BioPython SeqRecord object + gene_list (list): List of allowed gene names + + Returns: + SeqRecord: SeqRecord with filtered features + """ + filtered_features = [] + + for feature in seq_record.features: + # Keep source features with genomic DNA mol_type + if feature.type == "source" and "mol_type" in feature.qualifiers: + if feature.qualifiers["mol_type"][0] == 'genomic DNA': + filtered_features.append(feature) + + # Keep CDS and gene features that are in the gene list + elif feature.type in ["CDS", "gene"] and "gene" in feature.qualifiers: + gene_name = feature.qualifiers['gene'][0] + if gene_name in gene_list: + filtered_features.append(feature) + + seq_record.features = filtered_features + return seq_record + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files in the input folder. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of allowed gene names + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + + for input_file in input_files: + input_path = os.path.join(input_folder, input_file) + output_path = os.path.join(output_folder, input_file) + + try: + logging.info(f"Processing: {input_file}") + + # Parse and filter each sequence record + filtered_records = [] + for seq_record in SeqIO.parse(input_path, 'genbank'): + filtered_record = filter_features(seq_record, gene_list) + filtered_records.append(filtered_record) + + # Write filtered records to output file + SeqIO.write(filtered_records, output_path, 'genbank') + processed_count += 1 + logging.info(f"Successfully processed: {input_file}") + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + msg = f"Processing complete: {processed_count} succeeded, {error_count} failed" + logging.info(msg) + + +def main(): + """Main function to orchestrate the CDS selection process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("CDS Selector started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files(args.input_folder, args.output_folder, + gene_list, input_files) + + logging.info("CDS Selector finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + -#I/O declaration - -input_folder = sys.argv[1] -output_folder = sys.argv[2] -gene_list = [line.strip('\n') for line in open(sys.argv[3])] - -#Taking permission for parsing later -for i in os.listdir(input_folder): - open(input_folder+"/"+i) - -#Check output folder existance -if not os.path.exists(output_folder): - os.makedirs(output_folder) - -#Code -for input_file in os.listdir(input_folder): - FEATURES = [] - for seq_record in SeqIO.parse(input_folder+"/"+input_file , 'genbank'):# Parse all files and interact around them - for feature in seq_record.features: # Check each feature and see if it is on gene_list - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in gene_list: - FEATURES.append(feature) - if feature.type == "gene" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in gene_list: - FEATURES.append(feature) - if feature.type == "source" and "mol_type" in feature.qualifiers: - if feature.qualifiers["mol_type"][0] == 'genomic DNA': - FEATURES.append(feature)# If the feature is on the gene_list append them to the new feature list - seq_record.features = FEATURES - print("Creating input for: "+input_file) - SeqIO.write (seq_record, output_folder+"/"+input_file, 'genbank') +if __name__ == "__main__": + sys.exit(main()) diff --git a/take genes into aminoacid.py b/take genes into aminoacid.py index bd206cc..71edb61 100644 --- a/take genes into aminoacid.py +++ b/take genes into aminoacid.py @@ -1,32 +1,241 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Gene to Amino Acid Extractor -from Bio import SeqIO -import sys +This script extracts CDS (Coding Sequences) from GenBank files and outputs +the translated amino acid sequences in FASTA format. Genes are filtered +based on a provided list, and sequences are organized by gene name. + +Usage: + python "take genes into aminoacid.py" --input-folder + --genes-list + --output-folder + +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where amino acid FASTA files will be saved + +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com +""" + +import argparse +import logging import os -#Determinando os inputs e outputs -entrada = sys.argv[1] -saida = sys.argv[2] -Lista = ["recA","rpoB"] -#abrindo os inputs -for i in os.listdir(entrada): - open(entrada+"/"+i) -#conta o numero de gbs -list = os.listdir(entrada) -number_files = len(list) -print("Foram encontrados" , (number_files), "arquivos") -#cria a pasta de outputs -if not os.path.exists(saida): - os.makedirs(saida) -#Executando código -for i in os.listdir(entrada): - for seq_record in SeqIO.parse(entrada+"/"+i , 'genbank'): - for feature in seq_record.features: - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in Lista: - if not os.path.exists(saida+"/"+gene): - os.makedirs(saida+"/"+gene) - with open(saida+"/"+gene+"/"+seq_record.id+".fasta", "w") as ofile: - ofile.write(">{0}\n{1}\n".format(seq_record.id, feature.qualifiers['translation'][0])) +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Extract amino acid sequences from GenBank files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python "take genes into aminoacid.py" --input-folder gbk_files + --genes-list genes.txt + --output-folder amino_acids + python "take genes into aminoacid.py" -i gbk_files -g genes.txt -o aa_out + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for amino acid FASTA files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files and extract amino acid sequences. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of gene names to extract + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + extracted_count = 0 + + for idx, input_file in enumerate(input_files, 1): + input_path = os.path.join(input_folder, input_file) + + try: + logging.info(f"Processing file {idx}/{len(input_files)}: {input_file}") + + for seq_record in SeqIO.parse(input_path, 'genbank'): + for feature in seq_record.features: + if feature.type == "CDS" and "gene" in feature.qualifiers: + gene = feature.qualifiers['gene'][0] + if gene in gene_list: + # Check if translation exists + if 'translation' not in feature.qualifiers: + logging.warning( + f"No translation for gene {gene} in " + f"{input_file}" + ) + continue + + # Create gene-specific output folder + gene_folder = os.path.join(output_folder, gene) + if not os.path.exists(gene_folder): + os.makedirs(gene_folder) + logging.debug(f"Created folder: {gene_folder}") + + # Write amino acid sequence + output_file = os.path.join( + gene_folder, + f"{seq_record.id}.fasta" + ) + with open(output_file, "w") as ofile: + translation = feature.qualifiers['translation'][0] + ofile.write(f">{seq_record.id}\n{translation}\n") + + extracted_count += 1 + + processed_count += 1 + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + logging.info( + f"Processing complete: {processed_count} files succeeded, " + f"{error_count} files failed" + ) + logging.info(f"Extracted {extracted_count} amino acid sequences") + + +def main(): + """Main function to orchestrate the amino acid extraction process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("Gene to Amino Acid Extractor started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files( + args.input_folder, + args.output_folder, + gene_list, + input_files + ) + + logging.info("Gene to Amino Acid Extractor finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/take genes into nucleotides.py b/take genes into nucleotides.py index 458b339..7da96ae 100644 --- a/take genes into nucleotides.py +++ b/take genes into nucleotides.py @@ -1,32 +1,242 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Gene to Nucleotide Extractor -from Bio import SeqIO -import sys +This script extracts CDS (Coding Sequences) from GenBank files and outputs +the nucleotide sequences in FASTA format. Genes are filtered based on a +provided list, and sequences are organized by gene name. + +Usage: + python "take genes into nucleotides.py" --input-folder + --genes-list + --output-folder + +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where nucleotide FASTA files will be saved + +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com +""" + +import argparse +import logging import os -#Determinando os inputs e outputs -entrada = sys.argv[1] -saida = sys.argv[2] -Lista = ["recA","rpoB"] -#abrindo os inputs -for i in os.listdir(entrada): - open(entrada+"/"+i) -#conta o numero de gbs -list = os.listdir(entrada) -number_files = len(list) -print("Foram encontrados" , (number_files), "arquivos") -#cria a pasta de outputs -if not os.path.exists(saida): - os.makedirs(saida) -#Executando código -for i in os.listdir(entrada): - for seq_record in SeqIO.parse(entrada+"/"+i , 'genbank'): - for feature in seq_record.features: - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in Lista: - if not os.path.exists(saida+"/"+gene): - os.makedirs(saida+"/"+gene) - with open(saida+"/"+gene+"/"+seq_record.id+".fasta", "w") as ofile: - ofile.write(">{0}\n{1}\n".format(seq_record.id, feature.location.extract(seq_record).seq)) +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Extract nucleotide sequences from GenBank files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python "take genes into nucleotides.py" --input-folder gbk_files + --genes-list genes.txt + --output-folder nucleotides + python "take genes into nucleotides.py" -i gbk_files -g genes.txt -o nuc_out + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for nucleotide FASTA files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files and extract nucleotide sequences. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of gene names to extract + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + extracted_count = 0 + + for idx, input_file in enumerate(input_files, 1): + input_path = os.path.join(input_folder, input_file) + + try: + logging.info(f"Processing file {idx}/{len(input_files)}: {input_file}") + + for seq_record in SeqIO.parse(input_path, 'genbank'): + for feature in seq_record.features: + if feature.type == "CDS" and "gene" in feature.qualifiers: + gene = feature.qualifiers['gene'][0] + if gene in gene_list: + # Extract nucleotide sequence + try: + nuc_seq = feature.location.extract(seq_record).seq + except Exception as e: + logging.warning( + f"Could not extract sequence for gene {gene} " + f"in {input_file}: {e}" + ) + continue + + # Create gene-specific output folder + gene_folder = os.path.join(output_folder, gene) + if not os.path.exists(gene_folder): + os.makedirs(gene_folder) + logging.debug(f"Created folder: {gene_folder}") + + # Write nucleotide sequence + output_file = os.path.join( + gene_folder, + f"{seq_record.id}.fasta" + ) + with open(output_file, "w") as ofile: + ofile.write(f">{seq_record.id}\n{nuc_seq}\n") + + extracted_count += 1 + + processed_count += 1 + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + logging.info( + f"Processing complete: {processed_count} files succeeded, " + f"{error_count} files failed" + ) + logging.info(f"Extracted {extracted_count} nucleotide sequences") + + +def main(): + """Main function to orchestrate the nucleotide extraction process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("Gene to Nucleotide Extractor started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files( + args.input_folder, + args.output_folder, + gene_list, + input_files + ) + + logging.info("Gene to Nucleotide Extractor finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..19a85b2 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,121 @@ +# Test Suite for Bioinfo Python Scripts + +This folder contains test data and test scripts to verify the functionality of the Python scripts in this repository. + +## Structure + +``` +tests/ +├── test_data/ +│ ├── input/ # Test GenBank files +│ │ ├── test_genome1.gbk +│ │ └── test_genome2.gbk +│ └── genes.txt # Test gene list +├── test_output/ # Output from test runs (generated) +├── test_scripts.py # Main test suite +└── README.md # This file +``` + +## Test Data + +The test data includes: +- **test_genome1.gbk**: Contains genes dnaA, rpoB, and a test gene +- **test_genome2.gbk**: Contains genes recA and dnaK +- **genes.txt**: List of genes to filter/extract (dnaA, dnaK, rpoB, recA) + +## Running Tests + +### Run All Tests + +To run the complete test suite: + +```bash +python tests/test_scripts.py +``` + +This will test: +1. `cdsselector.py` - Filters GenBank files by gene list +2. `take genes into aminoacid.py` - Extracts amino acid sequences +3. `take genes into nucleotides.py` - Extracts nucleotide sequences + +### Manual Testing + +You can also test individual scripts manually: + +**Test cdsselector.py:** +```bash +python cdsselector.py --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/cdsselector +``` + +**Test take genes into aminoacid.py:** +```bash +python "take genes into aminoacid.py" --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/aminoacid +``` + +**Test take genes into nucleotides.py:** +```bash +python "take genes into nucleotides.py" --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/nucleotide +``` + +## Expected Results + +### cdsselector.py +- Creates filtered GenBank files with only the genes in the list +- Output files: `test_genome1.gbk`, `test_genome2.gbk` +- Each file contains only the genes specified in genes.txt + +### take genes into aminoacid.py +- Creates folders for each gene +- Each folder contains FASTA files with amino acid sequences +- Expected output structure: + ``` + aminoacid/ + ├── dnaA/ + │ └── TEST001.fasta + ├── rpoB/ + │ └── TEST001.fasta + ├── recA/ + │ └── TEST002.fasta + └── dnaK/ + └── TEST002.fasta + ``` + +### take genes into nucleotides.py +- Creates folders for each gene +- Each folder contains FASTA files with nucleotide sequences +- Expected output structure: + ``` + nucleotide/ + ├── dnaA/ + │ └── TEST001.fasta + ├── rpoB/ + │ └── TEST001.fasta + ├── recA/ + │ └── TEST002.fasta + └── dnaK/ + └── TEST002.fasta + ``` + +## Adding New Tests + +To add new test cases: + +1. Add new GenBank files to `test_data/input/` +2. Update `genes.txt` if testing new genes +3. Modify `test_scripts.py` to include new test cases + +## Requirements + +- Python 3.6+ +- Biopython + +Install requirements: +```bash +pip install biopython +``` diff --git a/tests/test_data/genes.txt b/tests/test_data/genes.txt new file mode 100644 index 0000000..8b5ba30 --- /dev/null +++ b/tests/test_data/genes.txt @@ -0,0 +1,4 @@ +dnaA +dnaK +rpoB +recA diff --git a/tests/test_data/input/test_genome1.gbk b/tests/test_data/input/test_genome1.gbk new file mode 100644 index 0000000..884b881 --- /dev/null +++ b/tests/test_data/input/test_genome1.gbk @@ -0,0 +1,38 @@ +LOCUS TEST_GENOME1 300 bp DNA linear BCT 01-JAN-2024 +DEFINITION Test genome 1 for script testing. +ACCESSION TEST001 +VERSION TEST001.1 +KEYWORDS . +SOURCE Test organism 1 + ORGANISM Test organism 1 + Bacteria. +FEATURES Location/Qualifiers + source 1..300 + /organism="Test organism 1" + /mol_type="genomic DNA" + gene 10..72 + /gene="dnaA" + CDS 10..72 + /gene="dnaA" + /product="chromosomal replication initiator protein DnaA" + /translation="MKLVRVLSTAAAA" + gene 100..162 + /gene="rpoB" + CDS 100..162 + /gene="rpoB" + /product="DNA-directed RNA polymerase subunit beta" + /translation="MKLIVKASTGPAT" + gene 200..262 + /gene="test" + CDS 200..262 + /gene="test" + /product="test protein" + /translation="MTEST*PROTEIN" +ORIGIN + 1 atgaaacttg tacgcgtact ttcaacagca gctgctgctt agctagctag ctagctagct + 61 agctagctag ctagctagct agctagctag ctagctagat gaagttgatt gtaaaagcat + 121 caactggtcc tgcgacttag ctagctagct agctagctag ctagctagct agctagctag + 181 ctagctagct agctagctaa tgactgaaag tacaacccat ccgcgaacag ctagctagct + 241 agctagctag ctagctagct agctagctag ctagctagct agctagctag ctagctagct + 301 +// diff --git a/tests/test_data/input/test_genome2.gbk b/tests/test_data/input/test_genome2.gbk new file mode 100644 index 0000000..d5c05ce --- /dev/null +++ b/tests/test_data/input/test_genome2.gbk @@ -0,0 +1,32 @@ +LOCUS TEST_GENOME2 300 bp DNA linear BCT 01-JAN-2024 +DEFINITION Test genome 2 for script testing. +ACCESSION TEST002 +VERSION TEST002.1 +KEYWORDS . +SOURCE Test organism 2 + ORGANISM Test organism 2 + Bacteria. +FEATURES Location/Qualifiers + source 1..300 + /organism="Test organism 2" + /mol_type="genomic DNA" + gene 10..72 + /gene="recA" + CDS 10..72 + /gene="recA" + /product="recombinase A" + /translation="MKRECOMBINAS" + gene 100..162 + /gene="dnaK" + CDS 100..162 + /gene="dnaK" + /product="molecular chaperone DnaK" + /translation="MKCHAPERONEA" +ORIGIN + 1 atgaaacgcg aacgaattca aatgtcgtga ttaacgctta gctagctagc tagctagcta + 61 gctagctagc tagctagcta gctagctagc tagctagcta gatgaaatgc catgccgagc + 121 gaaatgctga agctagctag ctagctagct agctagctag ctagctagct agctagctag + 181 ctagctagct agctagctat gctagctagt cgttcgattc gattcgatag ctagctagct + 241 agctagctag ctagctagct agctagctag ctagctagct agctagctag ctagctagct + 301 +// diff --git a/tests/test_scripts.py b/tests/test_scripts.py new file mode 100644 index 0000000..7b229d5 --- /dev/null +++ b/tests/test_scripts.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Test Suite for Bioinfo Python Scripts + +This script tests the functionality of cdsselector.py, +take genes into aminoacid.py, and take genes into nucleotides.py. + +Usage: + python tests/test_scripts.py +""" + +import os +import sys +import subprocess +import shutil +from pathlib import Path + + +class Colors: + """ANSI color codes for terminal output.""" + GREEN = '\033[92m' + RED = '\033[91m' + YELLOW = '\033[93m' + BLUE = '\033[94m' + BOLD = '\033[1m' + END = '\033[0m' + + +def print_header(text): + """Print a formatted header.""" + print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*70}{Colors.END}") + print(f"{Colors.BOLD}{Colors.BLUE}{text:^70}{Colors.END}") + print(f"{Colors.BOLD}{Colors.BLUE}{'='*70}{Colors.END}\n") + + +def print_success(text): + """Print success message.""" + print(f"{Colors.GREEN}✓ {text}{Colors.END}") + + +def print_error(text): + """Print error message.""" + print(f"{Colors.RED}✗ {text}{Colors.END}") + + +def print_info(text): + """Print info message.""" + print(f"{Colors.YELLOW}ℹ {text}{Colors.END}") + + +def run_command(cmd, description): + """ + Run a command and return success status. + + Args: + cmd (list): Command and arguments to run + description (str): Description of the test + + Returns: + bool: True if command succeeded, False otherwise + """ + print_info(f"Testing: {description}") + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0: + print_success(f"PASSED: {description}") + return True + else: + print_error(f"FAILED: {description}") + if result.stderr: + print(f" Error: {result.stderr[:200]}") + return False + except subprocess.TimeoutExpired: + print_error(f"TIMEOUT: {description}") + return False + except Exception as e: + print_error(f"ERROR: {description} - {e}") + return False + + +def verify_file_exists(filepath, description): + """Verify that a file exists.""" + if os.path.exists(filepath): + print_success(f"File exists: {description}") + return True + else: + print_error(f"File missing: {description}") + return False + + +def verify_folder_contents(folder, expected_count, description): + """Verify folder contains expected number of items.""" + if os.path.exists(folder): + count = len([f for f in os.listdir(folder) + if os.path.isfile(os.path.join(folder, f))]) + if count >= expected_count: + print_success(f"{description}: Found {count} files (expected >= {expected_count})") + return True + else: + print_error(f"{description}: Found {count} files (expected >= {expected_count})") + return False + else: + print_error(f"{description}: Folder does not exist") + return False + + +def test_cdsselector(): + """Test cdsselector.py script.""" + print_header("Testing cdsselector.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "cdsselector" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "cdsselector.py"), "--help"], + "cdsselector.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "cdsselector.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "cdsselector.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify output files + tests_total += 1 + if verify_file_exists( + output_dir / "test_genome1.gbk", + "test_genome1.gbk output" + ): + tests_passed += 1 + + print(f"\n{Colors.BOLD}cdsselector.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def test_aminoacid_extractor(): + """Test take genes into aminoacid.py script.""" + print_header("Testing take genes into aminoacid.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "aminoacid" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "take genes into aminoacid.py"), "--help"], + "take genes into aminoacid.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "take genes into aminoacid.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "take genes into aminoacid.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify gene folders exist + tests_total += 1 + dnaA_folder = output_dir / "dnaA" + if verify_file_exists(dnaA_folder, "dnaA gene folder"): + tests_passed += 1 + + # Test 4: Verify FASTA file exists + tests_total += 1 + fasta_files = list(dnaA_folder.glob("*.fasta")) + if len(fasta_files) > 0: + print_success(f"Found FASTA file(s) in dnaA folder: {[f.name for f in fasta_files]}") + tests_passed += 1 + else: + print_error("No FASTA files found in dnaA folder") + + print(f"\n{Colors.BOLD}take genes into aminoacid.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def test_nucleotide_extractor(): + """Test take genes into nucleotides.py script.""" + print_header("Testing take genes into nucleotides.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "nucleotide" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "take genes into nucleotides.py"), "--help"], + "take genes into nucleotides.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "take genes into nucleotides.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "take genes into nucleotides.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify gene folders exist + tests_total += 1 + recA_folder = output_dir / "recA" + if verify_file_exists(recA_folder, "recA gene folder"): + tests_passed += 1 + + # Test 4: Verify FASTA file exists + tests_total += 1 + fasta_files = list(recA_folder.glob("*.fasta")) + if len(fasta_files) > 0: + print_success(f"Found FASTA file(s) in recA folder: {[f.name for f in fasta_files]}") + tests_passed += 1 + else: + print_error("No FASTA files found in recA folder") + + print(f"\n{Colors.BOLD}take genes into nucleotides.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def main(): + """Run all tests.""" + print_header("Bioinfo Python Scripts Test Suite") + + total_passed = 0 + total_tests = 0 + + # Run all tests + passed, total = test_cdsselector() + total_passed += passed + total_tests += total + + passed, total = test_aminoacid_extractor() + total_passed += passed + total_tests += total + + passed, total = test_nucleotide_extractor() + total_passed += passed + total_tests += total + + # Print final summary + print_header("Test Summary") + + success_rate = (total_passed / total_tests * 100) if total_tests > 0 else 0 + + if total_passed == total_tests: + print(f"{Colors.GREEN}{Colors.BOLD}ALL TESTS PASSED!{Colors.END}") + else: + print(f"{Colors.YELLOW}{Colors.BOLD}SOME TESTS FAILED{Colors.END}") + + print(f"\n{Colors.BOLD}Total: {total_passed}/{total_tests} tests passed ({success_rate:.1f}%){Colors.END}\n") + + # Return exit code + return 0 if total_passed == total_tests else 1 + + +if __name__ == "__main__": + sys.exit(main())