Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
venv/
ENV/
env/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Test outputs
tests/test_output/
281 changes: 230 additions & 51 deletions cdsselector.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,237 @@
# -*- coding:utf-8 -*-
"""
Propouse:
This script removes CDS that arent on the allowed list,
with annoted GBK files you can create a GBK that only
contains CDS that are ´allowed´ by the list.

Usage:
python cdsselector.py $GBK_FOLDER $OUTDIR #LIST_FILE
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CDS Selector

$GBK_FOLDER = Folder containing all gbk files that will be used as input
This script filters CDS (Coding Sequences) from annotated GenBank files,
keeping only those that are in an allowed gene list. The output is a new
GenBank file containing only the selected features.

$OUTDIR = Name of the output dir
Usage:
python cdsselector.py --input-folder <input_dir> --genes-list <genes_file>
--output-folder <output_dir>

$LIST_FILE = A file containt a gene on every line following this example:
dnaA
dnaK
rpoB
Arguments:
--input-folder : Folder containing GenBank (.gbk) files to process
--genes-list : File containing one gene name per line
--output-folder : Folder where filtered GenBank files will be saved

OBS: the list file shouldn't contain and special character.
Author: Davi J. Marcon
Email: davijosuemarcon@gmail.com
"""
from Bio import SeqIO
import sys

import argparse
import logging
import os
import sys
from Bio import SeqIO


def setup_logging():
"""Configure logging for the script."""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)


def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description='Filter CDS features from GenBank files based on a gene list.',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python cdsselector.py --input-folder gbk_files --genes-list genes.txt
--output-folder filtered_gbk
python cdsselector.py -i input_gbk -g allowed_genes.txt -o output_gbk
"""
)

parser.add_argument(
'--input-folder', '-i',
required=True,
metavar='DIR',
help='Input folder containing GenBank files'
)

parser.add_argument(
'--genes-list', '-g',
required=True,
metavar='FILE',
help='File containing gene names (one per line)'
)

parser.add_argument(
'--output-folder', '-o',
required=True,
metavar='DIR',
help='Output folder for filtered GenBank files'
)

return parser.parse_args()


def load_gene_list(genes_file):
"""
Load gene names from a file.

Args:
genes_file (str): Path to the file containing gene names

Returns:
list: List of gene names

Raises:
FileNotFoundError: If the gene list file does not exist
IOError: If there is an error reading the file
"""
if not os.path.exists(genes_file):
raise FileNotFoundError(f"Gene list file not found: {genes_file}")

try:
with open(genes_file, 'r') as f:
gene_list = [line.strip() for line in f if line.strip()]
logging.info(f"Loaded {len(gene_list)} genes from {genes_file}")
return gene_list
except IOError as e:
raise IOError(f"Error reading gene list file: {e}")


def validate_input_folder(input_folder):
"""
Validate that the input folder exists and contains GenBank files.

Args:
input_folder (str): Path to the input folder

Returns:
list: List of GenBank files in the folder

Raises:
FileNotFoundError: If the folder does not exist
ValueError: If no GenBank files are found
"""
if not os.path.exists(input_folder):
raise FileNotFoundError(f"Input folder not found: {input_folder}")

if not os.path.isdir(input_folder):
raise ValueError(f"Input path is not a directory: {input_folder}")

files = [f for f in os.listdir(input_folder)
if f.endswith(('.gbk', '.gb', '.genbank'))]

if not files:
raise ValueError(f"No GenBank files found in {input_folder}")

logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}")
return files


def filter_features(seq_record, gene_list):
"""
Filter features from a sequence record based on the gene list.

Args:
seq_record (SeqRecord): BioPython SeqRecord object
gene_list (list): List of allowed gene names

Returns:
SeqRecord: SeqRecord with filtered features
"""
filtered_features = []

for feature in seq_record.features:
# Keep source features with genomic DNA mol_type
if feature.type == "source" and "mol_type" in feature.qualifiers:
if feature.qualifiers["mol_type"][0] == 'genomic DNA':
filtered_features.append(feature)

# Keep CDS and gene features that are in the gene list
elif feature.type in ["CDS", "gene"] and "gene" in feature.qualifiers:
gene_name = feature.qualifiers['gene'][0]
if gene_name in gene_list:
filtered_features.append(feature)

seq_record.features = filtered_features
return seq_record


def process_genbank_files(input_folder, output_folder, gene_list, input_files):
"""
Process all GenBank files in the input folder.

Args:
input_folder (str): Input folder path
output_folder (str): Output folder path
gene_list (list): List of allowed gene names
input_files (list): List of GenBank files to process
"""
processed_count = 0
error_count = 0

for input_file in input_files:
input_path = os.path.join(input_folder, input_file)
output_path = os.path.join(output_folder, input_file)

try:
logging.info(f"Processing: {input_file}")

# Parse and filter each sequence record
filtered_records = []
for seq_record in SeqIO.parse(input_path, 'genbank'):
filtered_record = filter_features(seq_record, gene_list)
filtered_records.append(filtered_record)

# Write filtered records to output file
SeqIO.write(filtered_records, output_path, 'genbank')
processed_count += 1
logging.info(f"Successfully processed: {input_file}")

except Exception as e:
error_count += 1
logging.error(f"Error processing {input_file}: {e}")

msg = f"Processing complete: {processed_count} succeeded, {error_count} failed"
logging.info(msg)


def main():
"""Main function to orchestrate the CDS selection process."""
setup_logging()

try:
# Parse command line arguments
args = parse_arguments()

logging.info("CDS Selector started")
logging.info(f"Input folder: {args.input_folder}")
logging.info(f"Genes list: {args.genes_list}")
logging.info(f"Output folder: {args.output_folder}")

# Load gene list
gene_list = load_gene_list(args.genes_list)

# Validate input folder and get list of files
input_files = validate_input_folder(args.input_folder)

# Create output folder if it doesn't exist
if not os.path.exists(args.output_folder):
os.makedirs(args.output_folder)
logging.info(f"Created output folder: {args.output_folder}")

# Process all GenBank files
process_genbank_files(args.input_folder, args.output_folder,
gene_list, input_files)

logging.info("CDS Selector finished successfully")
return 0

except Exception as e:
logging.error(f"Fatal error: {e}")
return 1


#I/O declaration

input_folder = sys.argv[1]
output_folder = sys.argv[2]
gene_list = [line.strip('\n') for line in open(sys.argv[3])]

#Taking permission for parsing later
for i in os.listdir(input_folder):
open(input_folder+"/"+i)

#Check output folder existance
if not os.path.exists(output_folder):
os.makedirs(output_folder)

#Code
for input_file in os.listdir(input_folder):
FEATURES = []
for seq_record in SeqIO.parse(input_folder+"/"+input_file , 'genbank'):# Parse all files and interact around them
for feature in seq_record.features: # Check each feature and see if it is on gene_list
if feature.type == "CDS" and "gene" in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
if gene in gene_list:
FEATURES.append(feature)
if feature.type == "gene" and "gene" in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
if gene in gene_list:
FEATURES.append(feature)
if feature.type == "source" and "mol_type" in feature.qualifiers:
if feature.qualifiers["mol_type"][0] == 'genomic DNA':
FEATURES.append(feature)# If the feature is on the gene_list append them to the new feature list
seq_record.features = FEATURES
print("Creating input for: "+input_file)
SeqIO.write (seq_record, output_folder+"/"+input_file, 'genbank')
if __name__ == "__main__":
sys.exit(main())
Loading