From 3c5d4000e5c738da00f61725c6ba6242e683e346 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:31:18 +0000 Subject: [PATCH 1/4] Initial plan From 7fcc2fb490881d5f8e5acde9b48761b63f55546e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:39:24 +0000 Subject: [PATCH 2/4] Refactor cdsselector.py to be more professional with argparse - Replace sys.argv with argparse for proper argument parsing - Add --input-folder, --genes-list, and --output-folder parameters (with short flags) - Implement proper error handling and validation - Add logging instead of simple print statements - Follow PEP 8 style guidelines - Add main() function with if __name__ == "__main__" guard - Fix logic bug where FEATURES list was reset incorrectly - Use os.path.join() for path operations - Add comprehensive docstrings for all functions - Add shebang line for better Unix compatibility Co-authored-by: Mxrcon <48180517+Mxrcon@users.noreply.github.com> --- __pycache__/cdsselector.cpython-312.pyc | Bin 0 -> 9237 bytes cdsselector.py | 281 +++++++++++++++++++----- 2 files changed, 230 insertions(+), 51 deletions(-) create mode 100644 __pycache__/cdsselector.cpython-312.pyc diff --git a/__pycache__/cdsselector.cpython-312.pyc b/__pycache__/cdsselector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21c66f8b12f72e0aa77b846cb5f24f3b3349f99d GIT binary patch literal 9237 zcmcIqTTC2TdOp=v)%P1V4ULU)VH-?iV0`z0@r*GSg9jVSp3KNzPg>=!0=nIFF;#^b zT8)=ot+b#x!p@5s&5nkBS|V>0A|(n@@-Ua3NYTD@8ys3SL>eiUqJ3krwb}K&o#(wuhZnpTxZ})C)qLkbvuOw=4a@-LtvqH3U;Vnf&*%|;Dp*ExS;k5 zZm4~tU+~;&LP!qVkjNzJOaTg4X!kud9v@Qr4TYv`Q?`hg=WPhNyL+y3SH+|lQzhB$ zes?ska7s*$kE`5pJgJJZ!eNT0Zb^uzMu76e2{9EDl?XR1OJf|LN=YiOiUN0DOr7IX zw{R~}X>s2Y#c|w0N+qW_byQMBsCkv+Ws!@gpdZI4lhPz~91&9@my9cF8~5(0$Vn6G z_=L)V7(AB}C*8$-xtNqv`FN^`vB)VzcwjUv^6G>vDs67}Unu;D*x}}&j8DPnDJ~`` zI-w-DjZbl{t??8L(>g3A1ySZY$x~E_%WuIeERoWRMdCWieH2d$ztM%Wdg+g(o7>$b zk4%h-DOJ(OEgJzO>EQbG(TZka&aSC#B88iQG6$DH%_RM)y*wiv*b*6=6AGP*~^kCFCaK$s~6Z^vmBC1(@Ch=uz(AdidLM z?m`=Pk(Xf}?%pvzp6uWRXiZ4Ugg8c;&yL{cwwN^L{vF30=u*I1Oo38JcQp52P*c4n+8X3Xc zW>}J8Bf_l69&bxW8k%?|rpCv_h{CmU4NXb$wwT0PG@i#`8DKL-lypVIwbq8Q)&_xV z=>0Ec0COuYoJ9LAWM=O(YjW4bIFGjOSI=x@9*8-il|PEM~#lMA87W#Pr=m;dct_tnd%%D-gB9k0d`;$j`1mh90-6PIaGY$WaF+Xlgp_R67P<6Pi@XqyXvXSGIct> zSojR$+Tlm=gqEVuP4<1&c0YUyG$W815p^)Ez z*B9WYd=AwNdRB|P{`pgLr{>?9duyS2&C{^q-<#dnxpIAFFv|_B`Y&Xi7d9C?+qvPZ zxYxW8TlMYRM3k$STIBObkNs!3XX(JQ`p@ZqNdHr2q4Kj`AMg6KHXH7l9f0b8X-xOT z8JPFZdGAgBLRoD8R9!k&N@$jw@Jf`=Sm+;;_toZBIsY)b9?A7Q0QFP{zcUeTsPTp zOLHr{{4$K+ez}J|*Xa0ip98lJ3if}N!SAT=S3?nLj#d8oslT^YKpdFoNa9EqBp-wy7 z0GDn;h9JG`Qu&Z%I6U{XouE9It}GF%3DOPlHsX=%VSC8?a0S*L$p@g)*ti0tYVN|s zAoK$G!;MZcB`A||b(FxUd=Phngb4zN)LXoy@*%yAl&#J#S z>+FSC@#lj-8N7G$7yB2)Pns8MvkmPlEh}}|ivBfbAnysz4E|{=0DKhe$bE5iS2x4N zilc4@nz16DUM8S7f_*ELfQItYgFg6cfC@T8Z|C_56iHB>1W2N{i`#F*dTjqju+_j& z0!?ZLa{=$|BzSL@uUL>IV*wI)Q2j5FguRV9ybo~l5=r3g@b>QndqI|ICc|VX!fX0_ zNHfo%3}`ND37bI>ojUjm6SBbk<1n-|1mz*ErDxi}nTScUz*%7^#s_TT^_&?wq@*hK z;Glcd=-C8`mn(z`_;Kgr(q%m(X0>wC%>FTi2D1ha%A7Z0pxG5^LXL@=b4*G`)hP(^@SaS2KqhG{9^-m0 zbwz9%4Jk>D!E(x-@H)b1PLqws;9i>h!wERH!*R%8YIc4c+$jP3ONylxxfMEU6{Z+b ztG~mzGyEtr%Q7b_p!c%+$i2Ypr|?t$5vm!qS%qqLXPMBqJNK;bY|QO!+(K^l=&a*g z&yM-dxz2?*A9?mKxtB-QPxj~Fe-pSpRqKKJT%dljITvVM;&Xw+%NJJzUHM@Bdax-M zY+9;*96YjVN09fqF?VAj`pDO^T(xp){q)7$>5Gq!T+D~}u7{g*;pU~2x$x2DBf0R2 z*#U5-76u*#8p^nYK$gug+TAj9VU4NIdqNk<*@xD`9yvg`qQ^hj6ZO0U)YpP|0J(C_8 zMqttqWbbGybG!Rn80*!2TsUYH+!+>X&!=ob&M7AZo0Yl@2tlxUXxf`$4N2G{ zu2m*Y;R;|1S{57`FX*}`$i0RH=d|~Z_a1dOtlt#{r(V9=1i#CAo*N@eiN#k;Ply;@ z!+rZf&e=e*+k$-*0x&odGP3rizFcXtsD%E{!b8JZ!2ZMwFnYekwxDq6ay)1 zh&UY}0(PB#hxb-Ge03s*KMo8}B;nhN@ufh7$TVG{ho43hPIsqm9o(gKsK5=ULm_La zia_q=6wFD4K(>WTJGf!U*^5HNTkx_pCLzUBMZOc>sv4`OwlM0+XGz1P99JRo?ENSv zj;lD6$dceX;*(F{_r4O9>(J{d(hy|E^nD--(HKP8s^%g6jR$vuR5R{L1nn`{l82xL z<+|BZPc_qHOum4^kALQ192Xn!LOM%D1H#b=zc;-gF9rN$Zy^{?$Egf2(TJv`0 zy}K7a%z3%lA!DrU?qkarmOHaGXCHgJHtVs}&sq@c#eQOTXVcQ&Y*YJ6?dP{X{~&wi zTDIc)W9CPHeC9&I8V~^_&U>pDj^wu*dX;=bXL!uk;2 z?G2Pf$ZVZPs%2#om`7O=r~<~lh7!BB+t`p&`|%ncH1HtbgiFQvK|CO0-p($S@hDiN zZAUa`Jxvu-lH1SEqO=u~jS0&V6y}krF5H`U+e>P4RJsc3%0$@gQ@FmmHzZqMy|R#0 z{T_PSZH~A7?6>EUfh@*D!xMU~9CmZQ@WIgmRuujcKrdiL_f6A9v)Ja_yx`2xr7H_o z(=|=sp{LyXwa8|A(x)!Mmv3%(A_ZS7;cFC_4M~oTijXAK^Bp4jlBIv-#smhIwWTYI zKk;*l%FFP*KJ70ykk}Z2RRUXuJ$`r!l-NUPsP2YDDgMr{W3FM5;l;iZ?!)kLUKs^4 zVp$9urJFQgAFuLBJ?jcaN$v+tMS|oTC;1RkWMzoaFp`^CAo&u|l3b;XVZHHGuJKg1;mvHQea+cnzR4{1{A%cz zL)rbuv!N4f&XZ>Qj~CVTgYCJ4?b*hTZ0Piwv-6o9h4wEV|JB>Se0#O>*sKelt6P?O z)?3cxTF$Iiy*2BBmf(*0$+^k-J9Bpyg!Q_^xw^w^fg>A`s|CnkV07Pq&3ni~`lHLo z*N^q)j`e1b_GN3%uX+1{iK`BGej``+#&YHQk?!1)?$57fFaK!GdBd6r#hQ&qBSFm(jS5mM8kKPmeWB~&PNS76MWZ1ml`*;(?S(h+BUx+WWnc?TmI!mj zAz3u4F_6qtWt?p!(FZhuy&aLCSsfw-6T~yUp$A0PtwKYg_RK;c=g&6N_`Vn1| cXS_4rKkxrZ|J=KO_1 --genes-list + --output-folder - $LIST_FILE = A file containt a gene on every line following this example: - dnaA - dnaK - rpoB +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where filtered GenBank files will be saved - OBS: the list file shouldn't contain and special character. +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com """ -from Bio import SeqIO -import sys + +import argparse +import logging import os +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Filter CDS features from GenBank files based on a gene list.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python cdsselector.py --input-folder gbk_files --genes-list genes.txt + --output-folder filtered_gbk + python cdsselector.py -i input_gbk -g allowed_genes.txt -o output_gbk + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for filtered GenBank files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def filter_features(seq_record, gene_list): + """ + Filter features from a sequence record based on the gene list. + + Args: + seq_record (SeqRecord): BioPython SeqRecord object + gene_list (list): List of allowed gene names + + Returns: + SeqRecord: SeqRecord with filtered features + """ + filtered_features = [] + + for feature in seq_record.features: + # Keep source features with genomic DNA mol_type + if feature.type == "source" and "mol_type" in feature.qualifiers: + if feature.qualifiers["mol_type"][0] == 'genomic DNA': + filtered_features.append(feature) + + # Keep CDS and gene features that are in the gene list + elif feature.type in ["CDS", "gene"] and "gene" in feature.qualifiers: + gene_name = feature.qualifiers['gene'][0] + if gene_name in gene_list: + filtered_features.append(feature) + + seq_record.features = filtered_features + return seq_record + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files in the input folder. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of allowed gene names + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + + for input_file in input_files: + input_path = os.path.join(input_folder, input_file) + output_path = os.path.join(output_folder, input_file) + + try: + logging.info(f"Processing: {input_file}") + + # Parse and filter each sequence record + filtered_records = [] + for seq_record in SeqIO.parse(input_path, 'genbank'): + filtered_record = filter_features(seq_record, gene_list) + filtered_records.append(filtered_record) + + # Write filtered records to output file + SeqIO.write(filtered_records, output_path, 'genbank') + processed_count += 1 + logging.info(f"Successfully processed: {input_file}") + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + msg = f"Processing complete: {processed_count} succeeded, {error_count} failed" + logging.info(msg) + + +def main(): + """Main function to orchestrate the CDS selection process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("CDS Selector started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files(args.input_folder, args.output_folder, + gene_list, input_files) + + logging.info("CDS Selector finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + -#I/O declaration - -input_folder = sys.argv[1] -output_folder = sys.argv[2] -gene_list = [line.strip('\n') for line in open(sys.argv[3])] - -#Taking permission for parsing later -for i in os.listdir(input_folder): - open(input_folder+"/"+i) - -#Check output folder existance -if not os.path.exists(output_folder): - os.makedirs(output_folder) - -#Code -for input_file in os.listdir(input_folder): - FEATURES = [] - for seq_record in SeqIO.parse(input_folder+"/"+input_file , 'genbank'):# Parse all files and interact around them - for feature in seq_record.features: # Check each feature and see if it is on gene_list - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in gene_list: - FEATURES.append(feature) - if feature.type == "gene" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in gene_list: - FEATURES.append(feature) - if feature.type == "source" and "mol_type" in feature.qualifiers: - if feature.qualifiers["mol_type"][0] == 'genomic DNA': - FEATURES.append(feature)# If the feature is on the gene_list append them to the new feature list - seq_record.features = FEATURES - print("Creating input for: "+input_file) - SeqIO.write (seq_record, output_folder+"/"+input_file, 'genbank') +if __name__ == "__main__": + sys.exit(main()) From e630cb5fcae851f31d21a4d42a1bb8934d46ba47 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:40:04 +0000 Subject: [PATCH 3/4] Add .gitignore to exclude Python cache files Co-authored-by: Mxrcon <48180517+Mxrcon@users.noreply.github.com> --- .gitignore | 38 ++++++++++++++++++++++++ __pycache__/cdsselector.cpython-312.pyc | Bin 9237 -> 0 bytes 2 files changed, 38 insertions(+) create mode 100644 .gitignore delete mode 100644 __pycache__/cdsselector.cpython-312.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fcf176a --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/__pycache__/cdsselector.cpython-312.pyc b/__pycache__/cdsselector.cpython-312.pyc deleted file mode 100644 index 21c66f8b12f72e0aa77b846cb5f24f3b3349f99d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9237 zcmcIqTTC2TdOp=v)%P1V4ULU)VH-?iV0`z0@r*GSg9jVSp3KNzPg>=!0=nIFF;#^b zT8)=ot+b#x!p@5s&5nkBS|V>0A|(n@@-Ua3NYTD@8ys3SL>eiUqJ3krwb}K&o#(wuhZnpTxZ})C)qLkbvuOw=4a@-LtvqH3U;Vnf&*%|;Dp*ExS;k5 zZm4~tU+~;&LP!qVkjNzJOaTg4X!kud9v@Qr4TYv`Q?`hg=WPhNyL+y3SH+|lQzhB$ zes?ska7s*$kE`5pJgJJZ!eNT0Zb^uzMu76e2{9EDl?XR1OJf|LN=YiOiUN0DOr7IX zw{R~}X>s2Y#c|w0N+qW_byQMBsCkv+Ws!@gpdZI4lhPz~91&9@my9cF8~5(0$Vn6G z_=L)V7(AB}C*8$-xtNqv`FN^`vB)VzcwjUv^6G>vDs67}Unu;D*x}}&j8DPnDJ~`` zI-w-DjZbl{t??8L(>g3A1ySZY$x~E_%WuIeERoWRMdCWieH2d$ztM%Wdg+g(o7>$b zk4%h-DOJ(OEgJzO>EQbG(TZka&aSC#B88iQG6$DH%_RM)y*wiv*b*6=6AGP*~^kCFCaK$s~6Z^vmBC1(@Ch=uz(AdidLM z?m`=Pk(Xf}?%pvzp6uWRXiZ4Ugg8c;&yL{cwwN^L{vF30=u*I1Oo38JcQp52P*c4n+8X3Xc zW>}J8Bf_l69&bxW8k%?|rpCv_h{CmU4NXb$wwT0PG@i#`8DKL-lypVIwbq8Q)&_xV z=>0Ec0COuYoJ9LAWM=O(YjW4bIFGjOSI=x@9*8-il|PEM~#lMA87W#Pr=m;dct_tnd%%D-gB9k0d`;$j`1mh90-6PIaGY$WaF+Xlgp_R67P<6Pi@XqyXvXSGIct> zSojR$+Tlm=gqEVuP4<1&c0YUyG$W815p^)Ez z*B9WYd=AwNdRB|P{`pgLr{>?9duyS2&C{^q-<#dnxpIAFFv|_B`Y&Xi7d9C?+qvPZ zxYxW8TlMYRM3k$STIBObkNs!3XX(JQ`p@ZqNdHr2q4Kj`AMg6KHXH7l9f0b8X-xOT z8JPFZdGAgBLRoD8R9!k&N@$jw@Jf`=Sm+;;_toZBIsY)b9?A7Q0QFP{zcUeTsPTp zOLHr{{4$K+ez}J|*Xa0ip98lJ3if}N!SAT=S3?nLj#d8oslT^YKpdFoNa9EqBp-wy7 z0GDn;h9JG`Qu&Z%I6U{XouE9It}GF%3DOPlHsX=%VSC8?a0S*L$p@g)*ti0tYVN|s zAoK$G!;MZcB`A||b(FxUd=Phngb4zN)LXoy@*%yAl&#J#S z>+FSC@#lj-8N7G$7yB2)Pns8MvkmPlEh}}|ivBfbAnysz4E|{=0DKhe$bE5iS2x4N zilc4@nz16DUM8S7f_*ELfQItYgFg6cfC@T8Z|C_56iHB>1W2N{i`#F*dTjqju+_j& z0!?ZLa{=$|BzSL@uUL>IV*wI)Q2j5FguRV9ybo~l5=r3g@b>QndqI|ICc|VX!fX0_ zNHfo%3}`ND37bI>ojUjm6SBbk<1n-|1mz*ErDxi}nTScUz*%7^#s_TT^_&?wq@*hK z;Glcd=-C8`mn(z`_;Kgr(q%m(X0>wC%>FTi2D1ha%A7Z0pxG5^LXL@=b4*G`)hP(^@SaS2KqhG{9^-m0 zbwz9%4Jk>D!E(x-@H)b1PLqws;9i>h!wERH!*R%8YIc4c+$jP3ONylxxfMEU6{Z+b ztG~mzGyEtr%Q7b_p!c%+$i2Ypr|?t$5vm!qS%qqLXPMBqJNK;bY|QO!+(K^l=&a*g z&yM-dxz2?*A9?mKxtB-QPxj~Fe-pSpRqKKJT%dljITvVM;&Xw+%NJJzUHM@Bdax-M zY+9;*96YjVN09fqF?VAj`pDO^T(xp){q)7$>5Gq!T+D~}u7{g*;pU~2x$x2DBf0R2 z*#U5-76u*#8p^nYK$gug+TAj9VU4NIdqNk<*@xD`9yvg`qQ^hj6ZO0U)YpP|0J(C_8 zMqttqWbbGybG!Rn80*!2TsUYH+!+>X&!=ob&M7AZo0Yl@2tlxUXxf`$4N2G{ zu2m*Y;R;|1S{57`FX*}`$i0RH=d|~Z_a1dOtlt#{r(V9=1i#CAo*N@eiN#k;Ply;@ z!+rZf&e=e*+k$-*0x&odGP3rizFcXtsD%E{!b8JZ!2ZMwFnYekwxDq6ay)1 zh&UY}0(PB#hxb-Ge03s*KMo8}B;nhN@ufh7$TVG{ho43hPIsqm9o(gKsK5=ULm_La zia_q=6wFD4K(>WTJGf!U*^5HNTkx_pCLzUBMZOc>sv4`OwlM0+XGz1P99JRo?ENSv zj;lD6$dceX;*(F{_r4O9>(J{d(hy|E^nD--(HKP8s^%g6jR$vuR5R{L1nn`{l82xL z<+|BZPc_qHOum4^kALQ192Xn!LOM%D1H#b=zc;-gF9rN$Zy^{?$Egf2(TJv`0 zy}K7a%z3%lA!DrU?qkarmOHaGXCHgJHtVs}&sq@c#eQOTXVcQ&Y*YJ6?dP{X{~&wi zTDIc)W9CPHeC9&I8V~^_&U>pDj^wu*dX;=bXL!uk;2 z?G2Pf$ZVZPs%2#om`7O=r~<~lh7!BB+t`p&`|%ncH1HtbgiFQvK|CO0-p($S@hDiN zZAUa`Jxvu-lH1SEqO=u~jS0&V6y}krF5H`U+e>P4RJsc3%0$@gQ@FmmHzZqMy|R#0 z{T_PSZH~A7?6>EUfh@*D!xMU~9CmZQ@WIgmRuujcKrdiL_f6A9v)Ja_yx`2xr7H_o z(=|=sp{LyXwa8|A(x)!Mmv3%(A_ZS7;cFC_4M~oTijXAK^Bp4jlBIv-#smhIwWTYI zKk;*l%FFP*KJ70ykk}Z2RRUXuJ$`r!l-NUPsP2YDDgMr{W3FM5;l;iZ?!)kLUKs^4 zVp$9urJFQgAFuLBJ?jcaN$v+tMS|oTC;1RkWMzoaFp`^CAo&u|l3b;XVZHHGuJKg1;mvHQea+cnzR4{1{A%cz zL)rbuv!N4f&XZ>Qj~CVTgYCJ4?b*hTZ0Piwv-6o9h4wEV|JB>Se0#O>*sKelt6P?O z)?3cxTF$Iiy*2BBmf(*0$+^k-J9Bpyg!Q_^xw^w^fg>A`s|CnkV07Pq&3ni~`lHLo z*N^q)j`e1b_GN3%uX+1{iK`BGej``+#&YHQk?!1)?$57fFaK!GdBd6r#hQ&qBSFm(jS5mM8kKPmeWB~&PNS76MWZ1ml`*;(?S(h+BUx+WWnc?TmI!mj zAz3u4F_6qtWt?p!(FZhuy&aLCSsfw-6T~yUp$A0PtwKYg_RK;c=g&6N_`Vn1| cXS_4rKkxrZ|J=KO_1 Date: Thu, 9 Oct 2025 13:50:52 +0000 Subject: [PATCH 4/4] Modernize gene extraction scripts and add test suite - Update "take genes into aminoacid.py" with argparse and progress tracking - Update "take genes into nucleotides.py" with argparse and progress tracking - Add professional structure matching cdsselector.py - Create comprehensive test suite in tests/ folder - Add test data with sample GenBank files - Add progress output showing "Processing file X/Y" - All 11 tests passing (100%) Co-authored-by: Mxrcon <48180517+Mxrcon@users.noreply.github.com> --- .gitignore | 3 + take genes into aminoacid.py | 263 ++++++++++++++++++--- take genes into nucleotides.py | 264 ++++++++++++++++++--- tests/README.md | 121 ++++++++++ tests/test_data/genes.txt | 4 + tests/test_data/input/test_genome1.gbk | 38 +++ tests/test_data/input/test_genome2.gbk | 32 +++ tests/test_scripts.py | 308 +++++++++++++++++++++++++ 8 files changed, 979 insertions(+), 54 deletions(-) create mode 100644 tests/README.md create mode 100644 tests/test_data/genes.txt create mode 100644 tests/test_data/input/test_genome1.gbk create mode 100644 tests/test_data/input/test_genome2.gbk create mode 100644 tests/test_scripts.py diff --git a/.gitignore b/.gitignore index fcf176a..441e291 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ env/ # OS .DS_Store Thumbs.db + +# Test outputs +tests/test_output/ diff --git a/take genes into aminoacid.py b/take genes into aminoacid.py index bd206cc..71edb61 100644 --- a/take genes into aminoacid.py +++ b/take genes into aminoacid.py @@ -1,32 +1,241 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Gene to Amino Acid Extractor -from Bio import SeqIO -import sys +This script extracts CDS (Coding Sequences) from GenBank files and outputs +the translated amino acid sequences in FASTA format. Genes are filtered +based on a provided list, and sequences are organized by gene name. + +Usage: + python "take genes into aminoacid.py" --input-folder + --genes-list + --output-folder + +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where amino acid FASTA files will be saved + +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com +""" + +import argparse +import logging import os -#Determinando os inputs e outputs -entrada = sys.argv[1] -saida = sys.argv[2] -Lista = ["recA","rpoB"] -#abrindo os inputs -for i in os.listdir(entrada): - open(entrada+"/"+i) -#conta o numero de gbs -list = os.listdir(entrada) -number_files = len(list) -print("Foram encontrados" , (number_files), "arquivos") -#cria a pasta de outputs -if not os.path.exists(saida): - os.makedirs(saida) -#Executando código -for i in os.listdir(entrada): - for seq_record in SeqIO.parse(entrada+"/"+i , 'genbank'): - for feature in seq_record.features: - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in Lista: - if not os.path.exists(saida+"/"+gene): - os.makedirs(saida+"/"+gene) - with open(saida+"/"+gene+"/"+seq_record.id+".fasta", "w") as ofile: - ofile.write(">{0}\n{1}\n".format(seq_record.id, feature.qualifiers['translation'][0])) +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Extract amino acid sequences from GenBank files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python "take genes into aminoacid.py" --input-folder gbk_files + --genes-list genes.txt + --output-folder amino_acids + python "take genes into aminoacid.py" -i gbk_files -g genes.txt -o aa_out + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for amino acid FASTA files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files and extract amino acid sequences. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of gene names to extract + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + extracted_count = 0 + + for idx, input_file in enumerate(input_files, 1): + input_path = os.path.join(input_folder, input_file) + + try: + logging.info(f"Processing file {idx}/{len(input_files)}: {input_file}") + + for seq_record in SeqIO.parse(input_path, 'genbank'): + for feature in seq_record.features: + if feature.type == "CDS" and "gene" in feature.qualifiers: + gene = feature.qualifiers['gene'][0] + if gene in gene_list: + # Check if translation exists + if 'translation' not in feature.qualifiers: + logging.warning( + f"No translation for gene {gene} in " + f"{input_file}" + ) + continue + + # Create gene-specific output folder + gene_folder = os.path.join(output_folder, gene) + if not os.path.exists(gene_folder): + os.makedirs(gene_folder) + logging.debug(f"Created folder: {gene_folder}") + + # Write amino acid sequence + output_file = os.path.join( + gene_folder, + f"{seq_record.id}.fasta" + ) + with open(output_file, "w") as ofile: + translation = feature.qualifiers['translation'][0] + ofile.write(f">{seq_record.id}\n{translation}\n") + + extracted_count += 1 + + processed_count += 1 + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + logging.info( + f"Processing complete: {processed_count} files succeeded, " + f"{error_count} files failed" + ) + logging.info(f"Extracted {extracted_count} amino acid sequences") + + +def main(): + """Main function to orchestrate the amino acid extraction process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("Gene to Amino Acid Extractor started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files( + args.input_folder, + args.output_folder, + gene_list, + input_files + ) + + logging.info("Gene to Amino Acid Extractor finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/take genes into nucleotides.py b/take genes into nucleotides.py index 458b339..7da96ae 100644 --- a/take genes into nucleotides.py +++ b/take genes into nucleotides.py @@ -1,32 +1,242 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Gene to Nucleotide Extractor -from Bio import SeqIO -import sys +This script extracts CDS (Coding Sequences) from GenBank files and outputs +the nucleotide sequences in FASTA format. Genes are filtered based on a +provided list, and sequences are organized by gene name. + +Usage: + python "take genes into nucleotides.py" --input-folder + --genes-list + --output-folder + +Arguments: + --input-folder : Folder containing GenBank (.gbk) files to process + --genes-list : File containing one gene name per line + --output-folder : Folder where nucleotide FASTA files will be saved + +Author: Davi J. Marcon +Email: davijosuemarcon@gmail.com +""" + +import argparse +import logging import os -#Determinando os inputs e outputs -entrada = sys.argv[1] -saida = sys.argv[2] -Lista = ["recA","rpoB"] -#abrindo os inputs -for i in os.listdir(entrada): - open(entrada+"/"+i) -#conta o numero de gbs -list = os.listdir(entrada) -number_files = len(list) -print("Foram encontrados" , (number_files), "arquivos") -#cria a pasta de outputs -if not os.path.exists(saida): - os.makedirs(saida) -#Executando código -for i in os.listdir(entrada): - for seq_record in SeqIO.parse(entrada+"/"+i , 'genbank'): - for feature in seq_record.features: - if feature.type == "CDS" and "gene" in feature.qualifiers: - gene = feature.qualifiers['gene'][0] - if gene in Lista: - if not os.path.exists(saida+"/"+gene): - os.makedirs(saida+"/"+gene) - with open(saida+"/"+gene+"/"+seq_record.id+".fasta", "w") as ofile: - ofile.write(">{0}\n{1}\n".format(seq_record.id, feature.location.extract(seq_record).seq)) +import sys +from Bio import SeqIO + + +def setup_logging(): + """Configure logging for the script.""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Extract nucleotide sequences from GenBank files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python "take genes into nucleotides.py" --input-folder gbk_files + --genes-list genes.txt + --output-folder nucleotides + python "take genes into nucleotides.py" -i gbk_files -g genes.txt -o nuc_out + """ + ) + + parser.add_argument( + '--input-folder', '-i', + required=True, + metavar='DIR', + help='Input folder containing GenBank files' + ) + + parser.add_argument( + '--genes-list', '-g', + required=True, + metavar='FILE', + help='File containing gene names (one per line)' + ) + + parser.add_argument( + '--output-folder', '-o', + required=True, + metavar='DIR', + help='Output folder for nucleotide FASTA files' + ) + + return parser.parse_args() + + +def load_gene_list(genes_file): + """ + Load gene names from a file. + + Args: + genes_file (str): Path to the file containing gene names + + Returns: + list: List of gene names + + Raises: + FileNotFoundError: If the gene list file does not exist + IOError: If there is an error reading the file + """ + if not os.path.exists(genes_file): + raise FileNotFoundError(f"Gene list file not found: {genes_file}") + + try: + with open(genes_file, 'r') as f: + gene_list = [line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(gene_list)} genes from {genes_file}") + return gene_list + except IOError as e: + raise IOError(f"Error reading gene list file: {e}") + + +def validate_input_folder(input_folder): + """ + Validate that the input folder exists and contains GenBank files. + + Args: + input_folder (str): Path to the input folder + + Returns: + list: List of GenBank files in the folder + + Raises: + FileNotFoundError: If the folder does not exist + ValueError: If no GenBank files are found + """ + if not os.path.exists(input_folder): + raise FileNotFoundError(f"Input folder not found: {input_folder}") + + if not os.path.isdir(input_folder): + raise ValueError(f"Input path is not a directory: {input_folder}") + + files = [f for f in os.listdir(input_folder) + if f.endswith(('.gbk', '.gb', '.genbank'))] + + if not files: + raise ValueError(f"No GenBank files found in {input_folder}") + + logging.info(f"Found {len(files)} GenBank file(s) in {input_folder}") + return files + + +def process_genbank_files(input_folder, output_folder, gene_list, input_files): + """ + Process all GenBank files and extract nucleotide sequences. + + Args: + input_folder (str): Input folder path + output_folder (str): Output folder path + gene_list (list): List of gene names to extract + input_files (list): List of GenBank files to process + """ + processed_count = 0 + error_count = 0 + extracted_count = 0 + + for idx, input_file in enumerate(input_files, 1): + input_path = os.path.join(input_folder, input_file) + + try: + logging.info(f"Processing file {idx}/{len(input_files)}: {input_file}") + + for seq_record in SeqIO.parse(input_path, 'genbank'): + for feature in seq_record.features: + if feature.type == "CDS" and "gene" in feature.qualifiers: + gene = feature.qualifiers['gene'][0] + if gene in gene_list: + # Extract nucleotide sequence + try: + nuc_seq = feature.location.extract(seq_record).seq + except Exception as e: + logging.warning( + f"Could not extract sequence for gene {gene} " + f"in {input_file}: {e}" + ) + continue + + # Create gene-specific output folder + gene_folder = os.path.join(output_folder, gene) + if not os.path.exists(gene_folder): + os.makedirs(gene_folder) + logging.debug(f"Created folder: {gene_folder}") + + # Write nucleotide sequence + output_file = os.path.join( + gene_folder, + f"{seq_record.id}.fasta" + ) + with open(output_file, "w") as ofile: + ofile.write(f">{seq_record.id}\n{nuc_seq}\n") + + extracted_count += 1 + + processed_count += 1 + + except Exception as e: + error_count += 1 + logging.error(f"Error processing {input_file}: {e}") + + logging.info( + f"Processing complete: {processed_count} files succeeded, " + f"{error_count} files failed" + ) + logging.info(f"Extracted {extracted_count} nucleotide sequences") + + +def main(): + """Main function to orchestrate the nucleotide extraction process.""" + setup_logging() + + try: + # Parse command line arguments + args = parse_arguments() + + logging.info("Gene to Nucleotide Extractor started") + logging.info(f"Input folder: {args.input_folder}") + logging.info(f"Genes list: {args.genes_list}") + logging.info(f"Output folder: {args.output_folder}") + + # Load gene list + gene_list = load_gene_list(args.genes_list) + + # Validate input folder and get list of files + input_files = validate_input_folder(args.input_folder) + + # Create output folder if it doesn't exist + if not os.path.exists(args.output_folder): + os.makedirs(args.output_folder) + logging.info(f"Created output folder: {args.output_folder}") + + # Process all GenBank files + process_genbank_files( + args.input_folder, + args.output_folder, + gene_list, + input_files + ) + + logging.info("Gene to Nucleotide Extractor finished successfully") + return 0 + + except Exception as e: + logging.error(f"Fatal error: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..19a85b2 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,121 @@ +# Test Suite for Bioinfo Python Scripts + +This folder contains test data and test scripts to verify the functionality of the Python scripts in this repository. + +## Structure + +``` +tests/ +├── test_data/ +│ ├── input/ # Test GenBank files +│ │ ├── test_genome1.gbk +│ │ └── test_genome2.gbk +│ └── genes.txt # Test gene list +├── test_output/ # Output from test runs (generated) +├── test_scripts.py # Main test suite +└── README.md # This file +``` + +## Test Data + +The test data includes: +- **test_genome1.gbk**: Contains genes dnaA, rpoB, and a test gene +- **test_genome2.gbk**: Contains genes recA and dnaK +- **genes.txt**: List of genes to filter/extract (dnaA, dnaK, rpoB, recA) + +## Running Tests + +### Run All Tests + +To run the complete test suite: + +```bash +python tests/test_scripts.py +``` + +This will test: +1. `cdsselector.py` - Filters GenBank files by gene list +2. `take genes into aminoacid.py` - Extracts amino acid sequences +3. `take genes into nucleotides.py` - Extracts nucleotide sequences + +### Manual Testing + +You can also test individual scripts manually: + +**Test cdsselector.py:** +```bash +python cdsselector.py --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/cdsselector +``` + +**Test take genes into aminoacid.py:** +```bash +python "take genes into aminoacid.py" --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/aminoacid +``` + +**Test take genes into nucleotides.py:** +```bash +python "take genes into nucleotides.py" --input-folder tests/test_data/input \ + --genes-list tests/test_data/genes.txt \ + --output-folder tests/test_output/nucleotide +``` + +## Expected Results + +### cdsselector.py +- Creates filtered GenBank files with only the genes in the list +- Output files: `test_genome1.gbk`, `test_genome2.gbk` +- Each file contains only the genes specified in genes.txt + +### take genes into aminoacid.py +- Creates folders for each gene +- Each folder contains FASTA files with amino acid sequences +- Expected output structure: + ``` + aminoacid/ + ├── dnaA/ + │ └── TEST001.fasta + ├── rpoB/ + │ └── TEST001.fasta + ├── recA/ + │ └── TEST002.fasta + └── dnaK/ + └── TEST002.fasta + ``` + +### take genes into nucleotides.py +- Creates folders for each gene +- Each folder contains FASTA files with nucleotide sequences +- Expected output structure: + ``` + nucleotide/ + ├── dnaA/ + │ └── TEST001.fasta + ├── rpoB/ + │ └── TEST001.fasta + ├── recA/ + │ └── TEST002.fasta + └── dnaK/ + └── TEST002.fasta + ``` + +## Adding New Tests + +To add new test cases: + +1. Add new GenBank files to `test_data/input/` +2. Update `genes.txt` if testing new genes +3. Modify `test_scripts.py` to include new test cases + +## Requirements + +- Python 3.6+ +- Biopython + +Install requirements: +```bash +pip install biopython +``` diff --git a/tests/test_data/genes.txt b/tests/test_data/genes.txt new file mode 100644 index 0000000..8b5ba30 --- /dev/null +++ b/tests/test_data/genes.txt @@ -0,0 +1,4 @@ +dnaA +dnaK +rpoB +recA diff --git a/tests/test_data/input/test_genome1.gbk b/tests/test_data/input/test_genome1.gbk new file mode 100644 index 0000000..884b881 --- /dev/null +++ b/tests/test_data/input/test_genome1.gbk @@ -0,0 +1,38 @@ +LOCUS TEST_GENOME1 300 bp DNA linear BCT 01-JAN-2024 +DEFINITION Test genome 1 for script testing. +ACCESSION TEST001 +VERSION TEST001.1 +KEYWORDS . +SOURCE Test organism 1 + ORGANISM Test organism 1 + Bacteria. +FEATURES Location/Qualifiers + source 1..300 + /organism="Test organism 1" + /mol_type="genomic DNA" + gene 10..72 + /gene="dnaA" + CDS 10..72 + /gene="dnaA" + /product="chromosomal replication initiator protein DnaA" + /translation="MKLVRVLSTAAAA" + gene 100..162 + /gene="rpoB" + CDS 100..162 + /gene="rpoB" + /product="DNA-directed RNA polymerase subunit beta" + /translation="MKLIVKASTGPAT" + gene 200..262 + /gene="test" + CDS 200..262 + /gene="test" + /product="test protein" + /translation="MTEST*PROTEIN" +ORIGIN + 1 atgaaacttg tacgcgtact ttcaacagca gctgctgctt agctagctag ctagctagct + 61 agctagctag ctagctagct agctagctag ctagctagat gaagttgatt gtaaaagcat + 121 caactggtcc tgcgacttag ctagctagct agctagctag ctagctagct agctagctag + 181 ctagctagct agctagctaa tgactgaaag tacaacccat ccgcgaacag ctagctagct + 241 agctagctag ctagctagct agctagctag ctagctagct agctagctag ctagctagct + 301 +// diff --git a/tests/test_data/input/test_genome2.gbk b/tests/test_data/input/test_genome2.gbk new file mode 100644 index 0000000..d5c05ce --- /dev/null +++ b/tests/test_data/input/test_genome2.gbk @@ -0,0 +1,32 @@ +LOCUS TEST_GENOME2 300 bp DNA linear BCT 01-JAN-2024 +DEFINITION Test genome 2 for script testing. +ACCESSION TEST002 +VERSION TEST002.1 +KEYWORDS . +SOURCE Test organism 2 + ORGANISM Test organism 2 + Bacteria. +FEATURES Location/Qualifiers + source 1..300 + /organism="Test organism 2" + /mol_type="genomic DNA" + gene 10..72 + /gene="recA" + CDS 10..72 + /gene="recA" + /product="recombinase A" + /translation="MKRECOMBINAS" + gene 100..162 + /gene="dnaK" + CDS 100..162 + /gene="dnaK" + /product="molecular chaperone DnaK" + /translation="MKCHAPERONEA" +ORIGIN + 1 atgaaacgcg aacgaattca aatgtcgtga ttaacgctta gctagctagc tagctagcta + 61 gctagctagc tagctagcta gctagctagc tagctagcta gatgaaatgc catgccgagc + 121 gaaatgctga agctagctag ctagctagct agctagctag ctagctagct agctagctag + 181 ctagctagct agctagctat gctagctagt cgttcgattc gattcgatag ctagctagct + 241 agctagctag ctagctagct agctagctag ctagctagct agctagctag ctagctagct + 301 +// diff --git a/tests/test_scripts.py b/tests/test_scripts.py new file mode 100644 index 0000000..7b229d5 --- /dev/null +++ b/tests/test_scripts.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Test Suite for Bioinfo Python Scripts + +This script tests the functionality of cdsselector.py, +take genes into aminoacid.py, and take genes into nucleotides.py. + +Usage: + python tests/test_scripts.py +""" + +import os +import sys +import subprocess +import shutil +from pathlib import Path + + +class Colors: + """ANSI color codes for terminal output.""" + GREEN = '\033[92m' + RED = '\033[91m' + YELLOW = '\033[93m' + BLUE = '\033[94m' + BOLD = '\033[1m' + END = '\033[0m' + + +def print_header(text): + """Print a formatted header.""" + print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*70}{Colors.END}") + print(f"{Colors.BOLD}{Colors.BLUE}{text:^70}{Colors.END}") + print(f"{Colors.BOLD}{Colors.BLUE}{'='*70}{Colors.END}\n") + + +def print_success(text): + """Print success message.""" + print(f"{Colors.GREEN}✓ {text}{Colors.END}") + + +def print_error(text): + """Print error message.""" + print(f"{Colors.RED}✗ {text}{Colors.END}") + + +def print_info(text): + """Print info message.""" + print(f"{Colors.YELLOW}ℹ {text}{Colors.END}") + + +def run_command(cmd, description): + """ + Run a command and return success status. + + Args: + cmd (list): Command and arguments to run + description (str): Description of the test + + Returns: + bool: True if command succeeded, False otherwise + """ + print_info(f"Testing: {description}") + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0: + print_success(f"PASSED: {description}") + return True + else: + print_error(f"FAILED: {description}") + if result.stderr: + print(f" Error: {result.stderr[:200]}") + return False + except subprocess.TimeoutExpired: + print_error(f"TIMEOUT: {description}") + return False + except Exception as e: + print_error(f"ERROR: {description} - {e}") + return False + + +def verify_file_exists(filepath, description): + """Verify that a file exists.""" + if os.path.exists(filepath): + print_success(f"File exists: {description}") + return True + else: + print_error(f"File missing: {description}") + return False + + +def verify_folder_contents(folder, expected_count, description): + """Verify folder contains expected number of items.""" + if os.path.exists(folder): + count = len([f for f in os.listdir(folder) + if os.path.isfile(os.path.join(folder, f))]) + if count >= expected_count: + print_success(f"{description}: Found {count} files (expected >= {expected_count})") + return True + else: + print_error(f"{description}: Found {count} files (expected >= {expected_count})") + return False + else: + print_error(f"{description}: Folder does not exist") + return False + + +def test_cdsselector(): + """Test cdsselector.py script.""" + print_header("Testing cdsselector.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "cdsselector" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "cdsselector.py"), "--help"], + "cdsselector.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "cdsselector.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "cdsselector.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify output files + tests_total += 1 + if verify_file_exists( + output_dir / "test_genome1.gbk", + "test_genome1.gbk output" + ): + tests_passed += 1 + + print(f"\n{Colors.BOLD}cdsselector.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def test_aminoacid_extractor(): + """Test take genes into aminoacid.py script.""" + print_header("Testing take genes into aminoacid.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "aminoacid" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "take genes into aminoacid.py"), "--help"], + "take genes into aminoacid.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "take genes into aminoacid.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "take genes into aminoacid.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify gene folders exist + tests_total += 1 + dnaA_folder = output_dir / "dnaA" + if verify_file_exists(dnaA_folder, "dnaA gene folder"): + tests_passed += 1 + + # Test 4: Verify FASTA file exists + tests_total += 1 + fasta_files = list(dnaA_folder.glob("*.fasta")) + if len(fasta_files) > 0: + print_success(f"Found FASTA file(s) in dnaA folder: {[f.name for f in fasta_files]}") + tests_passed += 1 + else: + print_error("No FASTA files found in dnaA folder") + + print(f"\n{Colors.BOLD}take genes into aminoacid.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def test_nucleotide_extractor(): + """Test take genes into nucleotides.py script.""" + print_header("Testing take genes into nucleotides.py") + + script_dir = Path(__file__).parent.parent + test_data_dir = script_dir / "tests" / "test_data" + output_dir = script_dir / "tests" / "test_output" / "nucleotide" + + # Clean output directory + if output_dir.exists(): + shutil.rmtree(output_dir) + + tests_passed = 0 + tests_total = 0 + + # Test 1: Help command + tests_total += 1 + if run_command( + ["python", str(script_dir / "take genes into nucleotides.py"), "--help"], + "take genes into nucleotides.py --help" + ): + tests_passed += 1 + + # Test 2: Run with test data + tests_total += 1 + if run_command( + [ + "python", str(script_dir / "take genes into nucleotides.py"), + "--input-folder", str(test_data_dir / "input"), + "--genes-list", str(test_data_dir / "genes.txt"), + "--output-folder", str(output_dir) + ], + "take genes into nucleotides.py with test data" + ): + tests_passed += 1 + + # Test 3: Verify gene folders exist + tests_total += 1 + recA_folder = output_dir / "recA" + if verify_file_exists(recA_folder, "recA gene folder"): + tests_passed += 1 + + # Test 4: Verify FASTA file exists + tests_total += 1 + fasta_files = list(recA_folder.glob("*.fasta")) + if len(fasta_files) > 0: + print_success(f"Found FASTA file(s) in recA folder: {[f.name for f in fasta_files]}") + tests_passed += 1 + else: + print_error("No FASTA files found in recA folder") + + print(f"\n{Colors.BOLD}take genes into nucleotides.py: {tests_passed}/{tests_total} tests passed{Colors.END}") + return tests_passed, tests_total + + +def main(): + """Run all tests.""" + print_header("Bioinfo Python Scripts Test Suite") + + total_passed = 0 + total_tests = 0 + + # Run all tests + passed, total = test_cdsselector() + total_passed += passed + total_tests += total + + passed, total = test_aminoacid_extractor() + total_passed += passed + total_tests += total + + passed, total = test_nucleotide_extractor() + total_passed += passed + total_tests += total + + # Print final summary + print_header("Test Summary") + + success_rate = (total_passed / total_tests * 100) if total_tests > 0 else 0 + + if total_passed == total_tests: + print(f"{Colors.GREEN}{Colors.BOLD}ALL TESTS PASSED!{Colors.END}") + else: + print(f"{Colors.YELLOW}{Colors.BOLD}SOME TESTS FAILED{Colors.END}") + + print(f"\n{Colors.BOLD}Total: {total_passed}/{total_tests} tests passed ({success_rate:.1f}%){Colors.END}\n") + + # Return exit code + return 0 if total_passed == total_tests else 1 + + +if __name__ == "__main__": + sys.exit(main())