In [4]:
import pandas as pd
import subprocess
import os
import sys
import shutil
from Bio import SeqIO

# --- Configuration ---
# Input table; step 1 reads its 'Entry' (sequence ID) and 'Sequence' columns.
INPUT_CSV = 'thermoproteota_data.csv'
# Directory handed to fast_part.py via --output_dir. Step 2 deletes it before
# every run, so do not point this at a directory holding anything else.
OUTPUT_DIR = 'fastpart_results'
# Destination files for the reconstructed train / test splits (written in step 4).
TRAIN_OUTPUT_CSV = 'thermoproteota_train.csv'
TEST_OUTPUT_CSV = 'thermoproteota_test.csv'

# Parameters (Must match what you want to run)
# Forwarded unchanged to fast_part.py as --identity_threshold.
IDENTITY_THRESHOLD = 0.3
# Forwarded unchanged to fast_part.py as --train_ratio.
TRAIN_RATIO = 0.8
# Forwarded unchanged to fast_part.py as --method. Note this is the tool's own
# option value ('mmseq'), not the name of the binary the driver looks up on
# PATH ('mmseqs').
METHOD = 'mmseq'

def step_1_prepare_input(csv_path, fasta_path):
    """Convert the input CSV into the FASTA file that is fed to fast_part.py.

    Rows lacking an ``Entry`` or a ``Sequence`` are skipped, then rows are
    de-duplicated on ``Entry`` (first occurrence wins). Each remaining row is
    written as a two-line FASTA record whose header is ``>{id}|global``.
    The ID is the stripped ``Entry`` with any ``|`` replaced by ``_`` so the
    pipe stays an unambiguous separator between ID and label.

    Args:
        csv_path: Path of the CSV to read; must contain the columns
            ``Entry`` and ``Sequence``.
        fasta_path: Path of the FASTA file to create (overwritten if present).

    Returns:
        The filtered, de-duplicated DataFrame, i.e. exactly the rows that were
        written to ``fasta_path``.

    Raises:
        SystemExit: With status 1 if the CSV cannot be read or a required
            column is absent.
    """
    print(f"[1/4] Reading and preparing {csv_path}...")
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error reading CSV: {e}")
        sys.exit(1)

    # Fail with the same style of message as a read error instead of letting
    # a bare KeyError escape further down.
    missing_cols = [col for col in ('Entry', 'Sequence') if col not in df.columns]
    if missing_cols:
        print(f"Error reading CSV: missing required column(s): {', '.join(missing_cols)}")
        sys.exit(1)

    # A missing cell would otherwise be stringified to the literal text "nan"
    # and end up in the FASTA as a bogus ID or a bogus three-residue sequence.
    # Filter before de-duplicating so a valid later duplicate is not lost to
    # an incomplete first occurrence.
    df_valid = df.dropna(subset=['Entry', 'Sequence'])
    dropped = len(df) - len(df_valid)
    if dropped:
        print(f"      Skipping {dropped} row(s) with a missing Entry or Sequence")

    # Deduplicate to ensure clean processing
    df_clean = df_valid.drop_duplicates(subset=['Entry']).copy()
    print(f"      Unique sequences to process: {len(df_clean)}")

    # Every record gets the same label; presumably fast_part.py partitions
    # per label, so a single label means one global split -- TODO confirm.
    label = "global"
    with open(fasta_path, 'w') as f_out:
        # zip over the two columns avoids building a Series per row, which is
        # what makes iterrows() slow on large tables.
        for entry, sequence in zip(df_clean['Entry'], df_clean['Sequence']):
            seq_id = str(entry).strip().replace('|', '_')
            f_out.write(f">{seq_id}|{label}\n{str(sequence).strip()}\n")

    return df_clean

def step_2_run_buggy_tool(fasta_path, out_dir, script_path="fast_part.py"):
    """Run fast_part.py on the prepared FASTA, starting from a clean output dir.

    The tool is launched with the interpreter that is running this code, so
    it sees the same installed packages. Method, identity threshold and train
    ratio come from the module-level configuration constants.

    Args:
        fasta_path: FASTA file passed to the tool as ``--fasta_file``.
        out_dir: Directory passed as ``--output_dir``. If it already exists it
            is deleted recursively before the tool starts.
        script_path: Location of the tool script. Defaults to ``fast_part.py``
            in the current working directory.

    Raises:
        SystemExit: With status 1 if the script is missing, cannot be
            launched, or returns a non-zero exit status.
    """
    print(f"\n[2/4] Running fast_part.py (Output Train.fasta will break, this is expected)...")

    # Check before wiping out_dir: a missing script must not cost the user
    # the results of a previous successful run.
    if not os.path.isfile(script_path):
        print(f"Error running fast_part.py: script not found at {script_path}")
        sys.exit(1)

    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    cmd = [
        # sys.executable pins the subprocess to the current environment; a
        # bare "python" may resolve to a different interpreter on PATH.
        sys.executable or "python", script_path,
        "--fasta_file", fasta_path,
        "--output_dir", out_dir,
        "--method", METHOD,
        "--identity_threshold", str(IDENTITY_THRESHOLD),
        "--train_ratio", str(TRAIN_RATIO)
    ]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers the interpreter itself failing to launch.
        print("Error running fast_part.py")
        sys.exit(1)

def step_3_reconstruct_data(original_df, out_dir):
    """Rebuild the train / test split from the tool's Test.fasta and summary.

    The training set is derived rather than read: every entry of
    ``original_df`` that is neither listed in ``Test.fasta`` nor under the
    ``Removed sequence IDs:`` heading of ``summary.txt`` is treated as
    training data.

    Entry values are compared in the same normalised form used for FASTA
    headers (stringified, stripped, ``|`` replaced by ``_``), so entries whose
    raw value differs from their FASTA ID are still matched.

    Args:
        original_df: DataFrame with an ``Entry`` column holding the sequence
            IDs that were submitted to the tool.
        out_dir: Output directory of the tool run.

    Returns:
        A ``(train_df, test_df)`` tuple of row subsets of ``original_df``.

    Raises:
        SystemExit: With status 1 if ``Test.fasta`` is missing from
            ``out_dir``.
    """
    print(f"\n[3/4] Reconstructing lost Training Data...")

    # 1. Get Test IDs (These are safe)
    test_fasta = os.path.join(out_dir, "Test.fasta")
    if not os.path.exists(test_fasta):
        print("CRITICAL: Test.fasta not found. The tool failed completely.")
        sys.exit(1)

    # Headers look like "<id>|<label>"; only the ID part is needed.
    test_ids = {record.id.split('|')[0] for record in SeqIO.parse(test_fasta, "fasta")}

    # 2. Get Removed IDs (From Summary Log)
    summary_file = os.path.join(out_dir, "summary.txt")
    removed_ids = set()
    if os.path.exists(summary_file):
        with open(summary_file, 'r') as f:
            is_reading_ids = False
            for line in f:
                line = line.strip()
                if line == "Removed sequence IDs:":
                    is_reading_ids = True
                    continue
                if is_reading_ids and line:
                    # IDs in summary are just the ID (e.g., A0A510E346);
                    # dropping any "|label" suffix is a no-op for those and
                    # keeps the match working if the label is ever included.
                    removed_ids.add(line.split('|')[0])
    else:
        # Without the summary nothing can be excluded, so removed sequences
        # would end up in the training set unnoticed.
        print(f"WARNING: {summary_file} not found. Removed sequences cannot be excluded from Train.")

    print(f"      Test Set Size:     {len(test_ids)}")
    print(f"      Removed Sequences: {len(removed_ids)}")

    # 3. Calculate Train IDs
    # Compare in the normalised form that was written to the FASTA headers,
    # not the raw column values.
    entry_keys = (
        original_df['Entry']
        .astype(str)
        .str.strip()
        .str.replace('|', '_', regex=False)
    )
    all_ids = set(entry_keys)

    # The set difference below ignores IDs it does not know, so surface them
    # instead of letting a parsing or naming mismatch pass silently.
    unmatched_ids = (test_ids | removed_ids) - all_ids
    if unmatched_ids:
        print(f"WARNING: {len(unmatched_ids)} Test/Removed ID(s) do not match any input Entry.")

    # Train = All - Test - Removed
    train_ids = all_ids - test_ids - removed_ids

    print(f"      Recovered Train Size: {len(train_ids)}")

    if len(train_ids) < 100:
        print("WARNING: Recovered Train set is still very small. Check threshold settings.")

    # 4. Create DataFrames
    # Select through the normalised keys so the row masks agree with the ID
    # sets regardless of the column's dtype or stray whitespace.
    test_df = original_df[entry_keys.isin(test_ids)]
    train_df = original_df[entry_keys.isin(train_ids)]

    return train_df, test_df

def step_4_save_outputs(train_df, test_df):
    """Write the train and test DataFrames to their configured CSV paths.

    Both files are written without the index column; a confirmation line with
    the row count is printed for each once both writes have completed.

    Args:
        train_df: Rows to store at ``TRAIN_OUTPUT_CSV``.
        test_df: Rows to store at ``TEST_OUTPUT_CSV``.
    """
    print("\n[4/4] Saving final corrected CSVs...")

    targets = (
        (TRAIN_OUTPUT_CSV, train_df),
        (TEST_OUTPUT_CSV, test_df),
    )

    # Write everything first, report afterwards, so no success line appears
    # unless both files were saved.
    for destination, frame in targets:
        frame.to_csv(destination, index=False)

    for destination, frame in targets:
        print(f"      Success! Saved {destination} ({len(frame)} rows)")

if __name__ == "__main__":
    # Pipeline driver: verify prerequisites, then run the four steps in order.
    if shutil.which("mmseqs") is None:
        print("Error: mmseqs not found in PATH.")
        sys.exit(1)

    # Exit non-zero like every other failure path so callers and schedulers
    # can tell that nothing was produced.
    if not os.path.exists(INPUT_CSV):
        print(f"Error: {INPUT_CSV} not found.")
        sys.exit(1)

    # Temp fasta for input
    temp_input = "temp_input_for_tool.fasta"

    try:
        # 1. Prepare
        df_clean = step_1_prepare_input(INPUT_CSV, temp_input)

        # 2. Run Tool (Let it fail on the file write)
        step_2_run_buggy_tool(temp_input, OUTPUT_DIR)

        # 3. Recover Data
        train_df, test_df = step_3_reconstruct_data(df_clean, OUTPUT_DIR)

        # 4. Save
        step_4_save_outputs(train_df, test_df)
    finally:
        # Cleanup. The steps abort via sys.exit(), which raises SystemExit;
        # running this in a finally block removes the temp file on those
        # paths too.
        if os.path.exists(temp_input):
            os.remove(temp_input)

[1/4] Reading and preparing thermoproteota_data.csv...
      Unique sequences to process: 180810

[2/4] Running fast_part.py (Output Train.fasta will break, this is expected)...
createdb fastpart_results/global/global.fasta fastpart_results/Temp/global_mmseq_db 

MMseqs Version:                    	18.8cc5c
Database type                      	0
Shuffle input database             	true
Createdb mode                      	0
Write lookup file                  	1
Offset of numeric ids              	0
Threads                            	8
Compressed                         	0
Mask residues                      	0
Mask residues probability          	0.9
Mask lower case residues           	0
Mask lower letter repeating N times	0
Use GPU                            	0
Verbosity                          	3

Converting sequences
Time for merging to global_mmseq_db_h: 0h 0m 0s 21ms
Time for merging to global_mmseq_db: 0h 0m 0s 43ms
Database type: Aminoacid
Time for processing: 0h 0m 0s 316ms
clust

diamond v2.1.16.170 (C) Max Planck Society for the Advancement of Science, Benjamin J. Buchfink, University of Tuebingen
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 8
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: fastpart_results/Temp/train.fasta
Opening the database file...  [0.001s]
Loading sequences...  [0.347s]
Masking sequences...  [0.292s]
Writing sequences...  [0.016s]
Hashing sequences...  [0.011s]
Loading sequences...  [0s]
Writing trailer...  [0s]
Closing the input file...  [0s]
Closing the database file...  [0s]

Database sequences  144649
  Database letters  39770421
     Database hash  4d4ba6b53a47d0b34357913da8683d0c
        Total time  0.669000s
diamond v2.1.16.170 (C) Max Planck Society for the Advancement of Science, Benjamin J. Buchfink, University of Tuebingen
Documentation, support and update

Updated combined train file is saved to fastpart_results/Train.fasta
Updated combined test file is saved to fastpart_results/Test.fasta
Summary file is saved to fastpart_results/summary.txt
Execution time: 49.90 seconds

[3/4] Reconstructing lost Training Data...
      Test Set Size:     36161
      Removed Sequences: 11168
      Recovered Train Size: 133486

[4/4] Saving final corrected CSVs...
      Success! Saved thermoproteota_train.csv (133486 rows)
      Success! Saved thermoproteota_test.csv (36161 rows)
