In [None]:
import pandas as pd
import io
import os
import subprocess
import sys
import requests
import json

# --- MMseqs2 Executable Path ---
# Name of the MMseqs2 executable. It is resolved through PATH, which the
# setup block below extends with the Conda binary directories.
MMSEQS_EXEC = "mmseqs"

# --- MMseqs2 Installation via Conda ---
# Installs Miniconda into /usr/local, installs MMseqs2 from Bioconda, and
# installs pandas/requests with pip. Any failure aborts the script via
# sys.exit, because nothing below can work without MMseqs2.
print("Setting up Conda environment. This may take a few moments...\n")
try:
    # Download Miniconda installer
    subprocess.run(["wget", "-q", "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh", "-O", "miniconda.sh"], check=True)
    # Install Miniconda silently (-b) into /usr/local, even if the prefix exists (-f)
    subprocess.run(["bash", "miniconda.sh", "-b", "-p", "/usr/local", "-f"], check=True)
    # Update PATH environment variable to include Conda binaries
    os.environ['PATH'] = "/usr/local/bin:/usr/local/condabin:" + os.environ['PATH']
    # Add Conda's Python site-packages to sys.path. The directory name is
    # built from version_info because slicing sys.version ("3.10.12"[:3])
    # yields "3.1" on Python 3.10 and later.
    sys.path.append(f"/usr/local/lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages")
    print("Miniconda installed successfully.\n")

    print("Installing MMseqs2 via Bioconda. This is a large package, please be patient...\n")
    # Install MMseqs2 from conda-forge and bioconda channels
    subprocess.run(["conda", "install", "-c", "conda-forge", "-c", "bioconda", "mmseqs2", "-y"], check=True)
    print("MMseqs2 installed successfully.\n")

    print("Installing necessary Python libraries: pandas and requests...\n")
    # Install pandas and requests using pip
    subprocess.run(["pip", "install", "pandas", "requests"], check=True)
    print("Python libraries installed.\n")
    print("MMseqs2 setup complete.\n")
except subprocess.CalledProcessError as e:
    # Catch errors from subprocess commands
    print(f"An error occurred during MMseqs2 or Conda installation: {e}")
    # The commands above do not capture output, so e.stderr is normally None;
    # only report it when something was actually captured.
    captured_stderr = e.stderr
    if isinstance(captured_stderr, bytes):
        captured_stderr = captured_stderr.decode(errors="replace")
    if captured_stderr:
        print(f"Stderr: {captured_stderr}")
    sys.exit("Exiting due to installation failure.")
except Exception as e:
    # Catch any other unexpected errors during setup (e.g. wget not installed)
    print(f"An unexpected error occurred during setup: {e}")
    sys.exit("Exiting due to setup failure.")

print("\nVerifying MMseqs2 installation:\n")
try:
    # MMseqs2 reports its version through the `version` subcommand; it has no
    # `--version` flag, so that invocation exits non-zero with empty stderr.
    mmseqs_version_output = subprocess.run([MMSEQS_EXEC, "version"], capture_output=True, text=True, check=True)
    print("MMseqs2 version check successful:")
    print(mmseqs_version_output.stdout)
except FileNotFoundError:
    print("Error: MMseqs2 executable was not found. Installation might have failed or PATH is not correctly set.\n")
except subprocess.CalledProcessError as e:
    # Fall back to stdout or the exit status so the message is never blank.
    failure_details = (e.stderr or "").strip() or (e.stdout or "").strip() or f"exit status {e.returncode}"
    print(f"Error checking MMseqs2 version: {failure_details}\n")

# --- Database Download & Filtering ---
# Configuration for downloading FASTA databases from a GitHub repository.
# GITHUB_API_CONTENTS_URL is the listing of the repository's root directory
# and is used to discover file names; GITHUB_RAW_URL is the prefix each file
# name is appended to when downloading. Note that the raw URL hard-codes the
# `main` branch.
GITHUB_REPO_OWNER = "MuthusaravananS"
GITHUB_REPO_NAME = "virulence-factor_DFVF"
GITHUB_API_CONTENTS_URL = f"https://api.github.com/repos/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}/contents/"
GITHUB_RAW_URL = f"https://raw.githubusercontent.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}/main/"

TARGET_DATABASES = [] # Stores (display_name, file_path) tuples of prepared databases

def get_fasta_filenames_from_github(api_url, timeout=30):
    """
    Fetches a list of FASTA filenames from a specified GitHub repository via its API.

    Args:
        api_url: URL of a GitHub "contents" API endpoint that lists a directory.
        timeout: Seconds to wait for the API before giving up, so an
                 unresponsive server cannot hang the script indefinitely.

    Returns:
        A list of names of the entries that are files and carry a FASTA
        extension (.fasta, .fa, .fna, .faa, matched case-insensitively).
        The list is empty when the request fails or the response is not a
        directory listing; the reason is printed rather than raised.
    """
    fasta_extensions = ('.fasta', '.fa', '.fna', '.faa')
    print(f"Attempting to fetch file list from GitHub repository: {api_url}...\n")
    fasta_files = []
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
        contents = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error: Could not fetch file list from GitHub API. Please check your internet connection or the repository URL. Details: {e}\n")
        return fasta_files
    except ValueError as e:
        # Older versions of requests raise a plain ValueError for a non-JSON body.
        print(f"Error: The GitHub API response was not valid JSON. Details: {e}\n")
        return fasta_files

    # A directory listing is a JSON array; anything else (for example an
    # object describing a single file or an error) cannot be iterated as entries.
    if not isinstance(contents, list):
        print("Error: The GitHub API response was not a directory listing. Please check the repository URL.\n")
        return fasta_files

    for item in contents:
        if not isinstance(item, dict) or item.get('type') != 'file':
            continue
        name = item.get('name', '')
        # Check if the file has a common FASTA extension
        if name.lower().endswith(fasta_extensions):
            fasta_files.append(name)
    print(f"Found {len(fasta_files)} FASTA files in the repository.\n")
    return fasta_files

def filter_fasta_by_length(input_fasta_path, output_fasta_path, min_length=5):
    """
    Copies the records of a FASTA file whose sequence has at least min_length
    characters into a new FASTA file, one sequence line per record.

    Blank lines are skipped, spaces inside sequences are removed, and a header
    that is not followed by any sequence line is neither counted nor written.
    Problems are reported with print() instead of being raised.
    """
    print(f"Filtering '{input_fasta_path}' for sequences with length greater than or equal to {min_length} amino acids...\n")

    def _records(handle):
        # Yield (header, sequence) for every header that has sequence lines.
        current_header = ''
        pieces = []
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            if not text.startswith('>'):
                pieces.append(text)
                continue
            # A new header closes the record collected so far.
            if current_header and pieces:
                yield current_header, "".join(pieces).replace(" ", "")
            current_header = text
            pieces = []
        # The final record has no following header to close it.
        if current_header and pieces:
            yield current_header, "".join(pieces).replace(" ", "")

    kept_count = 0
    seen_count = 0
    try:
        with open(input_fasta_path, 'r') as source, open(output_fasta_path, 'w') as sink:
            for record_header, residues in _records(source):
                seen_count += 1
                if len(residues) < min_length:
                    continue
                sink.write(record_header + '\n')
                sink.write(residues + '\n')
                kept_count += 1
        print(f"Filtering complete for '{input_fasta_path}': {kept_count} of {seen_count} sequences kept (minimum length {min_length} AA).\n")
    except FileNotFoundError:
        print(f"Error: Input FASTA file '{input_fasta_path}' not found for filtering. Skipping this file.\n")
    except Exception as e:
        print(f"An unexpected error occurred during FASTA filtering for '{input_fasta_path}': {e}\n")

# Get list of FASTA files from GitHub
FASTA_FILENAMES_TO_DOWNLOAD = get_fasta_filenames_from_github(GITHUB_API_CONTENTS_URL)

# Download every listed file, length-filter it, and register each non-empty
# result in TARGET_DATABASES. A failure on one file only skips that file.
print("Beginning download and filtering of all identified target FASTA files from GitHub...\n")
for filename in FASTA_FILENAMES_TO_DOWNLOAD:
    raw_download_url = f"{GITHUB_RAW_URL}{filename}"
    local_raw_path = f"{filename}_raw" # Temporarily store raw downloaded file
    local_filtered_path = filename # Store filtered file with original name

    print(f"Downloading '{filename}' from {raw_download_url}...\n")
    try:
        # The context manager releases the streamed connection even when the
        # download is interrupted; the timeout prevents an indefinite hang.
        with requests.get(raw_download_url, stream=True, timeout=60) as response:
            response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
            with open(local_raw_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Successfully downloaded '{filename}'.\n")
    except requests.exceptions.RequestException as e:
        print(f"Error: Failed to download '{filename}' from {raw_download_url}. Details: {e}\n")
        if hasattr(e, 'response') and e.response is not None:
            preview_text = e.response.text[:200].replace('\n', ' ')
            print(f"   HTTP Status: {e.response.status_code}\n")
            print(f"   Response Preview: {preview_text}...\n")
        if os.path.exists(local_raw_path): os.remove(local_raw_path) # Clean up partial download
        continue # Skip to next file if download fails

    if not os.path.exists(local_raw_path) or os.path.getsize(local_raw_path) == 0:
        print(f"Warning: Downloaded file '{local_raw_path}' is empty or missing content. Skipping filtering for this file.\n")
        if os.path.exists(local_raw_path): os.remove(local_raw_path)
        continue # Skip to next file if downloaded file is empty

    # Filter the downloaded FASTA file and save the filtered version
    filter_fasta_by_length(local_raw_path, local_filtered_path, min_length=5)
    if os.path.exists(local_raw_path): os.remove(local_raw_path) # Remove the raw, unfiltered file to save space

    # Add the successfully prepared database to the list
    if os.path.exists(local_filtered_path) and os.path.getsize(local_filtered_path) > 0:
        # Strip only the real extension. Chained str.replace calls removed
        # ".fa" before ".faa" (turning "x.faa" into "xa") and also removed
        # extension-like text from the middle of a name.
        display_name = os.path.splitext(filename)[0].replace("_", " ")
        TARGET_DATABASES.append((display_name, local_filtered_path))
        print(f"Successfully prepared database '{display_name}' from `{local_filtered_path}`.\n")
    else:
        print(f"Error: The filtered database for '{filename}' is empty or could not be created. It will not be available for search.\n")

if not TARGET_DATABASES:
    print("Error: No valid target FASTA databases were prepared after download and filtering. The search operation cannot proceed.\n")
else:
    print("\nAll specified target FASTA files have been processed and are ready for MMseqs2 searches.\n")

# --- MMseqs2 Search Logic ---
def run_mmseqs_search(query_fasta_file_path: str, selected_target_db_path: str,
                      min_seq_id: float = 0.3, min_q_coverage: float = 0.0) -> tuple[pd.DataFrame, str | None]:
    """
    Runs an MMseqs2 search against a selected target database.
    Applies filtering based on sequence identity and query coverage.

    Args:
        query_fasta_file_path (str): Path to the user's query FASTA file.
        selected_target_db_path (str): Path to the prepared target database FASTA file.
        min_seq_id (float): Minimum sequence identity (0.0 to 1.0).
        min_q_coverage (float): Minimum query coverage (0.0 to 1.0).

    Returns:
        tuple[pd.DataFrame, str | None]: A pandas DataFrame containing the search results
                                         and the filename of the generated .m8 output file.
                                         Returns (empty DataFrame, None) if an error occurs or no results.
    """
    global MMSEQS_EXEC

    # Input validation
    if not query_fasta_file_path or not os.path.exists(query_fasta_file_path):
        print(f"Error: Query FASTA file not provided or not found at '{query_fasta_file_path}'. Please provide a valid path.\n")
        return pd.DataFrame({"Error": [f"Query FASTA file not provided or not found. Please provide a valid path.\n"]}), None
    if not selected_target_db_path or not os.path.exists(selected_target_db_path):
        print(f"Error: Target FASTA database not selected or not found at '{selected_target_db_path}'. Please check the database setup.\n")
        return pd.DataFrame({"Error": [f"Target FASTA database not selected or not found at '{selected_target_db_path}'. Please check setup.\n"]}), None

    # Define names for MMseqs2 intermediate files and the final output
    query_db_name = "queryDB_mmseqs"
    target_db_name = "targetDB_mmseqs"
    aln_res_name = "alnRes_mmseqs" # This is the intermediate MMseqs2 alignment results database
    tmp_dir = "mmseqs_tmp" # Temporary directory used by mmseqs search

    # Generate a clean basename for the .m8 output file based on the target database name
    db_basename = os.path.basename(selected_target_db_path)
    db_name_clean = ''.join(c if c.isalnum() else '_' for c in os.path.splitext(db_basename)[0])
    m8_output_name = f"results_{db_name_clean}.m8" # This is the final human-readable .m8 output file

    try:
        # Step 1: Create MMseqs2 database for the target sequences
        print(f"Running MMseqs2: Creating target database from {selected_target_db_path}...\n")
        # 'createdb' command converts FASTA to MMseqs2 internal database format
        subprocess.run([MMSEQS_EXEC, "createdb", selected_target_db_path, target_db_name], check=True, capture_output=True)

        # Step 2: Create MMseqs2 database for the query sequences
        print(f"Running MMseqs2: Creating query database from {query_fasta_file_path}...\n")
        subprocess.run([MMSEQS_EXEC, "createdb", query_fasta_file_path, query_db_name], check=True, capture_output=True)

        # Step 3: Perform the MMseqs2 search
        print(f"Running MMseqs2: Performing search with minimum identity of {min_seq_id}. This may take some time...\n", flush=True)
        # 'search' command performs the actual sequence search
        # '--min-seq-id': sets the minimum sequence identity threshold
        # '-a': output all alignments (not just best ones per query)
        subprocess.run([MMSEQS_EXEC, "search", query_db_name, target_db_name, aln_res_name, tmp_dir,
                        "--min-seq-id", str(min_seq_id), "-a"], check=True, capture_output=True)

        # Step 4: Convert MMseqs2 alignment results to the standard .m8 format
        print("Running MMseqs2: Converting alignment results to .m8 format for readability...\n")
        # Define the output format columns for the .m8 file (standard BLAST-like format)
        mmseqs_output_format = "query,target,fident,evalue,qlen,tlen,qstart,qend,tstart,tend,alnlen,raw,bits,cigar,qcov,tcov"
        subprocess.run([MMSEQS_EXEC, "convertalis", query_db_name, target_db_name, aln_res_name, m8_output_name,
                        "--format-output", mmseqs_output_format], check=True, capture_output=True)

        # Check if the .m8 output file was created and has content
        if not os.path.exists(m8_output_name) or os.path.getsize(m8_output_name) == 0:
            print("MMseqs2 search completed, but no hits were found matching your criteria, or the output file is empty.\n")
            return pd.DataFrame({"Message": ["No hits found matching your search criteria."]}, None), None

        # Load the results into a pandas DataFrame for easier manipulation
        mmseqs_cols = ["query_id", "target_id", "identity", "e_value", "query_len", "target_len",
                       "q_start", "q_end", "t_start", "t_end", "aln_len", "raw_score", "bit_score",
                       "cigar", "q_coverage", "t_coverage"]
        mmseqs_results_df = pd.read_csv(m8_output_name, sep='\t', header=None, names=mmseqs_cols)

        # Apply additional query coverage filter if specified by the user
        if min_q_coverage > 0:
            mmseqs_results_df = mmseqs_results_df[mmseqs_results_df['q_coverage'] >= min_q_coverage]
            print(f"Filtered results further by minimum query coverage: {min_q_coverage}.\n")

        # min_t_coverage is no longer a parameter, so no filtering here.

        print("MMseqs2 search and filtering complete.\n")
        return mmseqs_results_df, m8_output_name
    except subprocess.CalledProcessError as e:
        # Handle errors specifically from MMseqs2 commands
        error_msg = f"MMseqs2 command failed during execution. Please check the MMseqs2 output for details. Stderr: {e.stderr.decode()}\n"
        print(f"Operation failed: {error_msg}\n")
        return pd.DataFrame({"Error": [error_msg]}), None
    except Exception as e:
        # Handle any other unexpected errors during the search process
        error_msg = f"An unexpected error occurred during the search process: {e}\n"
        print(f"Operation failed: {error_msg}\n")
        return pd.DataFrame({"Error": [error_msg]}), None
    finally:
        # Clean up all MMseqs2 temporary database files and the temporary directory.
        # The final .m8 output file (m8_output_name) is explicitly kept.
        print("Cleaning up MMseqs2 intermediate files...\n")
        # List of prefixes for MMseqs2 generated files/directories to remove
        temp_prefixes_to_remove = [query_db_name, target_db_name, aln_res_name, tmp_dir]

        for f_prefix in temp_prefixes_to_remove:
            # Use 'rm -rf' with shell=True to delete directories and all associated files
            # MMseqs2 creates multiple files for each database (e.g., .index, .data, .dbtype)
            subprocess.run([f"rm -rf {f_prefix}*"], shell=True, capture_output=True)

        print("Temporary file cleanup complete. Your final .m8 results file is preserved.\n")

def save_results_to_csv(df_to_save: pd.DataFrame, filename: str):
    """
    Writes df_to_save to filename as CSV without the index column.

    An empty DataFrame is not written at all; a notice is printed instead.
    A failure while writing is printed, not raised.
    """
    if not df_to_save.empty:
        try:
            df_to_save.to_csv(filename, index=False)
            print(f"Search results saved to '{filename}'. You can download this file from the Colab file browser.\n")
        except Exception as write_error:
            print(f"Error saving results to CSV file '{filename}': {write_error}\n")
    else:
        print("No results to save to CSV as the DataFrame is empty.")

# --- Main Execution Block ---
def _prompt_unit_fraction(label, default, examples):
    """Ask for a number between 0.0 and 1.0 until one is given; an empty answer returns *default*."""
    while True:
        user_input = input(f"Enter {label} (a decimal from 0.0 to 1.0, default: {default}): ").strip()
        if not user_input:
            return default # Use default value
        try:
            value = float(user_input)
        except ValueError:
            print(f"Invalid input. Please enter a numerical value (e.g., {examples}).")
            continue
        if 0.0 <= value <= 1.0:
            return value
        print("Invalid input. The value must be between 0.0 and 1.0.")


if __name__ == "__main__":
    print("\n--- Welcome to the MMseqs2 Search Script ---\n")

    if not TARGET_DATABASES:
        print("Warning: No valid target FASTA databases were prepared. The search function will not be available.")
    else:
        print("\n--- Database setup is complete. Ready to proceed with your query. ---\n")
        print("First, please upload your query FASTA file to the environment.")
        print("If using Google Colab, you can do this by dragging and dropping your file into the 'Files' tab on the left sidebar.")

        # Prompt for query FASTA path with validation
        query_fasta_path = ""
        while True:
            query_fasta_path = input("Enter the path to your query FASTA file (e.g., 'my_queries.fasta'): ").strip()
            if not query_fasta_path:
                print("Error: File path cannot be empty. Please try again.")
            elif not os.path.exists(query_fasta_path):
                print(f"Error: File not found at '{query_fasta_path}'. Please check the path and ensure the file is uploaded.")
            elif not query_fasta_path.lower().endswith(('.fasta', '.fa', '.fna', '.faa')):
                print(f"Error: File '{query_fasta_path}' does not appear to be a FASTA file. Please ensure it has a common FASTA extension (.fasta, .fa, .fna, .faa).")
            else:
                break # Valid path entered

        print(f"Confirmed: Using query FASTA file: {query_fasta_path}\n")

        # --- Database Selection ---
        print("\n--- Available Target Databases ---")
        for i, (display_name, _) in enumerate(TARGET_DATABASES):
            print(f"{i + 1}. {display_name}")

        selected_db_index = -1
        while True:
            user_input = input(f"Enter the number corresponding to the database you want to search against (1-{len(TARGET_DATABASES)}): ").strip()
            try:
                index = int(user_input) - 1
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            if 0 <= index < len(TARGET_DATABASES):
                selected_db_index = index
                break
            print("Invalid number. Please enter a number from the list.")

        selected_db_name, selected_db_path = TARGET_DATABASES[selected_db_index]
        print(f"You have selected target database: '{selected_db_name}' located at '{selected_db_path}'.\n")

        print("\n--- Now, please enter your desired MMseqs2 search parameters ---")
        print("You can press Enter for any parameter to use its default value.\n")

        # Minimum sequence identity is handed to MMseqs2 itself (default 0.3).
        min_identity = _prompt_unit_fraction("Minimum Sequence Identity", 0.3, "0.5, 0.8")

        # Minimum query coverage is applied after the search; 0.0 means no filtering.
        min_query_coverage = _prompt_unit_fraction("Minimum Query Coverage", 0.0, "0.7, 0.9")

        print(f"\nProceeding with MMseqs2 search using the following parameters:\n"
              f"   Minimum Sequence Identity: {min_identity}\n"
              f"   Minimum Query Coverage: {min_query_coverage}\n")

        # Run the MMseqs2 search with the collected parameters
        results_df, final_m8_filename = run_mmseqs_search(
            query_fasta_file_path=query_fasta_path,
            selected_target_db_path=selected_db_path,
            min_seq_id=min_identity,
            min_q_coverage=min_query_coverage
        )

        print("\n--- MMseqs2 Search Results ---\n")
        if not results_df.empty and final_m8_filename:
            # Display a preview of the results
            print("Displaying the top few rows of results (if many hits, only a portion will show here):\n")
            print(results_df.head(10).to_string()) # Print first 10 rows for brevity

            # Construct CSV filename from .m8 filename and save the full results
            csv_output_filename = final_m8_filename.replace(".m8", ".csv")
            print(f"\nFull results will be saved to '{csv_output_filename}'.\n")
            save_results_to_csv(results_df, csv_output_filename)
        elif "Error" in results_df.columns:
            # The search reports failures as a one-row "Error" frame; do not
            # present a failed run as a search that simply found nothing.
            print("The search did not complete because of an error (see the messages above). No CSV file will be generated.\n")
        else:
            print("No results found matching the specified criteria. No CSV file will be generated for results (only an empty DataFrame was returned).\n")

    print("\n--- MMseqs2 Search Script Finished ---\n")


Setting up Conda environment. This may take a few moments...

Miniconda installed successfully.

Installing MMseqs2 via Bioconda. This is a large package, please be patient...

MMseqs2 installed successfully.

Installing necessary Python libraries: pandas and requests...

Python libraries installed.

MMseqs2 setup complete.


Verifying MMseqs2 installation:

Error checking MMseqs2 version: 

Attempting to fetch file list from GitHub repository: https://api.github.com/repos/MuthusaravananS/virulence-factor_DFVF/contents/...

Found 7 FASTA files in the repository.

Beginning download and filtering of all identified target FASTA files from GitHub...

Downloading 'uniprotkb_virulence_factor_AND_taxonomy4751_2025_06_21.fasta' from https://raw.githubusercontent.com/MuthusaravananS/virulence-factor_DFVF/main/uniprotkb_virulence_factor_AND_taxonomy4751_2025_06_21.fasta...

Successfully downloaded 'uniprotkb_virulence_factor_AND_taxonomy4751_2025_06_21.fasta'.

Filtering 'uniprotkb_virulence_fa