Install and Import Necessary Libraries

In [None]:
# Environment setup for a Colab/notebook environment.
# condacolab is installed first because the ANARCI package below is installed
# with `conda` from the bioconda channel rather than with pip.
# NOTE(review): condacolab.install() is documented to restart the Colab kernel —
# confirm the remaining lines of this cell still run after the restart.
!pip install -q condacolab
import condacolab
condacolab.install() # Provides conda, which is used below to install ANARCI

!conda install -c bioconda anarci --yes

# Install specific package versions (consider if these are strictly necessary or if latest compatible versions are okay).
# Only keras, tensorflow and scikit-learn are pinned; pandas, numpy and biopython are left unpinned.
!pip install keras==2.11.0 tensorflow==2.11.0 scikit-learn==1.0.2 pandas numpy biopython

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
import subprocess
from pathlib import Path
import shutil
import sys

# Import machine learning libraries (Keras is part of TensorFlow now)
from tensorflow.keras.models import model_from_json 

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Make the project's ``scripts`` directory importable so ``utils`` can be found.
# The location is derived from the current working directory (one level up,
# then ``scripts``), so the notebook must be started from a directory that is a
# sibling of ``scripts`` (e.g. ``notebooks``).
module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'scripts'))
if module_path in sys.path:
    print(f"{module_path} already in sys.path")
else:
    sys.path.append(module_path)
    print(f"Appended to sys.path: {module_path}")

try:
    from utils import (
        one_hot_encoder,
        parse_anarci_results_to_aligned_sequences,
        predict_properties,
        get_project_root,
        dataframe_to_fasta_string
    )
except ImportError as import_error:
    # Print everything needed to diagnose a wrong working directory / sys.path.
    print(
        f"Error importing from utils.py: {import_error}",
        f"Current working directory: {os.getcwd()}",
        f"Sys.path: {sys.path}",
        "Please ensure utils.py is in the ../scripts directory relative to the notebook, or adjust sys.path.",
        sep="\n",
    )

    # Minimal stand-in for utils.get_project_root. It is defined before the
    # error is re-raised, so it remains in the kernel namespace afterwards.
    def get_project_root():
        """Return the parent of the resolved current directory (basic fallback)."""
        return Path('.').resolve().parent

    # The notebook cannot work without utils, so the import error still propagates.
    raise
else:
    print("Successfully imported from utils.py")

Define Project Paths and Parameters

In [None]:
# Central definition of every path used by the notebook.
# NOTE(review): get_project_root() comes from utils.py and is expected to return
# the directory that contains both ``scripts`` and ``data`` — confirm in utils.py.
PROJECT_ROOT = get_project_root()
DATA_DIR = PROJECT_ROOT.joinpath("data")
INPUT_CSV_PATH = DATA_DIR.joinpath("input", "DeepSP_input.csv")
# Named differently from other outputs so this notebook does not overwrite them.
OUTPUT_CSV_PATH = DATA_DIR.joinpath("DeepSP_descriptors_notebook.csv")

print(
    f"Project Root: {PROJECT_ROOT}",
    f"Data Directory: {DATA_DIR}",
    f"Input CSV: {INPUT_CSV_PATH}",
    f"Output CSV: {OUTPUT_CSV_PATH}",
    sep="\n",
)

# Scratch directory for the FASTA inputs and ANARCI outputs. It is relative to
# the notebook's current working directory and is created if it is missing.
TEMP_ANARCI_DIR = Path("temp_anarci_notebook_processing")
TEMP_ANARCI_DIR.mkdir(exist_ok=True)

# FASTA files handed to ANARCI (one per chain type) ...
HEAVY_FASTA_PATH = TEMP_ANARCI_DIR.joinpath("seq_H.fasta")
LIGHT_FASTA_PATH = TEMP_ANARCI_DIR.joinpath("seq_L.fasta")
# ... and the CSV files the notebook expects ANARCI to produce from them.
ANARCI_H_CSV_PATH = TEMP_ANARCI_DIR.joinpath("seq_aligned_H.csv")
ANARCI_L_CSV_PATH = TEMP_ANARCI_DIR.joinpath("seq_aligned_KL.csv")

Import dataset

In [None]:
# Read the input table and show its first rows. Any failure is reported with a
# short message and then re-raised so the notebook stops at this cell.
try:
    dataset = pd.read_csv(INPUT_CSV_PATH)
    print(f"Successfully loaded dataset from {INPUT_CSV_PATH}")
    display(dataset.head())  # rich DataFrame rendering in Jupyter
except Exception as load_error:
    if isinstance(load_error, FileNotFoundError):
        # A missing file gets its own message that names the expected path.
        print(f"ERROR: Input CSV file not found at {INPUT_CSV_PATH}")
    else:
        print(f"An error occurred while loading the dataset: {load_error}")
    raise

Convert DataFrame to FASTA Files for ANARCI

In [None]:
# Write one FASTA record per antibody into separate heavy- and light-chain
# files, which are the inputs for the ANARCI runs.
#
# The table is validated *before* any file is opened: otherwise a missing
# column would only fail after both FASTA files had already been created or
# truncated, and a missing value would be written as the literal text "nan"
# and handed to ANARCI as if it were a sequence.
required_columns = ["Name", "Heavy_Chain", "Light_Chain"]
missing_columns = [col for col in required_columns if col not in dataset.columns]
if missing_columns:
    raise KeyError(f"Input dataset is missing required column(s): {missing_columns}")

incomplete_rows = dataset[required_columns].isna().any(axis=1)
if incomplete_rows.any():
    raise ValueError(
        f"{int(incomplete_rows.sum())} row(s) have a missing Name, Heavy_Chain or Light_Chain "
        f"(first affected row indices: {dataset.index[incomplete_rows].tolist()[:10]}). "
        "Fix or drop these rows before running ANARCI."
    )

with open(HEAVY_FASTA_PATH, "w") as h_out, open(LIGHT_FASTA_PATH, "w") as l_out:
    # Iterate the three columns in lockstep; both files use the same record
    # name so heavy and light chains can be matched up again later.
    for name, heavy_seq, light_seq in zip(
        dataset["Name"], dataset["Heavy_Chain"], dataset["Light_Chain"]
    ):
        h_out.write(f">{name}\n{heavy_seq}\n")
        l_out.write(f">{name}\n{light_seq}\n")
print(f"Created FASTA files: {HEAVY_FASTA_PATH}, {LIGHT_FASTA_PATH}")

Sequence Alignment with ANARCI

In [None]:
# Align heavy and light chains with ANARCI (IMGT scheme), one run per chain type.
# The -o argument is a prefix: the notebook expects ANARCI to append _H.csv for
# the heavy run and _KL.csv for the light run, which is why both runs can share
# the same prefix and why ANARCI_H_CSV_PATH / ANARCI_L_CSV_PATH are checked below.
anarci_output_prefix_H = TEMP_ANARCI_DIR / "seq_aligned"
anarci_output_prefix_L = TEMP_ANARCI_DIR / "seq_aligned"

anarci_cmd_H = [
    "ANARCI", "-i", str(HEAVY_FASTA_PATH), "-o", str(anarci_output_prefix_H),
    "-s", "imgt", "-r", "heavy", "--csv"
]
anarci_cmd_L = [
    "ANARCI", "-i", str(LIGHT_FASTA_PATH), "-o", str(anarci_output_prefix_L),
    "-s", "imgt", "-r", "light", "--csv"
]


def _run_anarci(command, chain_label):
    """Run one ANARCI command and stop the notebook if it fails.

    Args:
        command: Argument list passed to ``subprocess.run``.
        chain_label: Chain name ("Heavy" / "Light") used in the messages.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished run.

    Raises:
        RuntimeError: If ANARCI exits with a non-zero return code.
    """
    print(f"Running ANARCI for {chain_label} Chains: {' '.join(command)}")
    completed = subprocess.run(command, capture_output=True, text=True, check=False)
    if completed.returncode != 0:
        print(f"ANARCI Error ({chain_label} Chains):\n{completed.stderr}")
        raise RuntimeError(
            f"ANARCI failed for {chain_label} chains with exit code {completed.returncode}."
        )
    print(f"ANARCI {chain_label} Chain processing successful.")
    return completed


# Remove outputs left over from an earlier run so that the existence check
# below can only be satisfied by files produced by *this* run.
for stale_output in (ANARCI_H_CSV_PATH, ANARCI_L_CSV_PATH):
    if stale_output.exists():
        stale_output.unlink()

process_H = _run_anarci(anarci_cmd_H, "Heavy")
process_L = _run_anarci(anarci_cmd_L, "Light")

# Verify output files. Each file is handled on its own: if ANARCI wrote one of
# them into the current directory instead of TEMP_ANARCI_DIR, only that file
# is moved into place.
for expected_csv in (ANARCI_H_CSV_PATH, ANARCI_L_CSV_PATH):
    if expected_csv.exists():
        continue
    print(f"Error: ANARCI did not produce the expected output CSV file at {expected_csv}")
    stray_csv = Path(expected_csv.name)  # same file name, but in the current directory
    if not stray_csv.exists():
        raise FileNotFoundError(
            f"ANARCI output file {expected_csv.name} not found. "
            f"Searched in {TEMP_ANARCI_DIR} and current directory."
        )
    print(f"Found ANARCI output {stray_csv} in current directory. Moving it.")
    shutil.move(str(stray_csv), str(expected_csv))

print(f"ANARCI output CSVs found: {ANARCI_H_CSV_PATH}, {ANARCI_L_CSV_PATH}")

Parse ANARCI Results and Align Sequences

In [None]:
# Turn the two ANARCI CSV files into a list of sequence names and a list of
# aligned sequences via utils.parse_anarci_results_to_aligned_sequences
# (this replaces the former seq_preprocessing function).
# NOTE(review): the inclusion lists / position dictionaries used for the
# alignment are said to live in utils.py — not visible from this notebook.
try:
    valid_name_list, aligned_seq_list = parse_anarci_results_to_aligned_sequences(
        str(ANARCI_H_CSV_PATH), str(ANARCI_L_CSV_PATH)
    )
    if aligned_seq_list:
        print(f"Successfully parsed {len(aligned_seq_list)} aligned sequences.")
    else:
        # An empty result is reported but not treated as an error here; the
        # following cells check for it themselves.
        print("Warning: No sequences were successfully aligned or parsed from ANARCI results.")
except Exception as parse_error:
    print(f"Error during ANARCI result parsing: {parse_error}")
    raise

One Hot Encoding of Aligned Sequences

In [None]:
# One-hot encode every aligned sequence (utils.one_hot_encoder) and stack the
# results into a single array for the model. load_input_data is no longer
# needed because aligned_seq_list is already in memory.
if aligned_seq_list:
    X_one_hot = [one_hot_encoder(s=aligned_seq) for aligned_seq in aligned_seq_list]
    # Swap the last two axes of the stacked 3-D array.
    # NOTE(review): which axis is sequence position and which is the residue
    # channel depends on one_hot_encoder — confirm against utils.py.
    X_processed = np.asarray(X_one_hot).transpose(0, 2, 1)
    print(f"Processed data shape for model input: {X_processed.shape}")
else:
    # Nothing to encode: an empty array lets the prediction cell skip cleanly.
    print("Aligned sequence list is empty. Skipping one-hot encoding.")
    X_processed = np.array([])

Predict DeepSP Descriptors

In [None]:
# Run the DeepSP models on the encoded sequences via utils.predict_properties.
# NOTE(review): predict_properties is expected to load the models from
# model_base_path and return a DataFrame — confirm against utils.py.
if X_processed.size > 0:
    df_predictions = predict_properties(X_processed, model_base_path=str(DATA_DIR))
    print("Predictions complete.")
    if df_predictions.empty:
        print("Prediction resulted in an empty DataFrame.")
    else:
        display(df_predictions.head())
else:
    # No encoded input: keep df_predictions defined so the save cell can test it.
    print("Skipping prediction as there is no processed data.")
    df_predictions = pd.DataFrame()

Save Results to CSV

In [None]:
# Attach the sequence names to the predicted descriptors and write them to CSV.
if not df_predictions.empty and valid_name_list:
    # Names and predictions are joined purely by position. If their lengths
    # differ, pd.concat(axis=1) would silently pad the shorter side with NaN and
    # the saved file would pair names with the wrong (or missing) descriptors.
    if len(valid_name_list) != len(df_predictions):
        raise ValueError(
            f"Cannot label predictions: {len(valid_name_list)} sequence name(s) "
            f"but {len(df_predictions)} prediction row(s)."
        )

    df_name = pd.DataFrame(valid_name_list, columns=["Name"]) # Use names of successfully processed sequences
    # Reset both indexes so the column-wise concat aligns rows 0..n-1 with 0..n-1.
    df_final = pd.concat([df_name.reset_index(drop=True), df_predictions.reset_index(drop=True)], axis=1)

    df_final.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"Successfully saved DeepSP descriptors to {OUTPUT_CSV_PATH}")
    display(df_final.head())
else:
    print("No predictions to save, or name list is empty.")

Clean up Temporary Files

In [None]:
# Delete the scratch directory with the intermediate FASTA / ANARCI files.
# A failure to delete is reported but deliberately not re-raised, so a cleanup
# problem never invalidates results that were already saved.
if TEMP_ANARCI_DIR.exists():
    try:
        shutil.rmtree(TEMP_ANARCI_DIR)
    except Exception as cleanup_error:
        print(f"Error removing temporary directory {TEMP_ANARCI_DIR}: {cleanup_error}")
    else:
        print(f"Successfully removed temporary directory: {TEMP_ANARCI_DIR}")