# Homework 04: Implementing Recurrent Neural Networks for Protein Secondary Structure Prediction

## PART 1

In [None]:
"""
Protein Secondary Structure Prediction Data Preparation
"""

import numpy as np
import os
from sklearn.model_selection import train_test_split
# Using tensorflow's padding utility is convenient
# Ensure tensorflow is installed: pip install tensorflow
try:
    from tensorflow.keras.preprocessing.sequence import pad_sequences
except ImportError:
    print("---------------------------------------------------------")
    print("ERROR: TensorFlow not found.")
    print("This script uses TensorFlow/Keras for padding sequences.")
    print("Please install it: pip install tensorflow")
    print("Alternatively, you can modify the script to implement manual padding.")
    print("---------------------------------------------------------")
    exit() # Stop execution if TF is not available

# -----------------------------------------------------------------------------
# Constants and Mappings
# -----------------------------------------------------------------------------

# Alphabets used to encode the data
AMINO_ACIDS = 'ARNDCQEGHILKMFPSTWYV'  # the 20 standard amino acids (one-letter codes)
SECONDARY_STRUCTURES = 'CHE'          # C: Coil, H: Helix, E: Strand (Sheet)
UNKNOWN_AMINO_ACID = 'X'              # placeholder sometimes used for unknown/non-standard residues

# Class counts follow directly from the alphabets
NUM_AMINO_ACIDS = len(AMINO_ACIDS)
NUM_SECONDARY_STRUCTURES = len(SECONDARY_STRUCTURES)

# Character -> integer index lookups; each index is the character's position in its alphabet
amino_acid_mapping = dict(zip(AMINO_ACIDS, range(NUM_AMINO_ACIDS)))
secondary_structure_mapping = dict(zip(SECONDARY_STRUCTURES, range(NUM_SECONDARY_STRUCTURES)))

# Echo the configuration so the notebook output documents the encoding in use
print("--- Configuration ---")
print("Amino Acids Mapping: \n", amino_acid_mapping)
print("Number of Amino Acid Classes:", NUM_AMINO_ACIDS)
print("\nSecondary Structure Mapping: \n", secondary_structure_mapping)
print("Number of Secondary Structure Classes:", NUM_SECONDARY_STRUCTURES)
print("-" * 30)

# -----------------------------------------------------------------------------
# Requirement 3: Define a data loader function to parse the files
# -----------------------------------------------------------------------------
def data_loader(file_path):
    """
    Parse a protein secondary-structure text file into parallel lists.

    Layout handled by the parser:
    - The first line is a metadata header and is skipped.
    - Records are three consecutive non-blank lines: ID, sequence, structure.
    - Blank lines may separate records. A blank line in the middle of a record
      discards that record; parsing resumes with the next non-blank line
      treated as an ID line.
    - A pair is kept only when sequence and structure have the same length.

    Args:
        file_path (str): Path to the data file.

    Returns:
        tuple: A tuple containing two lists of equal length:
               - sequences (list): Amino acid sequence strings.
               - structures (list): Corresponding secondary structure strings.
               Both lists are empty if the file is missing, unreadable,
               empty, or contains no valid record.
    """
    print(f"Attempting to load data from: {file_path}")
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}.")
        return [], []

    # Only the file read can fail for environmental reasons, so only that is
    # guarded; the in-memory parsing below works on plain strings.
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
    except (OSError, UnicodeError) as e:
        print(f"An error occurred during file reading/parsing of {file_path}: {e}")
        import traceback
        traceback.print_exc()
        return [], []

    if not lines:
        print("Warning: File is empty.")
        return [], []

    print(f"Skipping header line: '{lines[0].strip()}'")

    # Parser states: which line of a record is expected next.
    EXPECT_ID, EXPECT_SEQUENCE, EXPECT_STRUCTURE = range(3)

    sequences, structures = [], []
    state = EXPECT_ID
    pending_sequence = None
    incomplete_records = 0  # records cut short by a blank line or end of file

    # start=2 keeps reported line numbers relative to the original file.
    for line_num, line in enumerate(lines[1:], start=2):
        text = line.strip()

        if not text:
            # A blank line ends whatever record was in progress.
            if state != EXPECT_ID:
                incomplete_records += 1
                state = EXPECT_ID
                pending_sequence = None
            continue

        if state == EXPECT_ID:
            # The ID itself is not stored; it only marks the start of a record.
            state = EXPECT_SEQUENCE
        elif state == EXPECT_SEQUENCE:
            pending_sequence = text
            state = EXPECT_STRUCTURE
        else:
            if len(pending_sequence) == len(text):
                sequences.append(pending_sequence)
                structures.append(text)
            else:
                print(f"Warning @ line ~{line_num}: Sequence/Structure length mismatch. "
                      f"Seq len: {len(pending_sequence)}, Struct len: {len(text)}. Skipping pair.")
            pending_sequence = None
            state = EXPECT_ID

    # A record still open at end of file is missing its remaining lines.
    if state != EXPECT_ID:
        incomplete_records += 1
    if incomplete_records:
        print(f"Warning: Discarded {incomplete_records} incomplete record(s) "
              f"(missing sequence and/or structure line).")

    print(f"Successfully loaded {len(sequences)} sequences and {len(structures)} structures from {file_path}.")
    if not sequences and len(lines) > 1:
        print("Warning: Failed to load any valid sequence/structure pairs. Check file format/content.")

    return sequences, structures

# -----------------------------------------------------------------------------
# Requirement 4: Define feature generation functions
# -----------------------------------------------------------------------------
def _one_hot_encode(text, mapping):
    """
    One-hot encode a string using a character-to-index mapping.

    Args:
        text (str): Characters to encode; each character becomes one row.
        mapping (dict): Maps each allowed character to its column index.

    Returns:
        tuple: (matrix, None) on success, where matrix is a float32 array of
               shape (len(text), len(mapping)); or (None, (position, char))
               for the first character that is not a key of `mapping`.
    """
    columns = []
    for position, char in enumerate(text):
        if char not in mapping:
            return None, (position, char)
        columns.append(mapping[char])

    matrix = np.zeros((len(text), len(mapping)), dtype=np.float32)
    # Explicit integer dtype keeps the index array valid even for empty text.
    matrix[np.arange(len(text)), np.asarray(columns, dtype=np.intp)] = 1.0
    return matrix, None


def generate_features(sequences, structures, aa_map, ss_map, max_seq_length, filter_unknown=True):
    """
    Generates one-hot encoded features for sequences (X) and labels (Y),
    and pads them to a maximum length. This creates data suitable for RNNs.

    A sequence/structure pair is skipped when the two lengths differ, when
    `filter_unknown` is set and the sequence contains UNKNOWN_AMINO_ACID, or
    when either string contains a character missing from its mapping.

    Args:
        sequences (list): List of amino acid sequence strings.
        structures (list): List of secondary structure strings.
        aa_map (dict): Mapping from amino acid characters to integers.
        ss_map (dict): Mapping from structure characters to integers.
        max_seq_length (int): The length to pad sequences and structures to.
        filter_unknown (bool): If True, sequences containing unknown amino acids ('X')
                               will be skipped.

    Returns:
        tuple: A tuple containing two float32 NumPy arrays:
               - X (np.array): Padded one-hot encoded sequences
                               (shape: num_samples, max_seq_length, num_aa_features).
               - Y (np.array): Padded one-hot encoded structures
                               (shape: num_samples, max_seq_length, num_ss_features).
               If no pair survives, both arrays have zero samples but keep
               the same trailing dimensions and dtype.
    """
    num_aa_features = len(aa_map)
    num_ss_features = len(ss_map)
    X_list = []  # one-hot encoded sequences (before padding)
    Y_list = []  # one-hot encoded structures (before padding)

    skipped_count = 0
    original_count = len(sequences)
    print(f"Processing {original_count} raw sequences...")

    for i, seq in enumerate(sequences):
        # Defensive: the two lists should be parallel, but stop cleanly if not.
        if i >= len(structures):
            print(f"Warning: Index {i} out of bounds for structures list (length {len(structures)}). Stopping feature generation.")
            break
        struct = structures[i]

        if len(seq) != len(struct):
            print(f"Warning (Feature Gen): Sequence/structure length mismatch at index {i}. Skipping.")
            skipped_count += 1
            continue

        # Optional: silently drop sequences containing the unknown-residue marker.
        if filter_unknown and UNKNOWN_AMINO_ACID in seq:
            skipped_count += 1
            continue

        x_one_hot, bad = _one_hot_encode(seq, aa_map)
        if bad is not None:
            j, aa = bad
            print(f"Warning: Unknown amino acid '{aa}' found in sequence {i} at pos {j}. Skipping sequence.")
            skipped_count += 1
            continue

        y_one_hot, bad = _one_hot_encode(struct, ss_map)
        if bad is not None:
            j, ss = bad
            print(f"Warning: Unknown structure char '{ss}' found in structure {i} at pos {j}. Skipping sequence.")
            skipped_count += 1
            continue

        # Only keep the pair when both halves encoded cleanly.
        X_list.append(x_one_hot)
        Y_list.append(y_one_hot)

    processed_count = len(X_list)
    print(f"Successfully processed {processed_count} sequences.")
    if skipped_count > 0:
        print(f"Skipped {skipped_count} sequences out of {original_count} (due to length mismatch, '{UNKNOWN_AMINO_ACID}', or unknown chars).")

    if not X_list:
        print("Warning: No valid sequences left after filtering/processing.")
        # Same rank and dtype as the padded arrays so downstream code sees one type.
        return (np.zeros((0, max_seq_length, num_aa_features), dtype=np.float32),
                np.zeros((0, max_seq_length, num_ss_features), dtype=np.float32))

    # Pad with all-zero rows appended after the real residues ('post').
    # NOTE(review): pad_sequences also truncates anything longer than maxlen
    # (Keras documents 'pre' as the default truncation side), so callers
    # should pass a max_seq_length covering the longest kept sequence.
    print(f"Padding sequences to maximum length: {max_seq_length}")
    X_padded = pad_sequences(X_list, maxlen=max_seq_length, padding='post', dtype='float32', value=0.0)
    Y_padded = pad_sequences(Y_list, maxlen=max_seq_length, padding='post', dtype='float32', value=0.0)

    print(f"Generated features shapes -> X: {X_padded.shape}, Y: {Y_padded.shape}")
    return X_padded, Y_padded


# -----------------------------------------------------------------------------
# Main Script Logic: Requirements 1, 2, and Orchestration
# -----------------------------------------------------------------------------

print("\n" + "="*50)
print("Starting Data Preparation Pipeline")
print("="*50)

# --- Define File Paths and URLs ---
# Both files come from the same host; HTTPS is used for both so the training
# data is fetched over the same transport as the test data.
train_url = "https://calla.rnet.missouri.edu/cheng_courses/mlbioinfo/ss_train.txt"
test_url = "https://calla.rnet.missouri.edu/cheng_courses/mlbioinfo/ss_test.txt"
train_file_path = "ss_train.txt"
test_file_path = "ss_test.txt"

# --- Requirement 1 & 2 (Part 1): Download Data ---
print("\n--- Step 1: Checking/Downloading Data Files ---")

# Standard-library downloader: no dependency on an external `wget` binary
# and no shell command assembled from strings.
import urllib.request

for url, file_path in [(train_url, train_file_path), (test_url, test_file_path)]:
    if os.path.exists(file_path):
        print(f"{os.path.basename(file_path)} already exists locally.")
        continue

    print(f"Downloading {os.path.basename(file_path)} from {url}...")
    # Download to a side file first so an interrupted transfer never leaves a
    # truncated file that a later run would mistake for a finished download.
    partial_path = file_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    except OSError as e:  # urllib's URLError/HTTPError derive from OSError
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error: Download failed for {file_path}: {e}")
        print("Please check the URL and your internet connection, or download manually.")
        # Non-zero status so callers can tell the pipeline did not complete.
        raise SystemExit(1) from e
    print(f"{os.path.basename(file_path)} downloaded successfully.")

# --- Requirement 1 & 3: Load Raw Data using data_loader ---
print("\n--- Step 2: Loading Raw Data ---")
raw_sequences_train, raw_structures_train = data_loader(train_file_path)
raw_sequences_test, raw_structures_test = data_loader(test_file_path)

# --- Basic Validation After Loading ---
# Fatal conditions raise SystemExit(1) instead of calling exit(): exit() is a
# helper installed by the `site` module for interactive use, and called with
# no argument it reports success (status 0) even though the run failed.
if not raw_sequences_train or not raw_structures_train:
    print("\nError: Failed to load sufficient training data. Exiting.")
    raise SystemExit(1)
if not raw_sequences_test or not raw_structures_test:
    print("\nWarning: Failed to load test data. Test set will be empty.")
    # Allow continuing, but test set evaluation won't be possible.

# --- Determine Maximum Sequence Length for Padding ---
# Computed over train AND test sequences that will survive the 'X' filter, so
# a single padded length fits every array produced below.
print("\n--- Step 3: Determining Maximum Sequence Length ---")
all_raw_sequences = raw_sequences_train + raw_sequences_test
valid_sequences = [seq for seq in all_raw_sequences if UNKNOWN_AMINO_ACID not in seq]

if not valid_sequences:
    print("\nError: No valid sequences found (without 'X') across train and test sets. Cannot determine padding length.")
    raise SystemExit(1)

max_len = max(len(seq) for seq in valid_sequences)
print(f"Maximum valid sequence length found (used for padding): {max_len}")

# --- Requirement 1 & 4: Generate Features (One-Hot Encode + Pad) ---
print("\n--- Step 4: Generating Features (One-Hot Encoding & Padding) ---")
print("Processing Training Data...")
X_train_full, Y_train_full = generate_features(
    raw_sequences_train,
    raw_structures_train,
    amino_acid_mapping,
    secondary_structure_mapping,
    max_len,
    filter_unknown=True # Skip sequences with 'X'
)

print("\nProcessing Test Data...")
X_test, Y_test = generate_features(
    raw_sequences_test,
    raw_structures_test,
    amino_acid_mapping,
    secondary_structure_mapping,
    max_len,
    filter_unknown=True # Skip sequences with 'X'
)

# --- Check if Feature Generation Succeeded ---
# An empty training set is fatal; an empty test set only disables evaluation.
if X_train_full.shape[0] == 0:
    print("\nError: No training samples remain after feature generation. Check data and filtering.")
    raise SystemExit(1)
if X_test.shape[0] == 0:
    print("\nWarning: No test samples remain after feature generation. Test set is empty.")
# --- Requirement 2: Prepare training/validation/test sets ---
print("\n--- Step 5: Splitting Data into Training and Validation Sets ---")
# Fraction of the processed training file held out for validation (80/20 split).
validation_split_size = 0.2

# Fixed seed + shuffling: the same samples land in each subset on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=validation_split_size,
    random_state=42,
    shuffle=True,
)

print("\n".join([
    f"Split original training data ({X_train_full.shape[0]} samples) into:",
    f"  - Training set:   {X_train.shape[0]} samples",
    f"  - Validation set: {X_val.shape[0]} samples",
]))


# --- Requirement 2: Report total samples and feature dimensions ---
print("\n" + "="*50)
print("Final Dataset Summary")
print("="*50)

# Samples that survived filtering, summed over the three subsets.
total_processed_samples = sum(subset.shape[0] for subset in (X_train, X_val, X_test))
print(f"Total number of samples processed (Train + Validation + Test): {total_processed_samples}")
print(f"  (Note: Original raw files had {len(raw_sequences_train)} train and {len(raw_sequences_test)} test sequences before filtering)")

print("\nDataset Shapes:")
print("\n".join([
    f"  Training Set (X_train):   {X_train.shape}",
    f"  Training Set (Y_train):   {Y_train.shape}",
    f"  Validation Set (X_val):   {X_val.shape}",
    f"  Validation Set (Y_val):   {Y_val.shape}",
    f"  Test Set (X_test):      {X_test.shape}",
    f"  Test Set (Y_test):      {Y_test.shape}",
]))

print("\nFeature Dimensions:")
print("\n".join([
    f"  Sequence Length (after padding):         {max_len}",
    f"  Input Features per time step (Amino Acids): {NUM_AMINO_ACIDS} (One-hot)",
    f"  Output Labels per time step (Structures):  {NUM_SECONDARY_STRUCTURES} (One-hot)",
]))

print("\nData preparation complete. The variables X_train, Y_train, X_val, Y_val, X_test, Y_test are ready for model training/evaluation.")
print("="*50)

# --- Optional: show the per-sample shape of the first training example ---
if X_train.shape[0]:
    print("\nExample: First sample in the processed training set")
    print(f"  X_train[0] shape: {X_train[0].shape}")  # (max_len, num_amino_acids)
    print(f"  Y_train[0] shape: {Y_train[0].shape}")  # (max_len, num_structures)

--- Configuration ---
Amino Acids Mapping: 
 {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6, 'G': 7, 'H': 8, 'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14, 'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19}
Number of Amino Acid Classes: 20

Secondary Structure Mapping: 
 {'C': 0, 'H': 1, 'E': 2}
Number of Secondary Structure Classes: 3
------------------------------

Starting Data Preparation Pipeline

--- Step 1: Checking/Downloading Data Files ---
ss_train.txt already exists locally.
ss_test.txt already exists locally.

--- Step 2: Loading Raw Data ---
Attempting to load data from: ss_train.txt
Skipping header line: '1180 20 3'
Successfully loaded 1180 sequences and 1180 structures from ss_train.txt.
Attempting to load data from: ss_test.txt
Skipping header line: '126 20 3'
Successfully loaded 126 sequences and 126 structures from ss_test.txt.

--- Step 3: Determining Maximum Sequence Length ---
Maximum valid sequence length found (used for padding): 1231

--- Step 4: Genera

## PART 2

### TASK - 1

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, SimpleRNN, LSTM, GRU, Dense, TimeDistributed, Dropout, Bidirectional
# Make sure variables from Part I are defined before running this:
# Example placeholders (replace with actual values from your Part I output):
# max_len = 1231
# NUM_AMINO_ACIDS = 20
# NUM_SECONDARY_STRUCTURES = 3

# --- Hyperparameters (tune freely) ---
RNN_UNITS = 64      # width of the recurrent layer; e.g. 32, 64, 128, 256
DROPOUT_RATE = 0.3  # dropout applied after the recurrent layer; 0 disables it

# --- Make sure the Part I dimensions are available ---
# Touching the three names raises NameError if any one of them is missing, in
# which case all three are replaced by demonstration placeholders.
try:
    _ = (max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES)
except NameError:
    print("\n--- WARNING ---")
    print("Variables `max_len`, `NUM_AMINO_ACIDS`, `NUM_SECONDARY_STRUCTURES` not found.")
    print("Please ensure Part I code has been run and these variables are defined.")
    print("Using placeholder values for demonstration purposes ONLY.")
    print("You MUST replace these with your actual data dimensions for correct results.")
    # Placeholders only - REPLACE with the real values from Part I.
    max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES = 500, 20, 3
    print(f"Using PLACEHOLDERS: max_len={max_len}, NUM_AMINO_ACIDS={NUM_AMINO_ACIDS}, NUM_SECONDARY_STRUCTURES={NUM_SECONDARY_STRUCTURES}")
    print("---------------\n")
else:
    print(f"Using variables from Part I: max_len={max_len}, NUM_AMINO_ACIDS={NUM_AMINO_ACIDS}, NUM_SECONDARY_STRUCTURES={NUM_SECONDARY_STRUCTURES}")

# Helper function to build and report model
def build_and_report_rnn_model(model_type, units, dropout_rate, input_shape, output_classes):
    """
    Build, compile, and print the summary of a per-time-step sequence classifier.

    The model is: Input -> recurrent layer (return_sequences=True) ->
    optional Dropout -> TimeDistributed(Dense(softmax)).

    Args:
        model_type (str): One of 'SimpleRNN', 'LSTM', 'GRU', or 'BiLSTM'.
        units (int): Number of units in the recurrent layer (per direction
                     for 'BiLSTM').
        dropout_rate (float): Dropout rate applied after the recurrent layer;
                              0 or None disables the Dropout layer.
        input_shape (tuple): (sequence_length, features_per_step).
        output_classes (int): Number of classes predicted at each time step.

    Returns:
        The compiled Keras Sequential model.

    Raises:
        ValueError: If `model_type` is not one of the supported names.
    """
    # Factories (not instances) so only the requested layer is ever created.
    # return_sequences=True makes each layer emit one output per time step,
    # which the many-to-many prediction below requires.
    layer_factories = {
        'SimpleRNN': lambda: SimpleRNN(units, return_sequences=True),
        'LSTM': lambda: LSTM(units, return_sequences=True),
        'GRU': lambda: GRU(units, return_sequences=True),
        'BiLSTM': lambda: Bidirectional(LSTM(units, return_sequences=True)),
    }
    if model_type not in layer_factories:
        raise ValueError(
            f"Unsupported model_type {model_type!r}. "
            "Choose 'SimpleRNN', 'LSTM', 'GRU', or 'BiLSTM'."
        )
    rnn_layer = layer_factories[model_type]()
    model_name = f"{model_type}_Protein_SS_Model"

    print("\n" + "="*60)
    print(f"Building Model: {model_name}")
    print("="*60)

    model = Sequential(name=model_name)
    model.add(Input(shape=input_shape, name='Input_Sequence'))
    model.add(rnn_layer)

    # Optional regularization between the recurrent layer and the classifier.
    if dropout_rate and dropout_rate > 0:
        model.add(Dropout(dropout_rate, name='RNN_Dropout'))

    # The same Dense softmax classifier is applied independently at every
    # time step, giving one class distribution per residue position.
    model.add(TimeDistributed(Dense(output_classes, activation='softmax'), name='Output_Probabilities'))

    # categorical_crossentropy matches one-hot labels with a softmax output.
    # NOTE(review): no masking is configured here, so padded time steps are
    # part of what the loss and accuracy are averaged over. If padding is a
    # large share of each sequence, consider a Masking layer or per-step
    # sample weights - TODO confirm the effect on the reported accuracy.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # --- Report Summary ---
    print(f"\n--- Model Summary for {model_name} ---")
    model.summary()

    total_params = model.count_params()
    print(f"\nTotal number of parameters to optimize: {total_params:,}")
    print("="*60)

    return model

# Dimensions shared by every model: (sequence_length, features_per_step) in,
# one class distribution per time step out.
input_shape = (max_len, NUM_AMINO_ACIDS)
output_classes = NUM_SECONDARY_STRUCTURES

# =====================================================
# Task 1: Create and Report the three RNN Models
# =====================================================

# All three models use identical settings; only the recurrent layer type differs.
_common_model_args = dict(
    units=RNN_UNITS,
    dropout_rate=DROPOUT_RATE,
    input_shape=input_shape,
    output_classes=output_classes,
)

# 1. SimpleRNN Model
model_simple_rnn = build_and_report_rnn_model(model_type='SimpleRNN', **_common_model_args)

# 2. LSTM Model
model_lstm = build_and_report_rnn_model(model_type='LSTM', **_common_model_args)

# 3. GRU Model
model_gru = build_and_report_rnn_model(model_type='GRU', **_common_model_args)

# You can also optionally explore a Bidirectional model:
# model_bilstm = build_and_report_rnn_model(
#     model_type='BiLSTM',
#     units=RNN_UNITS,
#     dropout_rate=DROPOUT_RATE,
#     input_shape=input_shape,
#     output_classes=output_classes
# )

Using variables from Part I: max_len=1231, NUM_AMINO_ACIDS=20, NUM_SECONDARY_STRUCTURES=3

Building Model: SimpleRNN_Protein_SS_Model

--- Model Summary for SimpleRNN_Protein_SS_Model ---



Total number of parameters to optimize: 5,635

Building Model: LSTM_Protein_SS_Model

--- Model Summary for LSTM_Protein_SS_Model ---



Total number of parameters to optimize: 21,955

Building Model: GRU_Protein_SS_Model

--- Model Summary for GRU_Protein_SS_Model ---



Total number of parameters to optimize: 16,707


### TASK - 2

In [None]:
import numpy as np
import tensorflow as tf

# --- Configuration ---
NUM_SIMULATED_SAMPLES = 32 # Number of random sequences to generate for testing
BATCH_SIZE_SIM = 16      # Batch size for simulated training
EPOCHS_SIM = 2           # Number of epochs for simulated training (just need 1 or 2)

print("\n" + "="*60)
print("Part II - Task 2: Test Model Training with Simulated Data")
print("="*60)

# --- Reuse Variables from Part I & Models from Part II Task 1 ---
# Only the name lookups are guarded: a NameError here means an earlier cell
# was not run in this session.
try:
    _ = (max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES)
    models_to_test = {
        "SimpleRNN": model_simple_rnn,
        "LSTM": model_lstm,
        "GRU": model_gru
    }
except NameError as e:
    print("\n--- ERROR ---")
    print(f"Variable or Model not found: {e}")
    print("Please ensure Part I (data preparation) and Part II Task 1 (model definition)")
    print("have been executed successfully in the same session before running this task.")
    print("Cannot proceed with simulated training test.")
    print("-------------")
    # SystemExit(1) rather than exit(): exit() is a helper installed by the
    # `site` module for interactive use and, with no argument, signals success.
    raise SystemExit(1) from e

print("Using variables and models defined in previous steps:")
print(f"  max_len={max_len}, NUM_AMINO_ACIDS={NUM_AMINO_ACIDS}, NUM_SECONDARY_STRUCTURES={NUM_SECONDARY_STRUCTURES}")
print(f"  Models: {', '.join(m.name for m in models_to_test.values())}")

# --- Step 1: Generate Simulated Data ---
print(f"\nGenerating simulated data with {NUM_SIMULATED_SAMPLES} samples...")


def _simulate_one_hot(tag, num_classes):
    """
    Draw random class indices and one-hot encode them.

    Args:
        tag (str): Label used in the progress messages ('X' or 'Y').
        num_classes (int): Number of classes; indices are drawn from
                           0 to num_classes - 1.

    Returns:
        tuple: (indices, one_hot, one_hot_float32) where indices is an int32
               array of shape (NUM_SIMULATED_SAMPLES, max_len), one_hot is
               the to_categorical result, and one_hot_float32 is that result
               cast to float32.
    """
    print(f"Generating random indices for {tag}...")
    indices = np.random.randint(
        low=0,
        high=num_classes,
        size=(NUM_SIMULATED_SAMPLES, max_len),
        dtype=np.int32,
    )

    print(f"Converting {tag} indices to one-hot...")
    one_hot = tf.keras.utils.to_categorical(indices, num_classes=num_classes)
    # Cast explicitly so the training arrays are float32 regardless of the
    # dtype to_categorical happens to return.
    one_hot_float32 = one_hot.astype(np.float32)
    print(f"{tag}_sim intermediate dtype: {one_hot.dtype}, final dtype: {one_hot_float32.dtype}")
    return indices, one_hot, one_hot_float32


# Inputs first, then labels, so the random draws happen in a fixed order.
# Simulated input X: (num_samples, max_len, NUM_AMINO_ACIDS)
x_indices_sim, X_sim_temp, X_sim = _simulate_one_hot("X", NUM_AMINO_ACIDS)

# Simulated output Y: (num_samples, max_len, NUM_SECONDARY_STRUCTURES)
y_indices_sim, Y_sim_temp, Y_sim = _simulate_one_hot("Y", NUM_SECONDARY_STRUCTURES)


print("\n".join([
    "\nSimulated data generated:",
    f"  X_sim shape: {X_sim.shape}, X_sim dtype: {X_sim.dtype}",  # (NUM_SAMPLES, max_len, NUM_AA), float32
    f"  Y_sim shape: {Y_sim.shape}, Y_sim dtype: {Y_sim.dtype}",  # (NUM_SAMPLES, max_len, NUM_SS), float32
]))


# --- Step 2: Test Training for Each Model ---
print(f"\nTesting model training for {EPOCHS_SIM} epochs with batch size {BATCH_SIZE_SIM}...")

# Stays True only if every model gets through fit() without raising.
training_successful = True

for model_name, model in models_to_test.items():
    print(f"\n--- Testing Training for: {model.name} ---")
    try:
        # A couple of epochs on random data: this checks that shapes and
        # dtypes line up end to end, not that anything is learned.
        history = model.fit(
            X_sim,
            Y_sim,
            epochs=EPOCHS_SIM,
            batch_size=BATCH_SIZE_SIM,
            verbose=1,
        )
    except Exception as e:
        # Record the failure but keep going so every model gets tested.
        training_successful = False
        print(f"\n[FAILURE] An error occurred during training simulation for {model.name}:")
        print(f"  Error Type: {type(e).__name__}")
        print(f"  Error Message: {e}")
        import traceback
        traceback.print_exc()
    else:
        print(f"\n[SUCCESS] Training simulation completed without errors for {model.name}.")

# --- Overall verdict ---
print("\n" + "="*60)
print(
    "Task 2 Requirement Met: All models successfully completed the training simulation loop without errors."
    if training_successful
    else "Task 2 Requirement NOT Met: One or more models encountered errors during the training simulation."
)
print("="*60)


Part II - Task 2: Test Model Training with Simulated Data
Using variables and models defined in previous steps:
  max_len=1231, NUM_AMINO_ACIDS=20, NUM_SECONDARY_STRUCTURES=3
  Models: SimpleRNN_Protein_SS_Model, LSTM_Protein_SS_Model, GRU_Protein_SS_Model

Generating simulated data with 32 samples...
Generating random indices for X...
Converting X indices to one-hot...
X_sim intermediate dtype: float64, final dtype: float32
Generating random indices for Y...
Converting Y indices to one-hot...
Y_sim intermediate dtype: float64, final dtype: float32

Simulated data generated:
  X_sim shape: (32, 1231, 20), X_sim dtype: float32
  Y_sim shape: (32, 1231, 3), Y_sim dtype: float32

Testing model training for 2 epochs with batch size 16...

--- Testing Training for: SimpleRNN_Protein_SS_Model ---
Epoch 1/2
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 152ms/step - accuracy: 0.3345 - loss: 1.1742
Epoch 2/2
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/

### TASK - 3

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import os  # <--- IMPORT OS MODULE HERE

# --- Configuration ---
EPOCHS = 20          # Maximum number of epochs (EarlyStopping will likely stop it sooner)
BATCH_SIZE = 32      # Samples per gradient update; also read by evaluate_model_performance
PATIENCE = 5         # How many epochs to wait for improvement before stopping early
MODEL_SAVE_DIR = "saved_models" # Directory to save best models (optional)

print("\n" + "="*60)
print("Part II - Task 3: Train and Evaluate Models on Real Data")
print("="*60)

# --- Ensure Data and Models Exist ---
# Merely referencing a name raises NameError when the cell that defines it has
# not been run in this session, so the tuples below act as an existence check.
try:
    # Data arrays, expected from Part I
    _ = X_train, Y_train, X_val, Y_val, X_test, Y_test
    # Models, expected from Part II, Task 1
    _ = model_simple_rnn, model_lstm, model_gru
    # Shape / class-count variables, expected from Part I
    _ = max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES
except NameError as e:
    print("\n--- ERROR ---")
    print(f"Variable or Model not found: {e}")
    print("Please ensure Part I (data prep) and Part II Task 1 (model definition)")
    print("have been executed successfully in the same session before running this task.")
    print("Cannot proceed with training.")
    print("-------------")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # that the `site` module adds for interactive shells and is not guaranteed
    # to exist. The non-zero status also signals the failure to a caller.
    raise SystemExit(1) from e

print("Successfully loaded data and models from previous parts.")
print(f"Training data shape: X={X_train.shape}, Y={Y_train.shape}")
print(f"Validation data shape: X={X_val.shape}, Y={Y_val.shape}")
print(f"Test data shape: X={X_test.shape}, Y={Y_test.shape}")

# Models are trained in insertion order; the keys become the keys of `results`.
models_to_train = {
    "SimpleRNN": model_simple_rnn,
    "LSTM": model_lstm,
    "GRU": model_gru
}
# Every trained model is evaluated on each of these (inputs, targets) pairs.
datasets = {
    "Training": (X_train, Y_train),
    "Validation": (X_val, Y_val),
    "Test": (X_test, Y_test)
}
results = {} # Evaluation results, keyed by model name and then by dataset name

# --- Helper Function for Masked Evaluation ---
def evaluate_model_performance(model, x_data, y_data, dataset_name):
    """Evaluate ``model`` on one dataset and compute metrics that skip padded steps.

    Runs ``model.evaluate`` for the loss/accuracy that Keras reports, then
    derives accuracy, weighted precision/recall/F1 and a per-class report from
    the arg-max predictions, restricted to time steps whose input feature
    vector has a positive sum (all-zero steps are treated as padding).

    Args:
        model: Object exposing ``evaluate`` and ``predict``. ``evaluate`` must
            return exactly two values (loss, accuracy).
        x_data: Input array; its last axis is summed to build the padding mask.
            Presumably one-hot with shape (N, L, features) -- confirm against
            the data-preparation cell.
        y_data: Target array whose last axis holds the per-class values.
        dataset_name: Label used only in the printed messages.

    Returns:
        dict with keys ``loss``, ``accuracy_keras``, ``accuracy``,
        ``precision``, ``recall``, ``f1_score`` and ``report``. When no
        non-padded step is found, the masked metrics are 0.0 and ``report``
        is the string "No non-padded steps".
    """
    print(f"\n--- Evaluating on {dataset_name} Set ---")

    # 1. Loss and accuracy as computed by Keras over every time step.
    loss, accuracy = model.evaluate(x_data, y_data, verbose=0, batch_size=BATCH_SIZE)
    print(f"  Keras Evaluate -> Loss: {loss:.4f}, Accuracy (potentially includes padding): {accuracy:.4f}")

    # 2. Per-step class probabilities.
    print("  Generating predictions...")
    y_pred_prob = model.predict(x_data, batch_size=BATCH_SIZE, verbose=0)

    # 3. Collapse the class axis to integer class indices.
    y_pred_indices = np.argmax(y_pred_prob, axis=-1)
    y_true_indices = np.argmax(y_data, axis=-1)

    # 4. A step counts as real data when its feature vector is not all zeros.
    # NOTE(review): any genuine residue encoded as an all-zero vector would be
    # dropped here as well -- confirm how unknown residues are encoded.
    mask = np.sum(x_data, axis=-1) > 1e-6  # small threshold for float comparison

    # 5. Boolean indexing both flattens the arrays and removes padded steps.
    y_pred_flat_masked = y_pred_indices[mask]
    y_true_flat_masked = y_true_indices[mask]

    non_padded_steps = int(np.sum(mask))
    print(f"  Total time steps: {mask.size}, Non-padded steps: {non_padded_steps}")
    if non_padded_steps == 0:
        print("  Warning: No non-padded steps found based on input mask. Cannot calculate detailed metrics.")
        return {
            "loss": loss,
            "accuracy_keras": accuracy,
            "accuracy": 0.0 if accuracy is not None else None,
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "report": "No non-padded steps"}

    # 6. Aggregate metrics on the masked steps only.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true_flat_masked,
        y_pred_flat_masked,
        average='weighted',
        zero_division=0
    )
    accuracy_masked = accuracy_score(y_true_flat_masked, y_pred_flat_masked)

    print("  Metrics (on non-padded steps):")
    print(f"    Accuracy:  {accuracy_masked:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall:    {recall:.4f}")
    print(f"    F1-score:  {f1:.4f}")

    # 7. Per-class report.
    try:
        target_names_list = list(SECONDARY_STRUCTURES) if 'SECONDARY_STRUCTURES' in globals() else [f'Class_{i}' for i in range(NUM_SECONDARY_STRUCTURES)]

        # Pass the label set explicitly. Without it scikit-learn infers the
        # labels from the data, and the report fails with a length mismatch
        # against target_names whenever a class is absent from this slice.
        report = classification_report(
            y_true_flat_masked,
            y_pred_flat_masked,
            labels=list(range(len(target_names_list))),
            target_names=target_names_list,
            digits=4,
            zero_division=0
        )
        print("  Classification Report (on non-padded steps):\n", report)
    except Exception as report_err:
        # The report is optional; keep the numeric metrics even if it fails.
        print(f"  Could not generate classification report: {report_err}")
        report = "Error generating report"

    return {
        "loss": loss,
        "accuracy_keras": accuracy,
        "accuracy": accuracy_masked,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "report": report
    }


# --- Create directory for saving models (optional) ---
# A failure here is only reported; the run continues without the directory.
model_dir_missing = not os.path.exists(MODEL_SAVE_DIR)
if model_dir_missing:
    try:
        os.makedirs(MODEL_SAVE_DIR)
    except OSError as e:
        print(f"Error creating directory {MODEL_SAVE_DIR}: {e}")
    else:
        print(f"Created directory for saving models: {MODEL_SAVE_DIR}")

# --- Train and Evaluate Each Model ---
import gc

# Metric keys stored as None for a dataset whose evaluation is skipped.
skipped_metric_keys = ("loss", "accuracy", "precision", "recall", "f1_score")

for model_name, model in models_to_train.items():
    hash_rule = "#" * 60
    print("\n" + hash_rule)
    print(f"# Training Model: {model.name}")
    print(hash_rule)

    # Stop after PATIENCE epochs without val_loss improvement and roll the
    # model back to the weights of its best epoch.
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=PATIENCE, verbose=1, restore_best_weights=True
    )
    callbacks_list = [early_stopping]

    history = model.fit(
        X_train,
        Y_train,
        validation_data=(X_val, Y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks_list,
        verbose=1,
    )

    print(f"\n--- Finished Training {model.name} ---")

    # Evaluate on every dataset, recording a placeholder for empty ones.
    model_results = {}
    for dataset_name, (x_data, y_data) in datasets.items():
        if dataset_name == "Test" and (x_data is None or x_data.shape[0] == 0):
            print("\nSkipping evaluation on empty Test set.")
            skip_reason = "Test set empty"
        elif y_data is None or y_data.shape[0] == 0:
            print(f"\nSkipping evaluation on {dataset_name} set due to empty Y data.")
            skip_reason = f"{dataset_name} set Y empty"
        else:
            skip_reason = None

        if skip_reason is None:
            model_results[dataset_name] = evaluate_model_performance(model, x_data, y_data, dataset_name)
        else:
            model_results[dataset_name] = {**dict.fromkeys(skipped_metric_keys), "report": skip_reason}

    results[model_name] = model_results

    # Free memory before the next model is trained.
    # NOTE(review): clear_session() resets Keras global state while the
    # remaining models in models_to_train are still untrained -- confirm this
    # is safe for the installed Keras version.
    tf.keras.backend.clear_session()
    gc.collect()

# --- Requirement: Report Final Metrics ---
summary_rule = "=" * 70

print("\n" + summary_rule)
print("Final Performance Summary")
print(summary_rule)

for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # A missing or None accuracy marks a dataset that was skipped.
        if metrics.get('accuracy') is None:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics.get('report', 'Reason unknown')})")
            continue
        print(f"  {dataset_name} Set:")
        print(f"    Accuracy:  {metrics['accuracy']:.4f} (on non-padded steps)")
        print(f"    Precision: {metrics['precision']:.4f}")
        print(f"    Recall:    {metrics['recall']:.4f}")
        print(f"    F1-score:  {metrics['f1_score']:.4f}")
        print(f"    Loss:      {metrics['loss']:.4f}")
    print("-" * 40)

print("\n" + summary_rule)
print("Detailed Classification Reports (on non-padded steps)")
print(summary_rule)
for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # Entries without a report have nothing to show.
        if 'report' not in metrics:
            continue
        if metrics['accuracy'] is not None:
            print(f"  {dataset_name} Set Report:")
            print(metrics['report'])
        else:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics['report']})")
        print("-" * 20)
    print("-" * 40)


Part II - Task 3: Train and Evaluate Models on Real Data
Successfully loaded data and models from previous parts.
Training data shape: X=(932, 1231, 20), Y=(932, 1231, 3)
Validation data shape: X=(234, 1231, 20), Y=(234, 1231, 3)
Test data shape: X=(124, 1231, 20), Y=(124, 1231, 3)

############################################################
# Training Model: SimpleRNN_Protein_SS_Model
############################################################
Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m627s[0m 13s/step - accuracy: 0.4506 - loss: 0.2173 - val_accuracy: 0.3362 - val_loss: 0.2008
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 100ms/step - accuracy: 0.3322 - loss: 0.1982 - val_accuracy: 0.8610 - val_loss: 0.1941
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 97ms/step - accuracy: 0.3783 - loss: 0.1917 - val_accuracy: 0.1120 - val_loss: 0.1892
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

### TASK - 4

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import os
import copy # Potentially needed when defining a Bidirectional layer from a config

# --- Configuration (can reuse from Task 3 or redefine if needed) ---
EPOCHS = 20          # Maximum number of epochs
# Rebinding BATCH_SIZE also changes the batch size used by
# evaluate_model_performance, which reads this global at call time.
BATCH_SIZE = 8      # Samples per gradient update
PATIENCE = 5         # How many epochs to wait for improvement before stopping early
RNN_UNITS = 64       # RNN units (same as before for comparison)
DROPOUT_RATE = 0.3   # Dropout rate (same as before for comparison)
# MODEL_SAVE_DIR = "saved_models" # Defined in Task 3

print("\n" + "="*60)
print("Part II - Task 4: Train and Evaluate Bidirectional Models")
print("="*60)

# --- Ensure Data and Base Variables Exist ---
# Merely referencing a name raises NameError when the cell that defines it has
# not been run in this session, so the statements below act as an existence check.
try:
    # Data arrays, expected from Part I
    _ = X_train, Y_train, X_val, Y_val, X_test, Y_test
    # Shape / class-count variables, expected from Part I
    _ = max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES
    # Evaluation function, expected from Task 3
    _ = evaluate_model_performance
    # Datasets dictionary, expected from Task 3
    _ = datasets
    # Results dictionary, expected from Task 3 (this task adds entries to it)
    _ = results
except NameError as e:
    print("\n--- ERROR ---")
    print(f"Variable, Function or Dictionary not found: {e}")
    print("Please ensure Part I (data prep), Part II Task 1 (model def), and Task 3 (uni-training/eval func)")
    print("have been executed successfully in the same session before running this task.")
    print("Cannot proceed with training.")
    print("-------------")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # that the `site` module adds for interactive shells and is not guaranteed
    # to exist. The non-zero status also signals the failure to a caller.
    raise SystemExit(1) from e

print("Successfully loaded data, variables, evaluation function, and datasets dict from previous parts.")
print(f"Input shape: {(max_len, NUM_AMINO_ACIDS)}, Output classes: {NUM_SECONDARY_STRUCTURES}")


# --- Helper function to build Bidirectional models ---
def build_and_report_bidirectional_model(model_type, units, dropout_rate, input_shape, output_classes, merge_mode='concat'):
    """Assemble, compile and summarise a bidirectional LSTM or GRU sequence model.

    The network is Input -> Bidirectional(RNN) -> optional Dropout ->
    TimeDistributed(Dense softmax), compiled with the Adam optimizer,
    categorical cross-entropy loss and an accuracy metric. The Keras summary
    and the total parameter count are printed.

    Args:
        model_type: 'BiLSTM' or 'BiGRU'.
        units: Number of units of the wrapped recurrent layer.
        dropout_rate: Dropout rate; the Dropout layer is added only when > 0.
        input_shape: Shape passed to the Input layer.
        output_classes: Number of units of the per-step softmax layer.
        merge_mode: Forwarded to ``Bidirectional``; defaults to 'concat'.

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Recurrent layer class and model name for each supported type.
    supported_types = {
        'BiLSTM': (LSTM, 'BiLSTM_Protein_SS_Model'),
        'BiGRU': (GRU, 'BiGRU_Protein_SS_Model'),
    }
    if model_type not in supported_types:
        raise ValueError("Unsupported model_type for bidirectional. Choose 'BiLSTM' or 'BiGRU'.")
    rnn_class, model_name = supported_types[model_type]

    # return_sequences=True keeps one output per time step, which the
    # per-step classifier below requires.
    base_rnn_layer = rnn_class(units, return_sequences=True)

    divider = "=" * 60
    print("\n" + divider)
    print(f"Building Model: {model_name}")
    print(divider)

    model = Sequential(name=model_name)
    model.add(Input(shape=input_shape, name='Input_Sequence'))

    # The wrapper runs the recurrent layer in both directions and combines
    # the two outputs according to merge_mode.
    model.add(Bidirectional(base_rnn_layer, merge_mode=merge_mode))

    if dropout_rate > 0:
        model.add(Dropout(dropout_rate, name='BiRNN_Dropout'))

    # One softmax classifier applied independently at every time step.
    per_step_classifier = Dense(output_classes, activation='softmax')
    model.add(TimeDistributed(per_step_classifier, name='Output_Probabilities'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    print(f"\n--- Model Summary for {model_name} ---")
    model.summary()
    print(f"\nTotal number of parameters to optimize: {model.count_params():,}")
    print(divider)

    return model

# --- Define and Build Bidirectional Models ---
input_shape = (max_len, NUM_AMINO_ACIDS)
output_classes = NUM_SECONDARY_STRUCTURES

# Both variants share every hyperparameter; only the recurrent layer type differs.
shared_build_kwargs = {
    "units": RNN_UNITS,
    "dropout_rate": DROPOUT_RATE,
    "input_shape": input_shape,
    "output_classes": output_classes,
}

model_bilstm = build_and_report_bidirectional_model(model_type='BiLSTM', **shared_build_kwargs)
model_bigru = build_and_report_bidirectional_model(model_type='BiGRU', **shared_build_kwargs)

# New models to train, keyed by the name used in the results dictionary
models_to_train_bi = {"BiLSTM": model_bilstm, "BiGRU": model_bigru}

# --- Train and Evaluate Bidirectional Models ---
import gc

# Metric keys stored as None for a dataset whose evaluation is skipped.
skipped_metric_keys = ("loss", "accuracy", "precision", "recall", "f1_score")

for model_name, model in models_to_train_bi.items():
    hash_rule = "#" * 60
    print("\n" + hash_rule)
    print(f"# Training Model: {model.name}")
    print(hash_rule)

    # Stop after PATIENCE epochs without val_loss improvement and roll the
    # model back to the weights of its best epoch.
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=PATIENCE, verbose=1, restore_best_weights=True
    )
    callbacks_list = [early_stopping]

    history = model.fit(
        X_train,
        Y_train,
        validation_data=(X_val, Y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks_list,
        verbose=1,
    )

    print(f"\n--- Finished Training {model.name} ---")

    # Evaluate on every dataset, recording a placeholder for empty ones.
    model_results_bi = {}
    for dataset_name, (x_data, y_data) in datasets.items():
        if dataset_name == "Test" and (x_data is None or x_data.shape[0] == 0):
            print("\nSkipping evaluation on empty Test set.")
            skip_reason = "Test set empty"
        elif y_data is None or y_data.shape[0] == 0:
            print(f"\nSkipping evaluation on {dataset_name} set due to empty Y data.")
            skip_reason = f"{dataset_name} set Y empty"
        else:
            skip_reason = None

        if skip_reason is None:
            model_results_bi[dataset_name] = evaluate_model_performance(model, x_data, y_data, dataset_name)
        else:
            model_results_bi[dataset_name] = {**dict.fromkeys(skipped_metric_keys), "report": skip_reason}

    # Store this model's metrics in the shared results dictionary.
    results[model_name] = model_results_bi

    # Free memory before the next model is trained.
    # NOTE(review): clear_session() resets Keras global state while the
    # remaining model in models_to_train_bi is still untrained -- confirm this
    # is safe for the installed Keras version.
    tf.keras.backend.clear_session()
    gc.collect()


# --- Requirement: Report Final Metrics (Now includes Bidirectional models) ---
summary_rule = "=" * 70

print("\n" + summary_rule)
print("Final Performance Summary (Including Bidirectional Models)")
print(summary_rule)

# `results` holds an entry for every model stored in it so far.
for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # A missing or None accuracy marks a dataset that was skipped.
        if metrics.get('accuracy') is None:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics.get('report', 'Reason unknown')})")
            continue
        print(f"  {dataset_name} Set:")
        print(f"    Accuracy:  {metrics['accuracy']:.4f} (on non-padded steps)")
        print(f"    Precision: {metrics['precision']:.4f}")
        print(f"    Recall:    {metrics['recall']:.4f}")
        print(f"    F1-score:  {metrics['f1_score']:.4f}")
        print(f"    Loss:      {metrics['loss']:.4f}")
    print("-" * 40)


print("\n" + summary_rule)
print("Detailed Classification Reports (Including Bidirectional Models)")
print(summary_rule)
for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # Entries without a report have nothing to show.
        if 'report' not in metrics:
            continue
        if metrics.get('accuracy') is not None:
            print(f"  {dataset_name} Set Report:")
            print(metrics['report'])
        else:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics['report']})")
        print("-" * 20)
    print("-" * 40)


Part II - Task 4: Train and Evaluate Bidirectional Models
Successfully loaded data, variables, evaluation function, and datasets dict from previous parts.
Input shape: (1231, 20), Output classes: 3

Building Model: BiLSTM_Protein_SS_Model

--- Model Summary for BiLSTM_Protein_SS_Model ---



Total number of parameters to optimize: 43,907

Building Model: BiGRU_Protein_SS_Model

--- Model Summary for BiGRU_Protein_SS_Model ---



Total number of parameters to optimize: 33,411

############################################################
# Training Model: BiLSTM_Protein_SS_Model
############################################################
Epoch 1/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 375ms/step - accuracy: 0.8957 - loss: 0.1932 - val_accuracy: 0.9116 - val_loss: 0.1832
Epoch 2/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 242ms/step - accuracy: 0.9170 - loss: 0.1753 - val_accuracy: 0.9237 - val_loss: 0.1692
Epoch 3/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 240ms/step - accuracy: 0.9234 - loss: 0.1664 - val_accuracy: 0.9240 - val_loss: 0.1678
Epoch 4/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 239ms/step - accuracy: 0.9258 - loss: 0.1618 - val_accuracy: 0.9269 - val_loss: 0.1628
Epoch 5/20
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 244ms/step - accuracy: 0.9273 - loss: 0.1606 - v

### TASK - 5

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Dropout, Dense, TimeDistributed, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import os

# --- Configuration ---
EPOCHS = 18          # Maximum number of epochs
BATCH_SIZE = 16      # Samples per gradient update; also read by evaluate_model_performance
PATIENCE = 5         # Epochs without val_loss improvement before stopping early
MODEL_SAVE_DIR = "saved_models"

# The banner previously announced "Task 3" (copied from the earlier cell);
# this cell trains and evaluates the CNN of Task 5.
print("\n" + "="*60)
print("Part II - Task 5: Train and Evaluate CNN Model on Real Data")
print("="*60)

# --- Ensure Data and Models Exist ---
# Merely referencing a name raises NameError when the cell that defines it has
# not been run in this session, so the tuples below act as an existence check.
try:
    _ = X_train, Y_train, X_val, Y_val, X_test, Y_test
    # NOTE(review): the RNN models are not used anywhere in this cell; this
    # check only makes the cell depend on Part II Task 1 having been run.
    _ = model_simple_rnn, model_lstm, model_gru
    _ = max_len, NUM_AMINO_ACIDS, NUM_SECONDARY_STRUCTURES
except NameError as e:
    print("\n--- ERROR ---")
    print(f"Variable or Model not found: {e}")
    print("Please ensure Part I (data prep) and Part II Task 1 (model definition)")
    print("have been executed successfully in the same session before running this task.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # that the `site` module adds for interactive shells and is not guaranteed
    # to exist. The non-zero status also signals the failure to a caller.
    raise SystemExit(1) from e

print("Successfully loaded data and models from previous parts.")
print(f"Training data shape: X={X_train.shape}, Y={Y_train.shape}")
print(f"Validation data shape: X={X_val.shape}, Y={Y_val.shape}")
print(f"Test data shape: X={X_test.shape}, Y={Y_test.shape}")

# Starts empty; the CNN is registered after it has been built.
models_to_train = {}
datasets = {
    "Training": (X_train, Y_train),
    "Validation": (X_val, Y_val),
    "Test": (X_test, Y_test)
}
# NOTE(review): rebinding `results` drops any metrics stored by earlier
# cells -- confirm that a fresh dictionary is intended here.
results = {}

# --- Define CNN Model ---
def build_cnn_model(input_shape, num_classes):
    """Build and compile a three-layer 1-D convolutional per-step classifier.

    Two Conv1D stages (128 filters / width 7, then 64 filters / width 5) are
    each followed by batch normalization and dropout of 0.3. A third Conv1D
    (32 filters / width 3) feeds a TimeDistributed softmax Dense layer. All
    convolutions use "same" padding and ReLU activation.

    Args:
        input_shape: Shape passed to the Input layer.
        num_classes: Number of units of the softmax output layer.

    Returns:
        The Keras Model named "cnn_model", compiled with the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric.
    """
    inputs = Input(shape=input_shape)

    features = inputs
    for n_filters, width in ((128, 7), (64, 5)):
        features = Conv1D(filters=n_filters, kernel_size=width, padding="same", activation="relu")(features)
        features = BatchNormalization()(features)
        features = Dropout(0.3)(features)

    # The last convolution has no normalization or dropout after it.
    features = Conv1D(filters=32, kernel_size=3, padding="same", activation="relu")(features)
    outputs = TimeDistributed(Dense(num_classes, activation="softmax"))(features)

    model = Model(inputs, outputs, name="cnn_model")
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

# --- Add CNN to Models ---
# Build the CNN for the padded sequence shape and queue it for training.
cnn_input_shape = (max_len, NUM_AMINO_ACIDS)
model_cnn = build_cnn_model(cnn_input_shape, NUM_SECONDARY_STRUCTURES)
models_to_train["CNN"] = model_cnn

# --- Helper Function for Masked Evaluation ---
def evaluate_model_performance(model, x_data, y_data, dataset_name):
    """Evaluate ``model`` on one dataset and compute metrics that skip padded steps.

    Runs ``model.evaluate`` for the loss/accuracy that Keras reports, then
    derives accuracy, weighted precision/recall/F1 and a per-class report from
    the arg-max predictions, restricted to time steps whose input feature
    vector has a positive sum (all-zero steps are treated as padding).

    Args:
        model: Object exposing ``evaluate`` and ``predict``. ``evaluate`` must
            return exactly two values (loss, accuracy).
        x_data: Input array; its last axis is summed to build the padding mask.
            Presumably one-hot with shape (N, L, features) -- confirm against
            the data-preparation cell.
        y_data: Target array whose last axis holds the per-class values.
        dataset_name: Label used only in the printed messages.

    Returns:
        dict with keys ``loss``, ``accuracy_keras``, ``accuracy``,
        ``precision``, ``recall``, ``f1_score`` and ``report``. When no
        non-padded step is found, the masked metrics are 0.0 and ``report``
        is the string "No non-padded steps".
    """
    print(f"\n--- Evaluating on {dataset_name} Set ---")

    # Loss and accuracy as computed by Keras over every time step.
    loss, accuracy = model.evaluate(x_data, y_data, verbose=0, batch_size=BATCH_SIZE)
    print(f"  Keras Evaluate -> Loss: {loss:.4f}, Accuracy (potentially includes padding): {accuracy:.4f}")

    print("  Generating predictions...")
    y_pred_prob = model.predict(x_data, batch_size=BATCH_SIZE, verbose=0)

    # Collapse the class axis to integer class indices.
    y_pred_indices = np.argmax(y_pred_prob, axis=-1)
    y_true_indices = np.argmax(y_data, axis=-1)

    # A step counts as real data when its feature vector is not all zeros.
    # NOTE(review): any genuine residue encoded as an all-zero vector would be
    # dropped here as well -- confirm how unknown residues are encoded.
    mask = np.sum(x_data, axis=-1) > 1e-6

    # Boolean indexing both flattens the arrays and removes padded steps.
    y_pred_flat_masked = y_pred_indices[mask]
    y_true_flat_masked = y_true_indices[mask]

    non_padded_steps = int(np.sum(mask))
    print(f"  Total time steps: {mask.size}, Non-padded steps: {non_padded_steps}")
    if non_padded_steps == 0:
        print("  Warning: No non-padded steps found based on input mask. Cannot calculate detailed metrics.")
        return {
            "loss": loss,
            "accuracy_keras": accuracy,
            "accuracy": 0.0 if accuracy is not None else None,
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "report": "No non-padded steps"}

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true_flat_masked, y_pred_flat_masked, average='weighted', zero_division=0)
    accuracy_masked = accuracy_score(y_true_flat_masked, y_pred_flat_masked)

    print("  Metrics (on non-padded steps):")
    print(f"    Accuracy:  {accuracy_masked:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall:    {recall:.4f}")
    print(f"    F1-score:  {f1:.4f}")

    try:
        target_names_list = list(SECONDARY_STRUCTURES) if 'SECONDARY_STRUCTURES' in globals() else [f'Class_{i}' for i in range(NUM_SECONDARY_STRUCTURES)]
        # Pass the label set explicitly. Without it scikit-learn infers the
        # labels from the data, and the report fails with a length mismatch
        # against target_names whenever a class is absent from this slice.
        report = classification_report(
            y_true_flat_masked,
            y_pred_flat_masked,
            labels=list(range(len(target_names_list))),
            target_names=target_names_list,
            digits=4,
            zero_division=0,
        )
        print("  Classification Report (on non-padded steps):\n", report)
    except Exception as report_err:
        # The report is optional; keep the numeric metrics even if it fails.
        print(f"  Could not generate classification report: {report_err}")
        report = "Error generating report"

    return {
        "loss": loss,
        "accuracy_keras": accuracy,
        "accuracy": accuracy_masked,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "report": report
    }

# --- Create directory for saving models (optional) ---
# A failure here is only reported; the run continues without the directory.
model_dir_missing = not os.path.exists(MODEL_SAVE_DIR)
if model_dir_missing:
    try:
        os.makedirs(MODEL_SAVE_DIR)
    except OSError as e:
        print(f"Error creating directory {MODEL_SAVE_DIR}: {e}")
    else:
        print(f"Created directory for saving models: {MODEL_SAVE_DIR}")

# --- Train and Evaluate Each Model ---
# Metric keys stored as None for a dataset whose evaluation is skipped.
skipped_metric_keys = ("loss", "accuracy", "precision", "recall", "f1_score")

for model_name, model in models_to_train.items():
    hash_rule = "#" * 60
    print("\n" + hash_rule)
    print(f"# Training Model: {model.name}")
    print(hash_rule)

    # Stop after PATIENCE epochs without val_loss improvement and roll the
    # model back to the weights of its best epoch.
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=PATIENCE, verbose=1, restore_best_weights=True
    )
    callbacks_list = [early_stopping]

    history = model.fit(
        X_train,
        Y_train,
        validation_data=(X_val, Y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks_list,
        verbose=1,
    )

    print(f"\n--- Finished Training {model.name} ---")

    # Evaluate on every dataset, recording a placeholder for empty ones.
    model_results = {}
    for dataset_name, (x_data, y_data) in datasets.items():
        if dataset_name == "Test" and (x_data is None or x_data.shape[0] == 0):
            print("\nSkipping evaluation on empty Test set.")
            skip_reason = "Test set empty"
        elif y_data is None or y_data.shape[0] == 0:
            print(f"\nSkipping evaluation on {dataset_name} set due to empty Y data.")
            skip_reason = f"{dataset_name} set Y empty"
        else:
            skip_reason = None

        if skip_reason is None:
            model_results[dataset_name] = evaluate_model_performance(model, x_data, y_data, dataset_name)
        else:
            model_results[dataset_name] = {**dict.fromkeys(skipped_metric_keys), "report": skip_reason}

    results[model_name] = model_results

# --- Report Final Metrics ---
summary_rule = "=" * 70

print("\n" + summary_rule)
print("Final Performance Summary")
print(summary_rule)

for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # A missing or None accuracy marks a dataset that was skipped.
        if metrics.get('accuracy') is None:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics.get('report', 'Reason unknown')})")
            continue
        print(f"  {dataset_name} Set:")
        print(f"    Accuracy:  {metrics['accuracy']:.4f} (on non-padded steps)")
        print(f"    Precision: {metrics['precision']:.4f}")
        print(f"    Recall:    {metrics['recall']:.4f}")
        print(f"    F1-score:  {metrics['f1_score']:.4f}")
        print(f"    Loss:      {metrics['loss']:.4f}")
    print("-" * 40)

print("\n" + summary_rule)
print("Detailed Classification Reports (on non-padded steps)")
print(summary_rule)
for model_name, model_results in results.items():
    print(f"\n--- {model_name} ---")
    for dataset_name, metrics in model_results.items():
        # Entries without a report have nothing to show.
        if 'report' not in metrics:
            continue
        if metrics['accuracy'] is not None:
            print(f"  {dataset_name} Set Report:")
            print(metrics['report'])
        else:
            print(f"  {dataset_name} Set: Not Evaluated ({metrics['report']})")
        print("-" * 20)
    print("-" * 40)



Part II - Task 3: Train and Evaluate Models on Real Data
Successfully loaded data and models from previous parts.
Training data shape: X=(932, 1231, 20), Y=(932, 1231, 3)
Validation data shape: X=(234, 1231, 20), Y=(234, 1231, 3)
Test data shape: X=(124, 1231, 20), Y=(124, 1231, 3)

############################################################
# Training Model: cnn_model
############################################################
Epoch 1/18
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m589s[0m 6s/step - accuracy: 0.0860 - loss: 0.2841 - val_accuracy: 0.0827 - val_loss: 0.2104
Epoch 2/18
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.1100 - loss: 0.1794 - val_accuracy: 0.0822 - val_loss: 0.2104
Epoch 3/18
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/step - accuracy: 0.1161 - loss: 0.1707 - val_accuracy: 0.0940 - val_loss: 0.1998
Epoch 4/18
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/st