In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Function to load sequences from a file
def load_sequences(file_path):
    """Read a FASTA-style text file into parallel lists of sequences and labels.

    A line starting with '>' opens a new record; the text after the '>' becomes
    that record's label. Every following non-blank line, up to the next header,
    is concatenated (after stripping surrounding whitespace) into the record's
    sequence.

    Blank lines are ignored, so leading, trailing or separating empty lines
    never produce an empty record. A header that is not followed by any
    sequence line yields no record. Sequence lines that appear before the
    first header are kept and labelled ``None``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(sequences, labels)`` of two equally long lists of strings,
        in file order.
    """
    sequences = []
    labels = []
    current_sequence = []
    current_label = None

    # Stream the file instead of materialising every line first.
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # An empty string would make current_sequence truthy and
                # emit a bogus empty record at the next header.
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if current_sequence:
                    sequences.append(''.join(current_sequence))
                    labels.append(current_label)
                    current_sequence = []
                current_label = line[1:]  # Remove the '>'
            else:
                current_sequence.append(line)

    # The last record has no header after it to trigger the flush above.
    if current_sequence:
        sequences.append(''.join(current_sequence))
        labels.append(current_label)

    return sequences, labels

# Input file location -- placeholder, must be set to a real path before running
file_path = r''
sequences, labels = load_sequences(file_path)

# Tabulate the parsed records (one row per sequence) and preview the first rows
data = pd.DataFrame(dict(sequence=sequences, label=labels))
print(data.head())


In [None]:
from Bio import SeqIO
import pandas as pd

# Protein names that determine the coarse (broad) class of a record
structural_proteins = ['CORE', 'E1', 'E2']
non_structural_proteins = ['NS2', 'NS3', 'NS4A', 'NS4B', 'NS5A', 'NS5B']

# Parallel accumulators: one entry per FASTA record
sequences = []
labels = []
broad_labels = []

# Walk the FASTA file record by record (the path is a placeholder to fill in)
with open(r'', 'r') as fasta_file:
    for record in SeqIO.parse(fasta_file, 'fasta'):
        sequence = str(record.seq)

        # The first whitespace-separated token of the header is used as the protein type.
        # NOTE(review): Biopython's description normally begins with the record id, so
        # this token is presumably the id itself -- confirm headers look like '>CORE ...'.
        protein_type = record.description.split()[0]

        # Coarse class: start from the fallback and override on an exact name match
        broad_label = 'Unknown'
        if protein_type in structural_proteins:
            broad_label = 'Structural'
        elif protein_type in non_structural_proteins:
            broad_label = 'Non-Structural'

        sequences.append(sequence)
        labels.append(protein_type)       # fine-grained label
        broad_labels.append(broad_label)  # coarse label

# Collect everything in one table: sequence, fine label, coarse label
df = pd.DataFrame(dict(sequence=sequences, label=labels, broad_label=broad_labels))

# Show the first rows as a sanity check
print(df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the coarse class of every row; the fitted encoder is kept
# so predictions can be mapped back to class names later
broad_label_encoder = LabelEncoder()
broad_label_encoder.fit(df['broad_label'])
df['broad_label_encoded'] = broad_label_encoder.transform(df['broad_label'])

# Same treatment for the fine-grained protein type
protein_label_encoder = LabelEncoder()
protein_label_encoder.fit(df['label'])
df['label_encoded'] = protein_label_encoder.transform(df['label'])

# Show the first few integer codes of each column as a quick check
print(f"Sample encoded labels: {df['label_encoded'].head().tolist()}")
print(f"Sample hierarchical labels: {df['broad_label_encoded'].head().tolist()}")


In [None]:
from tensorflow.keras.utils import to_categorical

# Turn the integer codes into one-hot vectors and store one vector per row.
# The 2-D result is split into a list of rows so pandas accepts it as a column.
df['label_one_hot'] = [row for row in to_categorical(df['label_encoded'])]

# Same for the coarse class codes
df['broad_label_one_hot'] = [row for row in to_categorical(df['broad_label_encoded'])]

# Preview the first rows of both new columns
print(f"Sample one-hot encoded labels: {df['label_one_hot'].head()}")
print(f"Sample one-hot encoded broad labels: {df['broad_label_one_hot'].head()}")


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Residue vocabulary: every distinct character found in the sequences gets an integer id.
# NOTE(review): the ids follow the iteration order of a set of strings, which can differ
# between interpreter sessions -- the mapping is only meaningful within one kernel run.
amino_acid_mapping = dict((aa, idx) for idx, aa in enumerate(set(''.join(sequences))))
sequences_encoded = [[amino_acid_mapping[residue] for residue in protein] for protein in sequences]

# Right-pad every id list with zeros up to the length of the longest sequence.
# NOTE(review): id 0 is also assigned to a real residue above, so after one-hot
# encoding a padded position looks identical to that residue.
max_len = max(map(len, sequences_encoded))
X = pad_sequences(sequences_encoded, padding='post', maxlen=max_len)

# One-hot encode each padded row -> array of shape (n_sequences, max_len, n_residue_types)
X = np.array([to_categorical(row, num_classes=len(amino_acid_mapping)) for row in X])

# Stack the per-row one-hot label vectors into 2-D target arrays
y = np.array(list(df['label_one_hot']))
y_broad = np.array(list(df['broad_label_one_hot']))

# Hold out 20% of the samples; inputs and both targets are split with the same shuffle
(X_train, X_test,
 y_train, y_test,
 y_broad_train, y_broad_test) = train_test_split(X, y, y_broad, random_state=42, test_size=0.2)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical

# Recomputes the residue vocabulary, the one-hot input tensor, the label arrays and
# the train/test split, then reports the shape of every resulting array.

# Residue vocabulary: one integer id per distinct character in the sequences.
# NOTE(review): ids depend on set iteration order, which can change between
# interpreter sessions.
amino_acid_mapping = dict((aa, idx) for idx, aa in enumerate(set(''.join(sequences))))
sequences_encoded = [[amino_acid_mapping[residue] for residue in protein] for protein in sequences]

# Zero-pad on the right up to the longest sequence.
# NOTE(review): 0 is also the id of one real residue, so padding is not
# distinguishable from that residue once one-hot encoded.
max_len = max(map(len, sequences_encoded))
X = pad_sequences(sequences_encoded, padding='post', maxlen=max_len)

# One-hot encode row by row -> (n_sequences, max_len, n_residue_types)
X = np.array([to_categorical(row, num_classes=len(amino_acid_mapping)) for row in X])

# Targets: stack the stored one-hot vectors into 2-D arrays
y = np.array(list(df['label_one_hot']))
y_broad = np.array(list(df['broad_label_one_hot']))

# 80/20 split applied consistently to inputs and both label arrays
(X_train, X_test,
 y_train, y_test,
 y_broad_train, y_broad_test) = train_test_split(X, y, y_broad, random_state=42, test_size=0.2)

# Shape report for inputs and both target sets
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training broad labels shape: {y_broad_train.shape}")
print(f"Testing broad labels shape: {y_broad_test.shape}")
print(f"Training detailed labels shape: {y_train.shape}")
print(f"Testing detailed labels shape: {y_test.shape}")


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

# Input: one one-hot vector per sequence position
input_seq = Input(shape=(max_len, len(amino_acid_mapping)))

# Two stacked LSTMs: the first emits its full output sequence for the second,
# the second emits only its final output as the sequence summary
x = input_seq
x = LSTM(256, return_sequences=True)(x)
x = LSTM(256)(x)

# Two softmax heads share that summary: coarse class and fine-grained protein type
broad_output = Dense(units=len(broad_label_encoder.classes_), activation='softmax', name='broad_output')(x)
detailed_output = Dense(units=len(protein_label_encoder.classes_), activation='softmax', name='detailed_output')(x)

# Assemble the single-input / two-output model
model = Model(inputs=input_seq, outputs=[broad_output, detailed_output])

# Both heads use the same loss and metric, keyed by output-layer name
model.compile(
    optimizer='adam',
    loss={head: 'categorical_crossentropy' for head in ('broad_output', 'detailed_output')},
    metrics={head: 'accuracy' for head in ('broad_output', 'detailed_output')},
)

model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll the model back to the weights of its best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit both heads at once; targets are matched to the heads by output-layer name.
# 20% of the training data is set aside as the validation set that early stopping monitors.
history = model.fit(
    X_train,
    dict(broad_output=y_broad_train, detailed_output=y_train),
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)


In [None]:
# Score the trained model on the held-out test split.
# NOTE(review): the unpacking assumes evaluate() returns
# [total loss, broad loss, detailed loss, broad accuracy, detailed accuracy]
# in exactly this order -- confirm for the installed Keras version.
(loss,
 broad_loss, detailed_loss,
 broad_acc, detailed_acc) = model.evaluate(
    X_test,
    dict(broad_output=y_broad_test, detailed_output=y_test),
)

# Report loss and accuracy separately for each head
print(f'Broad classification loss: {broad_loss}')
print(f'Detailed classification loss: {detailed_loss}')
print(f'Broad classification accuracy: {broad_acc}')
print(f'Detailed classification accuracy: {detailed_acc}')


In [None]:
from sklearn.metrics import classification_report

# Run inference once: the two-output model returns
# [broad-label probabilities, detailed-label probabilities].
y_broad_pred_prob, y_detailed_pred_prob = model.predict(X_test)

# Pass the full set of class ids explicitly. Without `labels`, the report infers
# the classes from the data and raises when the test split happens to miss a
# class, because `target_names` would then have the wrong length.
broad_class_ids = np.arange(len(broad_label_encoder.classes_))
detailed_class_ids = np.arange(len(protein_label_encoder.classes_))

# Coarse classes (Structural / Non-Structural, plus 'Unknown' when present in the data)
print("Classification report for broad labels:")
print(classification_report(
    np.argmax(y_broad_test, axis=1),       # one-hot targets -> class ids
    np.argmax(y_broad_pred_prob, axis=1),  # most probable class per sample
    labels=broad_class_ids,
    target_names=broad_label_encoder.classes_,
))

# Fine-grained protein types
print("Classification report for detailed labels:")
print(classification_report(
    np.argmax(y_test, axis=1),
    np.argmax(y_detailed_pred_prob, axis=1),
    labels=detailed_class_ids,
    target_names=protein_label_encoder.classes_,
))


In [None]:
#CROSS VALIDATION 
from sklearn.model_selection import KFold
import numpy as np

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # Using 5-fold cross-validation


def _build_fold_model():
    """Return a freshly initialised, compiled two-headed LSTM classifier.

    Same architecture as the main model: two stacked 256-unit LSTMs feeding a
    softmax head for the broad class and one for the detailed protein type.
    Building it inside a function keeps the intermediate tensors out of the
    notebook namespace, so the trained `model` from the earlier cells is not
    overwritten by the cross-validation run.
    """
    seq_input = Input(shape=(max_len, len(amino_acid_mapping)))
    hidden = LSTM(256, return_sequences=True)(seq_input)
    hidden = LSTM(256)(hidden)
    broad_head = Dense(len(broad_label_encoder.classes_), activation='softmax', name='broad_output')(hidden)
    detailed_head = Dense(len(protein_label_encoder.classes_), activation='softmax', name='detailed_output')(hidden)
    fold_model = Model(inputs=seq_input, outputs=[broad_head, detailed_head])
    fold_model.compile(optimizer='adam',
                       loss={'broad_output': 'categorical_crossentropy', 'detailed_output': 'categorical_crossentropy'},
                       metrics={'broad_output': 'accuracy', 'detailed_output': 'accuracy'})
    return fold_model


# Per-fold validation accuracies, one entry per fold
fold_broad_accuracy = []
fold_detailed_accuracy = []

# Iterate through each fold (numbered from 1 for display)
for fold_num, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"\nTraining Fold {fold_num}...")

    # Split data into training and validation sets
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    y_broad_train_fold, y_broad_val_fold = y_broad[train_index], y_broad[val_index]
    val_targets = {'broad_output': y_broad_val_fold, 'detailed_output': y_val_fold}

    # New weights for every fold so no information carries over between folds
    fold_model = _build_fold_model()

    # Train the model for the current fold; reuses the EarlyStopping callback
    # created in the training cell.
    # NOTE(review): the same fold drives early stopping and the reported score,
    # so the scores are likely somewhat optimistic.
    fold_history = fold_model.fit(
        X_train_fold,
        {'broad_output': y_broad_train_fold, 'detailed_output': y_train_fold},
        epochs=10,  # Adjust the number of epochs
        batch_size=32,  # Adjust batch size if needed
        validation_data=(X_val_fold, val_targets),
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate the model on the validation set for this fold.
    # NOTE(review): indices 3 and 4 assume evaluate() returns
    # [loss, broad loss, detailed loss, broad accuracy, detailed accuracy] -- confirm.
    scores = fold_model.evaluate(X_val_fold, val_targets, verbose=0)

    # Print and store accuracy results
    print(f"Fold {fold_num} - Broad classification accuracy: {scores[3]:.4f}")
    print(f"Fold {fold_num} - Detailed classification accuracy: {scores[4]:.4f}")
    fold_broad_accuracy.append(scores[3])  # Broad classification accuracy
    fold_detailed_accuracy.append(scores[4])  # Detailed classification accuracy

# Print average accuracy across all folds
print("\nCross-validation results:")
print(f"Average Broad Classification Accuracy: {np.mean(fold_broad_accuracy):.4f}")
print(f"Average Detailed Classification Accuracy: {np.mean(fold_detailed_accuracy):.4f}")



In [None]:
import numpy as np

def preprocess_sequence(sequence, max_seq_length, amino_acid_mapping):
    """Encode one amino-acid string as a one-hot batch containing a single sample.

    Mirrors the encoding used to build the training matrix: each residue is
    replaced by its integer id, the id vector is right-padded with 0 up to
    ``max_seq_length`` and every position is one-hot encoded over
    ``len(amino_acid_mapping)`` classes.

    Args:
        sequence: Amino-acid string to encode.
        max_seq_length: Number of positions the model expects.
        amino_acid_mapping: Dict from residue character to integer id
            (ids are expected to lie in ``0 .. len(mapping) - 1``).

    Returns:
        float32 array of shape ``(1, max_seq_length, len(amino_acid_mapping))``.

    Raises:
        ValueError: If the sequence is empty, longer than ``max_seq_length``
            or contains a character missing from ``amino_acid_mapping``.
    """
    if not sequence:
        raise ValueError("Cannot preprocess an empty sequence.")
    if len(sequence) > max_seq_length:
        raise ValueError(
            f"Sequence length {len(sequence)} exceeds the maximum supported length {max_seq_length}."
        )
    unknown_residues = sorted(set(sequence) - set(amino_acid_mapping))
    if unknown_residues:
        raise ValueError(f"Sequence contains residues not seen during training: {unknown_residues}")

    # Integer ids, right-padded with 0
    residue_ids = np.zeros(max_seq_length, dtype=int)
    residue_ids[:len(sequence)] = [amino_acid_mapping[aa] for aa in sequence]

    # Row i of the identity matrix is the one-hot vector of class i
    one_hot = np.eye(len(amino_acid_mapping), dtype='float32')[residue_ids]
    return one_hot[np.newaxis, ...]  # add the batch axis


def predict_protein_type(sequence, model, amino_acid_mapping, max_seq_length, broad_label_encoder, protein_label_encoder):
    """Predict the broad class and the detailed protein type of one sequence.

    Args:
        sequence: Amino-acid string to classify.
        model: Two-output model whose ``predict`` returns
            ``[broad probabilities, detailed probabilities]``.
        amino_acid_mapping: Dict from residue character to integer id.
        max_seq_length: Sequence length the model was built for.
        broad_label_encoder: Fitted encoder for the broad labels.
        protein_label_encoder: Fitted encoder for the detailed labels.

    Returns:
        Tuple ``(broad_label, detailed_label)`` of decoded class names.

    Raises:
        ValueError: If the sequence cannot be encoded, the encoded input has an
            unexpected shape, or a predicted index lies outside the classes
            known to the corresponding encoder.
    """
    # Preprocess the sequence
    processed_sequence = preprocess_sequence(sequence, max_seq_length, amino_acid_mapping)

    # Ensure the shape of the processed sequence matches the expected input shape of the model
    expected_shape = (1, max_seq_length, len(amino_acid_mapping))
    if processed_sequence.shape != expected_shape:
        raise ValueError(f"Expected input shape {expected_shape} but got {processed_sequence.shape}")

    # Make predictions
    broad_pred, detailed_pred = model.predict(processed_sequence)

    # Most probable class of the single sample in the batch, per head
    broad_pred_index = np.argmax(broad_pred, axis=1)[0]
    detailed_pred_index = np.argmax(detailed_pred, axis=1)[0]

    # Guard against a model whose output width does not match the encoders
    if broad_pred_index >= len(broad_label_encoder.classes_):
        raise ValueError(f"Broad prediction index {broad_pred_index} is out of bounds for encoder classes.")
    if detailed_pred_index >= len(protein_label_encoder.classes_):
        raise ValueError(f"Detailed prediction index {detailed_pred_index} is out of bounds for encoder classes.")

    # Decode class ids back to label names
    broad_pred_label = broad_label_encoder.inverse_transform([broad_pred_index])[0]
    detailed_pred_label = protein_label_encoder.inverse_transform([detailed_pred_index])[0]

    return broad_pred_label, detailed_pred_label

# The model's input length is fixed by the training data, so reuse max_len instead
# of a hand-typed number that can drift out of sync with the model's input shape.
max_seq_length = max_len
# amino_acid_mapping is reused exactly as built during preprocessing, so residues
# are encoded with the same ids the model was trained on.

# Example usage
new_sequence = ''  # TODO: paste the amino-acid sequence to classify
broad_pred_label, detailed_pred_label = predict_protein_type(
    new_sequence, model, amino_acid_mapping, max_seq_length, broad_label_encoder, protein_label_encoder
)

print("Predicted broad label:", broad_pred_label)
print("Predicted detailed label:", detailed_pred_label)
