In [None]:
#STEP 1: INSTALL BIOPYTHON, TENSORFLOW-KERAS, SCIKIT-LEARN, PANDAS (e.g. %pip install biopython tensorflow scikit-learn pandas)

In [1]:
#STEP 2: IMPORT LIBRARIES

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
#STEP 3: LOAD SEQUENCES

def load_sequences(file_path):
    """Read a FASTA-style text file and return its sequences and labels.

    A line starting with '>' is a header; everything after the '>' becomes
    the label of the record that follows. All non-blank lines up to the next
    header are concatenated into that record's sequence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(sequences, labels)`` tuple of two equally long lists, where
        ``labels[i]`` is the header text belonging to ``sequences[i]``.

    Notes:
        * Blank lines are ignored, so a leading blank line or a header that
          is followed only by blank lines no longer yields an empty record.
        * A header with no sequence lines is skipped.
        * Sequence lines that appear before the first header are returned
          with a label of ``None``.
    """
    sequences = []
    labels = []
    current_sequence = []
    current_label = None

    # Stream the file line by line instead of loading every line up front.
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line carries no residues; treating it as sequence
                # data would make an otherwise empty record look non-empty.
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far (if any).
                if current_sequence:
                    sequences.append(''.join(current_sequence))
                    labels.append(current_label)
                    current_sequence = []
                current_label = line[1:]  # Remove the '>'
            else:
                current_sequence.append(line)

    # The last record has no following header to close it.
    if current_sequence:
        sequences.append(''.join(current_sequence))
        labels.append(current_label)

    return sequences, labels

# Location of the FASTA-style file that holds the labelled sequences.
file_path = r'()' #Enter your file_path
sequences, labels = load_sequences(file_path=file_path)

# Tabulate the records: one row per sequence, with its header text as the label.
data = pd.DataFrame(dict(sequence=sequences, label=labels))
data.head()

In [None]:
#STEP 4: UNIQUE CHARACTERS

# Collect every distinct character (residue symbol) that occurs in the loaded sequences.

unique_chars = set(''.join(sequences))

# Display the characters in sorted order: the print order of a set of strings
# can differ from one kernel session to the next, which would make this
# cell's output non-reproducible.
print("Unique Characters:", sorted(unique_chars))

In [None]:
#STEP 5: MAPPING

# Build two lookup tables between characters and integer ids.
# Ids are assigned from 0 upwards following the sorted order of the characters.

int_to_char = dict(enumerate(sorted(unique_chars)))
char_to_int = {char: i for i, char in int_to_char.items()}

# Show both directions of the mapping.
print("Character to Integer Mapping:", char_to_int)
print("Integer to Character Mapping:", int_to_char)


In [5]:
#STEP 6:  ENCODE THE LABELS

# Learn the set of distinct labels, then store each row's integer class id.
label_encoder = LabelEncoder().fit(data['label'])
data['encoded_label'] = label_encoder.transform(data['label'])

# One class per distinct label seen while fitting the encoder.
num_classes = len(label_encoder.classes_)

In [6]:
#STEP 7: CREATE A TOKENIZER AND FIT IT ON THE SEQUENCES

# Character-level tokenizer: the vocabulary is built from individual characters
# of the sequences rather than from whitespace-separated words.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['sequence'])

# Encode every sequence as a list of integer ids, then pad at the end
# (padding='post') so all rows share the length of the longest sequence.
X = tokenizer.texts_to_sequences(data['sequence'])
max_seq_length = max(map(len, X))
X = pad_sequences(X, padding='post', maxlen=max_seq_length)

# One-hot encode the integer class ids for the softmax output layer.
y = to_categorical(data['encoded_label'], num_classes=num_classes)


In [7]:
#STEP 8: SPLIT THE DATASET INTO TRAINING AND TEST SETS

# Hold out 20% of the samples for the final evaluation; the fixed
# random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [8]:
#STEP 9: DEFINE MODEL PARAMETERS (MODEL ARCHITECTURE)
# Stacked-LSTM classifier over the integer-encoded, zero-padded sequences.
model = Sequential()
# input_dim is the vocabulary size + 1 because id 0 is reserved for padding
# and never appears in tokenizer.word_index.
# mask_zero=True flags the padded (id 0) timesteps so the recurrent layers
# skip them. Without the mask, the sequences are padded at the end, so the
# final LSTM state would be computed after a run of padding steps instead of
# after the last real character.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, mask_zero=True))
# return_sequences=True hands the full per-timestep output to the next LSTM.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
# The second LSTM returns only its final state, which feeds the classifier head.
model.add(LSTM(64))
# One softmax unit per class; pairs with the one-hot targets and the
# categorical cross-entropy loss below.
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback: stop once val_loss has not improved for 5 epochs
# and roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [None]:
#STEP 10: TRAIN THE MODEL

# Fit on the training split. validation_split sets aside 20% of that split
# for validation; the early-stopping callback is passed in so it can end
# training before all 30 epochs have run.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    callbacks=[early_stopping],
    batch_size=32,
    epochs=30,
    verbose=2,
)


In [None]:
#STEP 11: CHECK THE ACCURACY ON TESTING SET

# Score the trained model on the held-out split. The two values unpacked here
# are the loss and the accuracy metric the model was compiled with.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

# Report the held-out accuracy as a percentage.
print(f"Accuracy on Testing Set: {accuracy * 100:.2f}%")

In [None]:
#STEP 12: PREDICT UNKNOWN SEQUENCES

def preprocess_sequence(sequence, tokenizer, max_seq_length):
    """Turn one raw sequence string into a padded batch of integer ids.

    Args:
        sequence: The sequence text to encode.
        tokenizer: Object providing ``texts_to_sequences``; the sequence is
            passed to it wrapped in a one-element list.
        max_seq_length: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for the single encoded sequence,
        padded at the end (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences([sequence])
    return pad_sequences(token_ids, padding='post', maxlen=max_seq_length)

def predict_protein_type(sequence, model, tokenizer, max_seq_length, label_encoder):
    """Predict the label of a single sequence.

    Args:
        sequence: The sequence text to classify.
        model: Object providing ``predict``; called on the preprocessed input.
        tokenizer: Forwarded to ``preprocess_sequence``.
        max_seq_length: Forwarded to ``preprocess_sequence``.
        label_encoder: Object providing ``inverse_transform``; maps the
            winning class index back to its original label.

    Returns:
        The label whose predicted score is highest.
    """
    model_input = preprocess_sequence(sequence, tokenizer, max_seq_length)
    class_scores = model.predict(model_input)
    # Index of the largest score in the model output.
    best_class = np.argmax(class_scores)
    return label_encoder.inverse_transform([best_class])[0]

# Example: classify a single sequence with the trained model.
# Replace the empty string below with the sequence to predict.
new_sequence = ''
predicted_type = predict_protein_type(
    sequence=new_sequence,
    model=model,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    label_encoder=label_encoder,
)
print("Predicted protein type:", predicted_type)


In [None]:
#STEP 13: INTERACTIVE INTERFACE FOR PREDICTING TYPE OF PROTEIN SEQUENCE

def interactive_prediction(model, tokenizer, max_seq_length, label_encoder):
    """Prompt for sequences on stdin and print a prediction for each one.

    The loop asks whether to predict another sequence. On "yes" (or "y") it
    reads a sequence and prints the label returned by
    ``predict_protein_type``; on "no" (or "n") it prints a goodbye message
    and returns. Any other answer prints a hint and asks again. Answers are
    stripped and compared case-insensitively.

    Args:
        model: Forwarded to ``predict_protein_type``.
        tokenizer: Forwarded to ``predict_protein_type``.
        max_seq_length: Forwarded to ``predict_protein_type``.
        label_encoder: Forwarded to ``predict_protein_type``.

    Returns:
        None.
    """
    while True:
        user_input = input("Do you want to predict a protein sequence? (yes/no): ").strip().lower()
        if user_input in ('yes', 'y'):
            new_sequence = input("Please enter the protein sequence: ").strip()
            if not new_sequence:
                # An empty string has nothing to encode, so a prediction for
                # it would be meaningless; go back to the first prompt instead.
                print("No sequence entered. Please try again.")
                continue
            predicted_type = predict_protein_type(new_sequence, model, tokenizer, max_seq_length, label_encoder)
            print("Predicted protein type:", predicted_type)
        elif user_input in ('no', 'n'):
            print("Thank you for using the service!")
            break
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")

# Start the prompt loop using the trained model and the fitted preprocessing objects.
interactive_prediction(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    label_encoder=label_encoder,
)
