## Load Data

In [None]:
# Load the training inputs.
# GO-term annotations (columns used below: 'EntryID', 'term', 'aspect').
train_terms = pd.read_csv(
    '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',
    sep='\t',
)
# Precomputed embedding vector for each amino-acid sequence.
train_embeddings = np.load('/kaggle/input/t5embeds/train_embeds.npy')
# Protein IDs belonging to the embeddings (attached row by row further down).
train_id = np.load('/kaggle/input/t5embeds/train_ids.npy')

In [None]:
# Shapes of the ID array and of the embedding matrix; the first dimensions
# must agree because the IDs are attached to the embedding rows later on.
train_id.shape,train_embeddings.shape

In [None]:
# Wrap the embedding matrix in a DataFrame with one named column per dimension
# ("Column_1" ... "Column_<n>").
column_num = train_embeddings.shape[1]
embedding_column_names = [f"Column_{i}" for i in range(1, column_num + 1)]
train_df = pd.DataFrame(train_embeddings, columns=embedding_column_names)
# Attach the protein identifier that belongs to each embedding row.
train_df['ID'] = train_id

## Exploratory Analysis

In [None]:
# Display the annotation table read from train_terms.tsv.
train_terms

In [None]:
# Shape of the array of distinct values in the 'term' column,
# i.e. a 1-tuple holding the number of unique terms.
train_terms['term'].unique().shape

In [None]:
# Display the embedding frame (one Column_i per embedding dimension plus 'ID').
train_df

In [None]:
# Proteins that share exactly the same amino-acid sequence embedding.
# Only the embedding columns are compared: 'ID' is unique per protein, so
# including it would make every row look distinct.
embedding_cols = [col for col in train_df.columns if col != 'ID']

# keep=False retains every member of a duplicate set, not just the repeats.
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of train_df (SettingWithCopyWarning).
df_duplicated = train_df[train_df.duplicated(subset=embedding_cols, keep=False)].copy()

# Flag the pairs: rows with an identical embedding receive the same group number.
df_duplicated['DuplicateGroup'] = df_duplicated.groupby(embedding_cols).ngroup()

In [None]:
# Number of rows whose embedding occurs more than once.
print(len(df_duplicated))

In [None]:
# Collect all terms of each EntryID into a single row (as a list), restricted
# to the proteins that have a duplicated embedding.
duplicated_ids = df_duplicated['ID']
df_duplicated_terms = (
    train_terms[train_terms['EntryID'].isin(duplicated_ids)]
    .groupby('EntryID')['term']
    .apply(list)
    .reset_index(name='terms_collected')
)

# Look the group number up by protein ID. Only the two needed columns are used
# (no merge over the full embedding frame), and the lookup is keyed on the ID
# itself rather than on the row position of a merge result.
group_by_id = (
    df_duplicated[['ID', 'DuplicateGroup']]
    .drop_duplicates('ID')
    .set_index('ID')['DuplicateGroup']
)
df_duplicated_terms['duplicated_sequence_group'] = df_duplicated_terms['EntryID'].map(group_by_id)

In [None]:
# Display the per-protein term lists together with their duplicate-group number.
df_duplicated_terms

In [None]:
# Proteins that share the same amino-acid sequence embedding AND the same GO terms.
# Each term list is sorted before being frozen into a (hashable) tuple so that
# two proteins with the same terms compare equal regardless of the order in
# which their rows appear in train_terms.
df_duplicated_terms['terms_collected'] = df_duplicated_terms['terms_collected'].apply(
    lambda terms: tuple(sorted(terms))
)

# keep=False keeps every protein of a matching set, so the row count is the
# number of proteins with same sequence and same GO terms.
df_dp_count = df_duplicated_terms[
    df_duplicated_terms.duplicated(
        subset=['terms_collected', 'duplicated_sequence_group'], keep=False
    )
]

In [None]:
# Proteins that share both the same duplicate-sequence group and the same
# collected terms (the rows kept by the duplicated(keep=False) filter on
# ['terms_collected', 'duplicated_sequence_group']).
df_dp_count

In [None]:
# Distribution of annotations between the values of the 'aspect' column.
pie_df = train_terms['aspect'].value_counts()
palette_color = sns.color_palette('bright')
plt.pie(
    pie_df.to_numpy(),
    labels=pie_df.index.to_numpy(),
    colors=palette_color,
    autopct='%.0f%%',
)
plt.show()

In [None]:
from Bio import SeqIO

# Location of the training sequences in FASTA format.
fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'

# Parse the file once and keep an (id, sequence) pair for every record.
sequences = [
    (record.id, record.seq)
    for record in SeqIO.parse(fasta_file, "fasta")
]

In [None]:
# Number of parsed records, the first (id, sequence) pair, and that first
# sequence on its own.
len(sequences), sequences[0],sequences[0][1]

## Preparing Data

In [None]:
# Size of the label space: only this many of the most frequent terms are used.
num_of_labels = 1500
# One training example per entry of the protein-ID array.
train_size = len(train_id)

In [None]:
from keras.utils import pad_sequences

# Integer token for every residue letter, numbered from 1 in the order given
# (0 is reserved for padding below).
amino_acid_dict = {
    letter: token
    for token, letter in enumerate("ARNDCQEGHILKMFPSTWYVBZXUO", start=1)
}

# Every sequence is padded or cut to exactly this many tokens.
max_sequence_length = 400

# Translate each sequence (second element of the (id, sequence) pair) into tokens.
amnoacid_sequences = [
    [amino_acid_dict[residue] for residue in entry[1]]
    for entry in sequences
]

# Pad with zeros at the end / truncate at the end to the desired length.
amnoacid_sequences = pad_sequences(
    amnoacid_sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
    value=0,
)

In [None]:
# Shape of the padded token matrix: (number of sequences, max_sequence_length).
amnoacid_sequences.shape

In [None]:
# Take value counts in descending order and keep the first `num_of_labels`
# GO term IDs as labels.
labels = train_terms['term'].value_counts().index[:num_of_labels].tolist()

# Restrict the annotations to the selected labels only.
train_terms_updated = train_terms.loc[train_terms['term'].isin(labels)]

# Set up the progress bar.
bar = progressbar.ProgressBar(
    maxval=num_of_labels,
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()],
)

# Multi-hot label matrix: one row per training protein, one column per label.
train_labels = np.zeros((train_size, num_of_labels))
series_train_protein_ids = pd.Series(train_id)

# Group the annotations ONCE: term -> array of the unique proteins carrying it.
# This replaces a full boolean-mask scan of the annotation frame per label.
proteins_by_term = train_terms_updated.groupby('term')['EntryID'].unique()

# Iterate over the labels actually selected (there may be fewer than
# `num_of_labels` if the data contains fewer distinct terms).
for i, label in enumerate(labels):
    # Mark every protein annotated with the current label as 1, all others
    # stay 0, and store the result as the i-th column of the label matrix.
    train_labels[:, i] = series_train_protein_ids.isin(proteins_by_term[label]).astype(float)

    # Progress bar percentage increase
    bar.update(i + 1)

# Notify the end of progress bar
bar.finish()

# Convert the label matrix into a DataFrame whose columns are the GO term IDs.
labels_df = pd.DataFrame(data=train_labels, columns=labels)
print(labels_df.shape)

In [None]:
# Model inputs: every column of train_df except the protein identifier,
# as a plain NumPy array.
features_input = train_df.drop(columns='ID').to_numpy()
# Targets: the label frame as a plain NumPy array.
labels_input = labels_df.to_numpy()


In [None]:
# Display the feature matrix (train_df values without the 'ID' column).
features_input

In [None]:
# Embeddings of the test proteins, used for evaluation / inference.
test_embeddings = np.load('/kaggle/input/t5embeds/test_embeds.npy')

# Wrap them in a DataFrame using the same "Column_<i>" naming as the training frame.
column_num = test_embeddings.shape[1]
test_column_names = [f"Column_{i}" for i in range(1, column_num + 1)]
test_df = pd.DataFrame(test_embeddings, columns=test_column_names)
print(test_df.shape)

## Trying a CNN-LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense
import tensorflow as tf

# Use MirroredStrategy for multi-GPU training; the model must be created and
# compiled inside the strategy scope.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Create a sequential model
    model_CNN_LSTM = Sequential()

    # Conv1D layer for local pattern detection. Each embedding is treated as a
    # length-1024 sequence with a single channel (assumes 1024-dimensional
    # embeddings; adjust input_shape if the embedding size differs).
    model_CNN_LSTM.add(Conv1D(32, kernel_size=3, input_shape=(1024, 1), activation='relu'))
    model_CNN_LSTM.add(MaxPooling1D(pool_size=2))

    # LSTM layer for sequence modeling
    model_CNN_LSTM.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))

    # Dense layer for non-linear transformations
    model_CNN_LSTM.add(Dense(128, activation='relu'))

    # Output layer: one independent probability per GO term. The targets are
    # multi-hot (a protein can carry many terms at once), so the activation
    # must be sigmoid; softmax would force the outputs to sum to 1 across all
    # labels, which contradicts the per-label binary cross-entropy loss below.
    model_CNN_LSTM.add(Dense(num_of_labels, activation='sigmoid'))

    # Compile the model
    model_CNN_LSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define Early Stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss (requires validation data in fit)
    patience=5,           # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore weights from the epoch with the best value of the monitored quantity
)

# Assuming you have training and validation data
# model_CNN_LSTM.fit(train_data, train_labels, validation_data=(val_data, val_labels), callbacks=[early_stopping], epochs=50)


In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see and report how many there are.
physical_devices = tf.config.list_physical_devices(device_type='GPU')
num_gpus = len(physical_devices)
print("Number of available GPUs:", num_gpus)


In [None]:
# Training settings shared by both branches.
fit_options = {'epochs': 10, 'batch_size': 1024}

if num_gpus >= 2:
    # Use MirroredStrategy for multi-GPU training
    with strategy.scope():
        history_CNN = model_CNN_LSTM.fit(features_input, labels_input, **fit_options)
else:
    print("Not enough GPUs available. Training on a single GPU.")
    history_CNN = model_CNN_LSTM.fit(features_input, labels_input, **fit_options)

In [None]:
# Per-epoch training metrics as a DataFrame (one column per metric).
history_df = pd.DataFrame(history_CNN.history)

# Create the figure FIRST, then adjust its spacing. Calling
# plt.subplots_adjust before plt.figure acts on a different, implicitly
# created figure, which is then left behind empty.
fig = plt.figure(figsize=(10, 10))
fig.subplots_adjust(wspace=0.3, hspace=0.3)

# Training loss per epoch (top-left panel of a 2x2 grid).
ax_loss = fig.add_subplot(2, 2, 1)
ax_loss.plot(history_df['loss'])
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')

# Training accuracy per epoch (top-right panel).
ax_accuracy = fig.add_subplot(2, 2, 2)
ax_accuracy.plot(history_df['accuracy'])
ax_accuracy.set_xlabel('epoch')
ax_accuracy.set_ylabel('accuracy')

plt.show()

In [None]:
from tensorflow.keras.models import save_model

# Write the trained model to a file in the current working directory.
saved_model_path = "model_CNN_LSTM_ReLU_softmax.h5"
model_CNN_LSTM.save(saved_model_path)

In [None]:
from tensorflow.keras.models import load_model

# Replace the in-memory model with the one stored in the attached input dataset.
pretrained_model_path = "/kaggle/input/cafa5-test/model_CNN_LSTM_softmax.h5"
model_CNN_LSTM = load_model(pretrained_model_path)

In [None]:
import numpy as np
import pandas as pd

# Read the test-set embeddings from disk.
test_embeddings = np.load('/kaggle/input/t5embeds/test_embeds.npy')

# Build a DataFrame with one "Column_<i>" column per embedding dimension.
column_num = test_embeddings.shape[1]
test_df = pd.DataFrame(
    test_embeddings,
    columns=[f"Column_{i}" for i in range(1, column_num + 1)],
)
print(test_df.shape)

In [None]:
# Run the loaded model on the test embeddings; `predictions` holds the model
# output for every row of test_df.
predictions =  model_CNN_LSTM.predict(test_df)

In [None]:
# Force a full garbage-collection pass to release memory that is no longer
# referenced; the displayed value is what gc.collect() returns.
import gc
gc.collect()

In [None]:
# Display the prediction array returned by model_CNN_LSTM.predict.
predictions

In [None]:
# Persist the predictions to predictions.npy in the current working directory.
np.save("predictions.npy", predictions)

In [None]:
# import numpy as np
# import pandas as pd

# predictions = np.load("/kaggle/input/cafa5-test/predictions.npy")
# test_protein_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
# train_terms = pd.read_csv('/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',sep='\t')

# # Take value counts in descending order and fetch first 1500 `GO term ID` as labels
# num_of_labels = 1500
# labels = train_terms['term'].value_counts().index[:num_of_labels].tolist()

# df_submission = pd.DataFrame(columns = ['Protein Id', 'GO Term Id','Prediction'])

# l = []
# for k in list(test_protein_ids):
#     l += [ k] * predictions.shape[1]
    
# df_submission['Protein Id'] = l
# df_submission['GO Term Id'] = labels * predictions.shape[0]
# df_submission['Prediction'] = predictions.ravel()
# df_submission.to_csv("submission.tsv",header=False, index=False, sep="\t")