In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- Features ---------------------------------------------------------------
# Read the embedding table from CSV.
embedding_df = pd.read_csv("casp12_embeddings.csv")

# Column 0 is assumed to be an identifier and is dropped; every remaining
# column is cast to float32 and used as the feature matrix.
X = (
    embedding_df
    .iloc[:, 1:]
    .values
    .astype(np.float32)
)

# --- Targets ----------------------------------------------------------------
# Read the label table and pull out the "dssp3" column as a plain Python list.
y = pd.read_csv("CASP12.csv")
y_q3 = y["dssp3"].tolist()

# Map the Q3 symbols to integer codes, then expand the codes to one-hot rows
# with one column per class.
q3_classes = ["H", "E", "C"]
q3_encoder = LabelEncoder().fit(q3_classes)
y_encoded = to_categorical(
    q3_encoder.transform(y_q3),
    num_classes=len(q3_classes),
)

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# MLP: 1024-d input -> 128 -> 64 -> 32 -> 16 (all ReLU) -> 3-way softmax (Q3).
model = Sequential(
    [Input(shape=(1024,))]
    + [Dense(units, activation='relu') for units in (128, 64, 32, 16)]
    + [Dense(3, activation='softmax')]
)

# Adam optimiser with categorical cross-entropy (targets are one-hot rows);
# accuracy is tracked as the reporting metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Train the model.
# NOTE: the test split is also passed as validation data. Nothing here selects
# weights or stops training based on it, so it only affects the per-epoch log.
model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model: turn softmax outputs and one-hot targets back into
# integer class indices.
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Print classification report.
# LabelEncoder numbers classes in sorted order (C, E, H), not in the order they
# are listed in q3_classes (H, E, C). The row names must therefore come from
# the encoder, otherwise the H and C rows of the report are swapped.
# Passing `labels` explicitly keeps the report well-formed even if one class
# happens to be absent from the test split.
q3_report_names = list(q3_encoder.classes_)
print("Q3 Classification Report:")
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=np.arange(len(q3_report_names)),
    target_names=q3_report_names,
))


Buffered data was truncated after reaching the output size limit.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Load embeddings: a JSON object whose values are per-protein sequences of
# embedding vectors (only the values are used below).
with open("/content/drive/MyDrive/PSSP/ts115_embeddings.json", "r") as f:
    data = json.load(f)

# Load labels: one Q8 label string per row in the "dssp8" column.
y = pd.read_csv("TS115.csv")
y_q8 = y["dssp8"].to_list()

# Flatten to one sample per position, keeping embeddings and labels aligned.
# NOTE(review): pairing relies on the JSON entry order matching the CSV row
# order - confirm against how the embeddings file was generated.
all_embeddings_list = []
labels = []
for protein, label_seq in zip(data.values(), y_q8):
    # Trim BOTH sides to the common length. Trimming only the embeddings leaves
    # surplus labels whenever a protein has fewer embedding rows than label
    # characters, which shifts every later (embedding, label) pair.
    n_residues = min(len(protein), len(label_seq))
    all_embeddings_list.extend(protein[:n_residues])
    labels.extend(label_seq[:n_residues])

# Convert to NumPy arrays
X = np.array(all_embeddings_list, dtype=np.float32)

# Encode Q8 labels: symbols -> integer codes -> one-hot rows.
q8_classes = ["H", "E", "G", "I", "B", "T", "S", "C"]
q8_encoder = LabelEncoder()
q8_encoder.fit(q8_classes)
y_int = q8_encoder.transform(labels)
y_encoded = to_categorical(y_int, num_classes=len(q8_classes))  # One-hot encoding

# Compute class weights for handling imbalance.
# "balanced" weights can only be computed for classes that actually occur, so
# `class_weights` is aligned with `present_classes`, not with range(8).
present_classes = np.unique(y_int)
class_weights = compute_class_weight(class_weight="balanced", classes=present_classes, y=y_int)

# Key each weight by its real class index. Indexing the weight array with
# range(len(q8_classes)) raises IndexError, or silently assigns weights to the
# wrong classes, as soon as one Q8 class is missing from the data.
# Absent classes get a neutral 1.0 so the keys stay contiguous (0..n-1), which
# some tf.keras releases require of `class_weight`.
class_weights_dict = {i: 1.0 for i in range(len(q8_classes))}
class_weights_dict.update(
    {int(cls): float(weight) for cls, weight in zip(present_classes, class_weights)}
)

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Regularised MLP for Q8:
#   three Dense(ReLU) -> BatchNorm -> Dropout(0.3) stages (128, 64, 32 units),
#   one Dense(16, ReLU) -> BatchNorm stage without dropout,
#   and an 8-way softmax output.
hidden_layers = []
for units in (128, 64, 32):
    hidden_layers += [
        Dense(units, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
    ]
hidden_layers += [Dense(16, activation='relu'), BatchNormalization()]

model = Sequential([
    Input(shape=(1024,)),
    *hidden_layers,
    Dense(8, activation='softmax'),
])

# Adam optimiser with categorical cross-entropy (targets are one-hot rows);
# accuracy is tracked as the reporting metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop training once val_loss has failed to improve for 3 consecutive epochs,
# then roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# Validation uses the last 10% of the (already shuffled) training split, not
# the test split. Early stopping and restore_best_weights select the final
# weights from the validation loss, so validating on the test set would leak
# it into model selection and inflate the reported scores.
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.1,
          class_weight=class_weights_dict, callbacks=[early_stopping])

# Evaluate the model on the untouched test split: turn softmax outputs and
# one-hot targets back into integer class indices.
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Print classification report.
# LabelEncoder numbers classes in sorted order (B, C, E, G, H, I, S, T), not in
# the order they are listed in q8_classes. The row names must therefore come
# from the encoder, otherwise most rows of the report carry the wrong label.
# Passing `labels` explicitly keeps all 8 rows and avoids a ValueError when a
# rare class is absent from the test split.
q8_report_names = list(q8_encoder.classes_)
print("Q8 Classification Report:")
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=np.arange(len(q8_report_names)),
    target_names=q8_report_names,
))


Epoch 1/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.1767 - loss: 2.2590 - val_accuracy: 0.3252 - val_loss: 1.8463
Epoch 2/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.2868 - loss: 1.8145 - val_accuracy: 0.3642 - val_loss: 1.6958
Epoch 3/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.3454 - loss: 1.7068 - val_accuracy: 0.4048 - val_loss: 1.6095
Epoch 4/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3855 - loss: 1.6160 - val_accuracy: 0.4587 - val_loss: 1.5019
Epoch 5/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.4172 - loss: 1.4747 - val_accuracy: 0.4590 - val_loss: 1.4730
Epoch 6/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.4475 - loss: 1.3911 - val_accuracy: 0.4992 - val_loss: 1.3757
Epoch 7/30
[1m372/372[0m