# Prepare the data

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import numpy as np

# Load the train/test splits. Both CSVs are expected to provide the columns
# 'seq', 'sst3', 'sst8' and 'len', which are the ones read below.
train_data = pd.read_csv("less_than_400_train_data.csv")
test_data = pd.read_csv("less_than_400_test_data.csv")

# Longest sequence over both splits, taken from the precomputed 'len' column.
# Series.max() is vectorised and skips NaN (the builtin max() iterates the
# Series element by element and would propagate a NaN); int() turns the numpy
# scalar into a plain Python int before it is handed to Keras.
max_seq_length = int(max(train_data['len'].max(), test_data['len'].max()))

# One character-level tokenizer per column: every character is its own token.
seq_tokenizer = Tokenizer(char_level=True)
sst3_tokenizer = Tokenizer(char_level=True)
sst8_tokenizer = Tokenizer(char_level=True)

# Vocabularies are learned from the training split only.
seq_tokenizer.fit_on_texts(train_data['seq'])
sst3_tokenizer.fit_on_texts(train_data['sst3'])
sst8_tokenizer.fit_on_texts(train_data['sst8'])

# NOTE(review): no oov_token is configured, so a character that never occurs
# in the training split is dropped by texts_to_sequences(). If that can happen
# in the test split, inputs and labels would shift out of alignment -- confirm.
encoded_seqs_train = seq_tokenizer.texts_to_sequences(train_data['seq'])
encoded_sst3_train = sst3_tokenizer.texts_to_sequences(train_data['sst3'])
encoded_sst8_train = sst8_tokenizer.texts_to_sequences(train_data['sst8'])

encoded_seqs_test = seq_tokenizer.texts_to_sequences(test_data['seq'])
encoded_sst3_test = sst3_tokenizer.texts_to_sequences(test_data['sst3'])
encoded_sst8_test = sst8_tokenizer.texts_to_sequences(test_data['sst8'])


def _pad_to_max_length(encoded):
    """Right-pad (and right-truncate) integer sequences to ``max_seq_length``."""
    return pad_sequences(
        encoded,
        maxlen=max_seq_length,
        padding='post',
        truncating='post',
    )


padded_seqs_train = _pad_to_max_length(encoded_seqs_train)
padded_sst3_train = _pad_to_max_length(encoded_sst3_train)
padded_sst8_train = _pad_to_max_length(encoded_sst8_train)

padded_seqs_test = _pad_to_max_length(encoded_seqs_test)
padded_sst3_test = _pad_to_max_length(encoded_sst3_test)
padded_sst8_test = _pad_to_max_length(encoded_sst8_test)

# +1 because tokenizer indices start at 1; index 0 is never assigned to a
# character and is the value pad_sequences() fills with by default.
vocab_size = len(seq_tokenizer.word_index) + 1
output_dim_sst3 = len(sst3_tokenizer.word_index) + 1
output_dim_sst8 = len(sst8_tokenizer.word_index) + 1

In [None]:
import pickle

# Persist the fitted tokenizers so the exact same character -> index mapping
# can be reloaded later. Each tokenizer is written to its own pickle file.
tokenizer_files = (
    ('amino_acid_tokenizer.pickle', seq_tokenizer),
    ('sst3_tokenizer.pickle', sst3_tokenizer),
    ('sst8_tokenizer.pickle', sst8_tokenizer),
)

for pickle_path, fitted_tokenizer in tokenizer_files:
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(fitted_tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Models

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, GRU, Dropout, Conv1D, GlobalMaxPooling1D

def create_model(output_dim, *, embedding_dim=256, hidden_units=128, dropout_rate=0.5):
    """Build the (uncompiled) per-position classifier.

    The network is Embedding -> Dropout -> Dense(relu) -> Dropout ->
    TimeDistributed(Dense(softmax)). With the recurrent layer disabled, every
    layer acts on each sequence position independently, so the prediction for
    a position depends only on the token at that position.

    The input vocabulary size and sequence length are read from the
    module-level ``vocab_size`` and ``max_seq_length``.

    Args:
        output_dim: Number of output classes per position (label vocabulary
            size including the padding index 0).
        embedding_dim: Size of the token embedding vectors.
        hidden_units: Width of the intermediate ReLU layer.
        dropout_rate: Drop probability used by both Dropout layers.

    Returns:
        A ``Sequential`` model; the caller is responsible for ``compile()``.
    """
    return Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length),
        # Recurrent context layer, currently disabled for the dense-only variant:
        # Bidirectional(LSTM(512, return_sequences=True)),
        Dropout(dropout_rate),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout_rate),
        TimeDistributed(Dense(output_dim, activation='softmax')),
    ])

# NOTE: model_sst3 / model_sst8 are constructed right before compile() further
# down in this cell. Building them here as well only allocated two extra
# networks that were rebound and discarded before ever being used.

def masked_accuracy(y_true, y_pred):
    """Accuracy over non-padding positions only.

    Positions whose true label is 0 (the padding index) are excluded from
    both the numerator and the denominator.

    Args:
        y_true: Integer class labels; flattened internally, so any shape with
            one label per position works.
        y_pred: Per-class scores with the classes on the last axis.

    Returns:
        Scalar tensor: correctly predicted non-padding positions divided by
        the number of non-padding positions, or 0 when the batch contains no
        non-padding position.
    """
    y_true = K.cast(K.flatten(y_true), 'int16')
    y_pred = K.cast(K.flatten(K.argmax(y_pred, axis=-1)), 'int16')

    # Count in float32: float16 cannot represent every integer above 2048 and
    # overflows at 65504, while one batch can hold tens of thousands of
    # positions, so half-precision sums would give a distorted ratio.
    mask = K.cast(K.not_equal(y_true, 0), 'float32')
    correct_predictions = K.cast(K.equal(y_true, y_pred), 'float32') * mask

    # Clamp the denominator so an all-padding batch yields 0 instead of NaN.
    # Counts are whole numbers, so any non-empty batch is unaffected.
    valid_positions = K.maximum(K.sum(mask), 1.0)
    return K.sum(correct_predictions) / valid_positions

# Labels are given a trailing axis of size 1 before being passed to
# fit()/evaluate(); each array is expanded once and reused.
sst3_labels_train = np.expand_dims(padded_sst3_train, -1)
sst3_labels_test = np.expand_dims(padded_sst3_test, -1)
sst8_labels_train = np.expand_dims(padded_sst8_train, -1)
sst8_labels_test = np.expand_dims(padded_sst8_test, -1)

# One independent network per label alphabet (3-state and 8-state).
model_sst3 = create_model(output_dim_sst3)
model_sst8 = create_model(output_dim_sst8)

# NOTE(review): only the metric ignores padding. The loss is computed over
# every position, padded ones (label 0) included -- confirm this is intended.
for network in (model_sst3, model_sst8):
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=[masked_accuracy],
    )

# The test split doubles as the validation set during training.
history_sst3 = model_sst3.fit(
    padded_seqs_train,
    sst3_labels_train,
    validation_data=(padded_seqs_test, sst3_labels_test),
    epochs=10,
    batch_size=128,
)
history_sst8 = model_sst8.fit(
    padded_seqs_train,
    sst8_labels_train,
    validation_data=(padded_seqs_test, sst8_labels_test),
    epochs=10,
    batch_size=128,
)

# evaluate() returns the loss followed by the single compiled metric.
sst3_loss, sst3_acc = model_sst3.evaluate(padded_seqs_test, sst3_labels_test)
sst8_loss, sst8_acc = model_sst8.evaluate(padded_seqs_test, sst8_labels_test)

print(f'SST3 Model Accuracy: {sst3_acc}')
print(f'SST8 Model Accuracy: {sst8_acc}')

In [None]:
import datetime

# Month/day/hour stamp (no year, no minutes) shared by the model files and
# the plot files written in later cells.
run_started = datetime.datetime.now()
current_date_hour = run_started.strftime("%m%d%H")

# NOTE(review): the '64es' tag in the file names is not derived from any
# setting in this notebook; if it is meant to be the embedding size, it does
# not match output_dim=256 in create_model -- confirm.
sst3_model_path = f'sst3_model_{current_date_hour}_64es_dense.h5'
sst8_model_path = f'sst8_model_{current_date_hour}_64es_dense.h5'

# The models were compiled with the custom masked_accuracy metric, so loading
# them with compilation enabled needs that function supplied again.
model_sst3.save(sst3_model_path)
model_sst8.save(sst8_model_path)

# Plots

In [None]:
import matplotlib.pyplot as plt

# Q3 loss curves: training vs. test (the test split was used as validation data).
# An explicit figure is created instead of drawing on pyplot's implicit
# "current figure": under a non-interactive backend plt.show() does not clear
# that figure, so curves left over from another plot would end up in this PNG.
fig, ax = plt.subplots()
ax.plot(history_sst3.history['loss'], label='Training Q3 Loss')
ax.plot(history_sst3.history['val_loss'], label='Test Q3 Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Q3 Loss')
ax.set_title('Training and Test Q3 Loss')
ax.legend()

# Save before show(): some backends discard the figure once it has been shown.
fig.savefig(f'sst3_training_validation_loss{current_date_hour}_dense.png')

plt.show()
# Release the figure so it cannot be reused and does not stay in memory.
plt.close(fig)

In [None]:
import matplotlib.pyplot as plt

# Q3 masked-accuracy curves: training vs. test (the test split was used as
# validation data). The history keys follow the metric function's name.
# An explicit figure is created instead of drawing on pyplot's implicit
# "current figure": under a non-interactive backend plt.show() does not clear
# that figure, so curves left over from another plot would end up in this PNG.
fig, ax = plt.subplots()
ax.plot(history_sst3.history['masked_accuracy'], label='Training Q3 Accuracy')
ax.plot(history_sst3.history['val_masked_accuracy'], label='Test Q3 Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Q3 Accuracy')
ax.set_title('Training and Test Q3 Accuracy')
ax.legend()

# Save before show(): some backends discard the figure once it has been shown.
fig.savefig(f'sst3_training_validation_accuracy{current_date_hour}_dense.png')

plt.show()
# Release the figure so it cannot be reused and does not stay in memory.
plt.close(fig)

In [None]:
# Q8 loss curves: training vs. test (the test split was used as validation data).
# An explicit figure is created instead of drawing on pyplot's implicit
# "current figure": under a non-interactive backend plt.show() does not clear
# that figure, so curves left over from another plot would end up in this PNG.
fig, ax = plt.subplots()
ax.plot(history_sst8.history['loss'], label='Training Q8 Loss')
ax.plot(history_sst8.history['val_loss'], label='Test Q8 Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Q8 Loss')
ax.set_title('Training and Test Q8 Loss')
ax.legend()

# Save before show(): some backends discard the figure once it has been shown.
fig.savefig(f'sst8_training_validation_loss{current_date_hour}_dense.png')

plt.show()
# Release the figure so it cannot be reused and does not stay in memory.
plt.close(fig)

In [None]:
# Q8 masked-accuracy curves: training vs. test (the test split was used as
# validation data). The history keys follow the metric function's name.
# An explicit figure is created instead of drawing on pyplot's implicit
# "current figure": under a non-interactive backend plt.show() does not clear
# that figure, so curves left over from another plot would end up in this PNG.
fig, ax = plt.subplots()
ax.plot(history_sst8.history['masked_accuracy'], label='Training Q8 Accuracy')
ax.plot(history_sst8.history['val_masked_accuracy'], label='Test Q8 Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Q8 Accuracy')
ax.set_title('Training and Test Q8 Accuracy')
ax.legend()

# Save before show(): some backends discard the figure once it has been shown.
fig.savefig(f'sst8_training_validation_accuracy{current_date_hour}_dense.png')

plt.show()
# Release the figure so it cannot be reused and does not stay in memory.
plt.close(fig)