In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

In [2]:
# Membaca file CSV
data = pd.read_csv('toxin_dataset.csv')

# Memisahkan fitur (sequence) dan label
sequences = data['seq']
labels = data['label']

seq_length = max(len(seq) for seq in sequences)

# Mengubah label menjadi angka
label_mapping = {label: index for index, label in enumerate(labels.unique())}
labels = labels.map(label_mapping)

# Membagi data menjadi data latih dan data uji
sequences_train, sequences_test, labels_train, labels_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)


In [3]:
alphabet = 'ACDEFGHIKLMNPQRSTVWY'  # Hanya huruf-huruf asam amino yang relevan
num_features = len(alphabet)

In [4]:
def one_hot_encoding(sequence):
    encoding = []
    for char in sequence:
        vector = [0] * num_features
        if char in alphabet:
            index = alphabet.index(char)
            vector[index] = 1
        encoding.append(vector)
    return encoding

In [5]:
sequences_train = [one_hot_encoding(seq) for seq in sequences_train]
sequences_test = [one_hot_encoding(seq) for seq in sequences_test]

In [6]:
sequences_train = [seq + [[0] * num_features] * (seq_length - len(seq)) for seq in sequences_train]
sequences_test = [seq + [[0] * num_features] * (seq_length - len(seq)) for seq in sequences_test]

In [7]:
# Konversi ke array numpy
sequences_train = np.array(sequences_train)
sequences_test = np.array(sequences_test)

In [8]:
# Membangun model CNN
model = Sequential()
model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(seq_length, num_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
model.fit(sequences_train, labels_train, epochs=10, batch_size=64, validation_data=(sequences_test, labels_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23c100211d0>

In [11]:
# Evaluasi model
_, accuracy = model.evaluate(sequences_test, labels_test)
print('Accuracy:', accuracy)

# Simpan model ke file .h5
model.save('toxin.h5')
print("Model telah disimpan ke file model.h5")

Accuracy: 0.9388889074325562
Model telah disimpan ke file model.h5


In [12]:
# Tampilkan hasil perubahan label
label_mapping_reverse = {v: k for k, v in label_mapping.items()}
labels_train_original = labels_train.map(label_mapping_reverse)
labels_train_predicted = model.predict_classes(sequences_train).flatten()
labels_train_predicted_original = [label_mapping_reverse[label] for label in labels_train_predicted]

for original, predicted in zip(labels_train_original, labels_train_predicted_original):
    print(f'Original: {original}, Predicted: {predicted}')

AttributeError: 'Sequential' object has no attribute 'predict_classes'