In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report

from sklearn.utils import resample

def count_value_frequency(df, column):
    """Return how many times each distinct value occurs in ``df[column]``.

    The result is exactly what ``Series.value_counts()`` produces for that
    column: a Series indexed by the distinct values.
    """
    return df[column].value_counts()

def balance_dataset(df, target_column):
    """Downsample every class in ``target_column`` to the rarest class's size.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    target_column : str
        Name of the column holding the class label.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of each class stacked one class after another (in
        ``value_counts`` order), keeping their original index labels. An
        empty frame is returned when the column has no countable values.
    """
    # Frequency of each class in the target column
    value_counts = df[target_column].value_counts()
    if value_counts.empty:
        return df.iloc[0:0]

    # Every class is reduced to the size of the rarest one
    min_frequency = int(value_counts.min())

    # Sample WITHOUT replacement: min_frequency never exceeds a class's size,
    # and drawing with replacement would duplicate some rows (and drop others),
    # letting identical rows end up on both sides of a later train/test split.
    sampled_parts = [
        df[df[target_column] == value].sample(
            n=min_frequency, replace=False, random_state=42
        )
        for value in value_counts.index
    ]

    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(sampled_parts)
import numpy as np

def divide_dataset_by_float(df, float_value):
    """Keep the leading ``float_value`` fraction of the rows of ``df``.

    The number of rows kept is ``len(df) * float_value`` passed through
    ``int``; the rows are taken from the top of the frame with ``head``.
    """
    row_count = len(df)
    return df.head(int(row_count * float_value))

def divide_balanced_dataset(balanced_df, target_column, float_value):
    """Keep the leading ``float_value`` fraction of every class.

    Parameters
    ----------
    balanced_df : pandas.DataFrame
        Input rows.
    target_column : str
        Name of the column holding the class label.
    float_value : float
        Fraction of each class to keep; the per-class row count is
        ``int(len(class_rows) * float_value)``.

    Returns
    -------
    pandas.DataFrame
        The first rows of each class, stacked one class after another (in
        ``value_counts`` order). An empty frame is returned when the column
        has no countable values.
    """
    # Frequency of each class in the target column
    value_counts = balanced_df[target_column].value_counts()

    kept_parts = []
    for value in value_counts.index:
        # Rows belonging to this class
        subset = balanced_df[balanced_df[target_column] == value]

        # Number of rows of this class that survive the split
        num_samples = int(len(subset) * float_value)
        kept_parts.append(subset.head(num_samples))

    # pd.concat rejects an empty list, so handle the no-class case explicitly
    if not kept_parts:
        return balanced_df.iloc[0:0]

    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(kept_parts)


In [2]:
# Read the dataset from the CSV file
df = pd.read_csv('aro_categories_index(res_manage)2.csv')

# Shuffle the rows and reset the index. The fixed seed makes the shuffle, and
# every later step that depends on row order, repeatable after a kernel restart.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# Keep proteins of at most 500 residues whose mechanism is not one of the
# excluded classes. `.str.len()` yields NaN for a missing sequence (so the row
# is dropped by the comparison) instead of raising like `.apply(len)` would.
sequence_lengths = df['Protein(Asam Amino)'].str.len()
excluded_mechanisms = ['antibiotic target alteration', 'antibiotic efflux']
filtered_df = df[
    (sequence_lengths <= 500)
    & ~df['Resistance Mechanism'].isin(excluded_mechanisms)
]

In [4]:
len(filtered_df)

237

In [5]:
filtered_df['Resistance Mechanism'].value_counts()

Resistance Mechanism
antibiotic inactivation         141
antibiotic target protection     96
Name: count, dtype: int64

In [6]:
# From here on `df` names the filtered subset; the unfiltered frame is no
# longer reachable under this name.
df = filtered_df

In [15]:
def reverse_dict(input_dict):
    """Return a new dict that maps each value of ``input_dict`` to a key.

    When several keys share the same value, the key seen last while iterating
    ``input_dict`` is the one that is kept.
    """
    flipped = {}
    for key, value in input_dict.items():
        flipped[value] = key
    return flipped

# DNA codon -> one-letter amino-acid code; '*' marks a stop codon
codon_table = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
    'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
}

# Distinct symbols produced by the table (20 amino acids plus '*').
# The list is sorted because plain set iteration order for strings varies
# between interpreter sessions (hash randomisation); this list fixes the
# one-hot channel order, so it must be identical every time the notebook or
# the saved model is used.
amino_acid = sorted(set(codon_table.values()))

In [195]:
len(amino_acid)

21

In [196]:
def translate_dna_to_protein(dna_sequence):
    """Translate a DNA string into a one-letter protein string.

    The sequence is read in frame from its first character, three bases at a
    time. Translation does not stop at stop codons; they are emitted as '*'.

    Parameters
    ----------
    dna_sequence : str
        Nucleotide sequence. Upper- and lower-case letters are both accepted.

    Returns
    -------
    str
        One symbol per complete codon: the amino-acid letter, '*' for a stop
        codon, or 'X' for a codon that is not in the table (for example one
        containing an ambiguous base). A trailing partial codon is ignored.
    """
    # DNA codon table used for the translation
    codon_table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
        'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
    }

    # The table keys are upper-case; without normalising, a lower-case
    # sequence would translate to nothing but 'X'.
    dna_sequence = dna_sequence.upper()

    # Only complete codons are translated, so stop before a trailing remainder
    usable_length = len(dna_sequence) - len(dna_sequence) % 3
    return ''.join(
        codon_table.get(dna_sequence[start:start + 3], 'X')
        for start in range(0, usable_length, 3)
    )

In [197]:
# df['Protein(Asam Amino)'] = [translate_dna_to_protein(str(df['Nucleotide(DNA)'][i])) for i in range(len(df))]

In [198]:
# df.to_csv('aro_categories_index(res_manage)2.csv',index=False)

In [24]:
df['Protein(Asam Amino)'][0]

'SNSIL*VL*ILKHSYL*QALFLFQPAHLI**LLIQITALQNLMKKQRKLKIYLTKHTLRVS*LSSKGKLNKAMVMILLVLRPSMYLLRPSKCLML*SALSTIGQPPQKYLSGTGKKGYSQNGKRT*P*ATL*KLPLFRFIKI*LVVLDLNSCLMK*SVLVMAMQISVPKSIIFGWWVL*KLLLSKRHSLLTS*LIKRFHLAKKSKMKCNPCYS*KKRMEIKYTQKVVGDGM*THK*AG*LDGLFSLKEIL*RSPLT*K*KKEYLALFEKRLLIKV*NN*VFYRVSL*PSSLFTILIW'

In [200]:
df.columns

Index(['Protein Accession', 'DNA Accession', 'AMR Gene Family', 'Drug Class',
       'Resistance Mechanism', 'Nucleotide(DNA)', 'Protein(Asam Amino)'],
      dtype='object')

In [201]:
len(df)

164

In [202]:
df['Resistance Mechanism'].value_counts()

Resistance Mechanism
antibiotic target protection    92
antibiotic inactivation         72
Name: count, dtype: int64

In [203]:
174*0.04

6.96

In [7]:
# Resample every class to the size of the rarest one
df = balance_dataset(df, 'Resistance Mechanism')
float_value = 1.0  # Fraction of each class to keep; 1.0 keeps every row (0.5 would keep half)

# Take that fraction of each class from the balanced dataset
df = divide_balanced_dataset(df, 'Resistance Mechanism', float_value)
df = df.reset_index()  # no drop=True, so the previous index is kept as an 'index' column

In [8]:
df['Resistance Mechanism'].value_counts()

Resistance Mechanism
antibiotic inactivation         96
antibiotic target protection    96
Name: count, dtype: int64

In [9]:
df.columns

Index(['index', 'Protein Accession', 'DNA Accession', 'AMR Gene Family',
       'Drug Class', 'Resistance Mechanism', 'Nucleotide(DNA)',
       'Protein(Asam Amino)'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,index,Protein Accession,DNA Accession,AMR Gene Family,Drug Class,Resistance Mechanism,Nucleotide(DNA),Protein(Asam Amino)
0,510,WP_128268272.1,NG_063877.1,OXA beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation,TCAAATTCAATCTTATAAGTCTTATGAATATTAAAGCACTCTTACT...,SNSIL*VL*ILKHSYL*QALFLFQPAHLI**LLIQITALQNLMKKQ...
1,455,AAP82228.1,AY259119.1,SHV beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation,ATGAAAAATGATGAAGGAAAAAAGAGGAATTGTGAATCAGCAAAAC...,MKNDEGKKRNCESAKRRVILICRFFTRLYRPSLKDVLWLCVIFACV...
2,89,AAB21326.1,S81599.1,APH(3'),aminoglycoside antibiotic,antibiotic inactivation,GCCGGACCGGTAGCGGGTCCGCTCGTGGTCGGCGACGCGGAGTTCG...,AGPVAGPLVVGDAEFAPRPTPPRPTSPRTRRPP*APARRASSGRPP...
3,522,AJP67510.1,KP096411.1,GES beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation,ATGCGCTTCATTCACGCACTATTACTGGCAGCGATCGCTCACTCTG...,MRFIHALLLAAIAHSAYASEKLTFKTDLEKLEREKAAQIGVAIVDP...
4,361,WP_085562403.1,NG_054693.1,OXA beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation,ATTAAGCAAGGGGACGTTATGCGTGTATTAGCCTTATCGGCTGTGT...,IKQGDVMRVLALSAVFLVASIIGMPAVAREWQENKSWNVHFTEHKS...


In [11]:
df['Resistance Mechanism'].unique()

array(['antibiotic inactivation', 'antibiotic target protection'],
      dtype=object)

In [12]:
# Separate the features (sequences) from the labels
# sequences = df['Nucleotide(DNA)']
sequences = df['Protein(Asam Amino)']
labels = df['Resistance Mechanism']

In [210]:
# Length of every protein sequence. Iterating the column's values avoids a
# per-row label lookup, which only works while the index is exactly 0..n-1.
all_len_seq = [len(seq) for seq in df['Protein(Asam Amino)']]

In [13]:
# Encode the string labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Split into training and test sets. Stratifying on the labels keeps both
# classes in the same proportion in each split, which matters with so few rows.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [16]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix

# Settings for turning the sequences into numeric arrays with one-hot encoding
# alphabet = ['A', 'T', 'G', 'C']
alphabet = amino_acid
sequence_length = max(len(seq) for seq in sequences)  # longest sequence in the whole dataset, not just the training split
num_classes = len(label_encoder.classes_)

# def one_hot_encode(sequence):
#     encoded_sequence = lil_matrix((len(sequence), sequence_length * len(alphabet)), dtype=np.uint8)
#     for i, seq in enumerate(sequence):
#         for j, base in enumerate(seq):
#             if base in alphabet:
#                 encoded_sequence[i, j * len(alphabet) + alphabet.index(base)] = 1
#     return encoded_sequence

# # Mengubah format sparse matrix
# def to_csc_format(encoded_sequence):
#     # Menggunakan tocsr() untuk mengonversi ke Compressed Sparse Column format
#     encoded_sequence_csc = encoded_sequence.tocsr()
#     # Mengurutkan indeks agar urut sesuai urutan baris
#     encoded_sequence_csc.sort_indices()
#     return encoded_sequence_csc

def one_hot_encode(sequence, symbols=None, max_length=None):
    """One-hot encode a collection of sequences into a dense 3-D array.

    Parameters
    ----------
    sequence : sized iterable of str
        The sequences to encode (one row of the output per sequence).
    symbols : sequence of str, optional
        Symbol alphabet; a symbol's position is its channel index. Defaults to
        the notebook-level ``alphabet``.
    max_length : int, optional
        Number of positions per sequence. Defaults to the notebook-level
        ``sequence_length``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(len(sequence), max_length, len(symbols))``.
        Positions past the end of a sequence, and symbols that are not in the
        alphabet, stay all-zero. Sequences longer than ``max_length`` are
        truncated instead of raising ``IndexError``.
    """
    symbols = alphabet if symbols is None else symbols
    max_length = sequence_length if max_length is None else max_length

    # Symbol -> channel lookup built once; setdefault keeps the first position
    # of a repeated symbol, matching list.index().
    channel_of = {}
    for channel, symbol in enumerate(symbols):
        channel_of.setdefault(symbol, channel)

    encoded_sequence = np.zeros((len(sequence), max_length, len(symbols)), dtype=np.uint8)
    for i, seq in enumerate(sequence):
        for j, base in enumerate(seq[:max_length]):
            channel = channel_of.get(base)
            if channel is not None:
                encoded_sequence[i, j, channel] = 1
    return encoded_sequence


In [17]:
# X_train_encoded = one_hot_encode(X_train)
# X_train_encoded_csc = to_csc_format(X_train_encoded)

# X_test_encoded = one_hot_encode(X_test)
# X_test_encoded_csc = to_csc_format(X_test_encoded)

# One-hot encode the training and test sequences
X_train_encoded = one_hot_encode(X_train)
X_test_encoded = one_hot_encode(X_test)


In [215]:
# import json

# X_train_dense = X_train_encoded.toarray().tolist()
# X_test_dense = X_test_encoded.toarray().tolist()

# # Create a dictionary to store the data and labels
# data_json = {
#     "X_train": X_train_dense,
#     "X_test": X_test_dense,
#     "y_train" : y_train,
#     "y_test" : y_test
# }

# # Convert the dictionary to JSON
# json_data = json.dumps(data_json, indent=4)

# # Save the JSON to a file
# with open('encoded_data.json', 'w') as json_file:
#     json_file.write(json_data)

In [216]:
print((sequence_length, len(alphabet)))

(300, 21)


In [217]:
sequence_length

300

In [218]:
num_classes

2

In [219]:
len(alphabet)

21

In [18]:
# Build the CNN: four Conv1D + MaxPooling1D stages followed by a dense head
n_symbols = len(alphabet)
model = Sequential()

# Only the first convolution declares the input shape
model.add(Conv1D(filters=n_symbols, kernel_size=3, activation='relu', input_shape=(sequence_length, n_symbols)))
model.add(MaxPooling1D(pool_size=2))

# The remaining three stages are identical
for stage in range(3):
    model.add(Conv1D(filters=n_symbols, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))

# Classifier head: flatten, one hidden layer, softmax over the classes
model.add(Flatten())
model.add(Dense(n_symbols, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 496, 21)           1344      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 248, 21)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 246, 21)           1344      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 123, 21)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 121, 21)           1344      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 60, 21)           0

In [20]:
# Compile the model. The sparse loss is used because the targets are the
# integer class ids produced by the LabelEncoder, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the model on the training data; the test split doubles as the
# validation set that is evaluated after every epoch.
model.fit(X_train_encoded, y_train, epochs=10, batch_size=2, validation_data=(X_test_encoded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d380657c50>

In [22]:
# Prediction helper that returns every class label together with its probability
def predict_labels(sequence):
    """Predict the class probabilities for a single protein sequence.

    Parameters
    ----------
    sequence : str
        Protein sequence in the same one-letter symbols used for training.

    Returns
    -------
    tuple
        ``(labels, probabilities)``: ``label_encoder.classes_`` and the
        model's output row for this sequence, in matching order.
    """
    # Encode exactly as the training data was encoded. Reusing one_hot_encode
    # means symbols outside the alphabet (e.g. 'X') are skipped instead of
    # raising ValueError, and truncating first keeps an over-long sequence
    # from indexing past the model's fixed input length.
    sequence_encoded = one_hot_encode([sequence[:sequence_length]])
    predictions = model.predict(sequence_encoded)
    labels = label_encoder.classes_
    probabilities = predictions[0]
    return labels, probabilities

In [25]:
# Example use of the prediction helper
sequence = "SNSIL*VL*ILKHSYL*QALFLFQPAHLI**LLIQITALQNLMKKQRKLKIYLTKHTLRVS*LSSKGKLNKAMVMILLVLRPSMYLLRPSKCLML*SALSTIGQPPQKYLSGTGKKGYSQNGKRT*P*ATL*KLPLFRFIKI*LVVLDLNSCLMK*SVLVMAMQISVPKSIIFGWWVL*KLLLSKRHSLLTS*LIKRFHLAKKSKMKCNPCYS*KKRMEIKYTQKVVGDGM*THK*AG*LDGLFSLKEIL*RSPLT*K*KKEYLALFEKRLLIKV*NN*VFYRVSL*PSSLFTILIW"
# Note: this rebinds `labels`, which earlier held the label column of df
labels, probabilities = predict_labels(sequence)
for label, probability in zip(labels, probabilities):
    print(f"{label}: {probability}")

antibiotic inactivation: 0.999998927116394
antibiotic target protection: 1.1004058251273818e-06


In [26]:
# Save the model to an .h5 file
model.save("aro_model.h5")