In [17]:
# Mount Google Drive inside the Colab runtime; later cells read files from under
# this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import MaxPool2D
from keras.models import Sequential
from keras.optimizers import adam_v2
from keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the training, development and test splits (in that order) from Drive.
train_set, dev_set, test_set = map(pd.read_csv, (
    '/content/drive/MyDrive/Bio/training_set.csv',
    '/content/drive/MyDrive/Bio/development_set.csv',
    '/content/drive/MyDrive/Bio/test_set.csv',
))

In [4]:
# K-mers: enumerate every length-k string over the DNA alphabet and give each
# one a column index.
k = 7
dnaLetters = ['A', 'T', 'G', 'C']
allSubString = []

def generateString(dnaLetters, prefix, k):
  """Append every string `prefix + s` to the global list `allSubString`,
  where `s` ranges over all length-`k` strings built from `dnaLetters`.

  Strings are appended in the order produced by extending the prefix with the
  letters of `dnaLetters` from first to last, depth first.

  Args:
    dnaLetters: iterable of single-letter strings forming the alphabet.
    prefix: string built so far (pass "" on the initial call).
    k: number of letters still to append; must be >= 0.

  Raises:
    ValueError: if `k` is negative (the recursion would never reach k == 0).
  """
  if k < 0:
    raise ValueError("k must be non-negative, got %d" % k)
  if k == 0:
    allSubString.append(prefix)
    return

  for letter in dnaLetters:
    generateString(dnaLetters, prefix + letter, k - 1)

generateString(dnaLetters, "", k)
# Map each k-mer to its position in allSubString.
subDict = {kmer: index for index, kmer in enumerate(allSubString)}

In [5]:
def generate_X(data_set):
  """Build the k-mer count matrix for a collection of sequences.

  Uses the module-level `k`, `allSubString` and `subDict`.

  Args:
    data_set: sized iterable of sequence strings (e.g. a pandas Series or a
      list). Elements are visited in iteration order, so a Series may carry
      any index.

  Returns:
    float32 numpy array of shape (len(data_set), len(allSubString)). Entry
    [i, j] is the number of length-k windows of the i-th sequence whose
    subDict index is j. Sequences shorter than k give an all-zero row.

  Raises:
    KeyError: if a length-k window of a sequence is not a key of `subDict`.
  """
  # Allocate as float32 directly instead of building float64 and copying.
  X_input = np.zeros((len(data_set), len(allSubString)), dtype='float32')
  # Iterate positionally: data_set[i] on a Series is a label lookup, which
  # fails (or picks the wrong row) when the index is not 0..n-1.
  for i, seq in enumerate(data_set):
    counts = X_input[i]  # row view; updates write through to X_input
    for start in range(len(seq) - k + 1):
      counts[subDict[seq[start:start + k]]] += 1

  return X_input

In [6]:
def generate_Y(data_set, num_classes=6):
  """One-hot encode class labels.

  Args:
    data_set: pandas Series of label strings; the character at index 5 of
      each string is read as the 1-based class number.
    num_classes: width of the one-hot encoding. Defaults to 6, the size of
      the models' softmax output layer. Fixing the width keeps every split
      the same shape even when a split does not contain the highest class;
      otherwise the width would be inferred from the largest label present.

  Returns:
    Array of shape (len(data_set), num_classes) as produced by
    `to_categorical`, with a 1 in column (class number - 1) of each row.
  """
  class_ids = data_set.apply(lambda label: int(label[5]) - 1).to_numpy()
  return to_categorical(class_ids, num_classes=num_classes)

In [23]:
# Training split: k-mer count features plus one-hot labels.
X_train, Y_train = generate_X(train_set['Sequence']), generate_Y(train_set['Type'])

# Development split: encoded the same way as the training split.
X_dev, Y_dev = generate_X(dev_set['Sequence']), generate_Y(dev_set['Type'])

# Test split: features only.
X_test = generate_X(test_set['Sequence'])

In [24]:
# Fully connected classifier: one input per k-mer -> 80 -> 50 -> 6-way softmax.
# Dropout(0.05) after each hidden layer is currently disabled.
MLP_model = Sequential([
    Dense(80, input_dim=len(allSubString), activation='relu'),
    Dense(50, activation='relu'),
    Dense(6, activation='softmax'),
])

In [25]:
# Train the MLP with Adam on categorical cross-entropy, tracking accuracy.
# validation_split=0.2 has Keras hold out 20% of the training arrays for
# validation during fitting.
MLP_model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = MLP_model.fit(
    X_train,
    Y_train,
    batch_size=50,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
# Report the architecture, then score the MLP on the development split.
MLP_model.summary()
print("\n")
test = MLP_model.evaluate(X_dev, Y_dev, verbose=1)

separator = "\n------------------------------------------------------------------"

# Per-class probabilities for every development sample.
print(separator)
dev_probabilities = MLP_model.predict(X_dev)
print(dev_probabilities)

# Predicted class for every test sample, shifted back to 1-based numbering.
print(separator)
test_probabilities = MLP_model.predict(X_test)
result = test_probabilities.argmax(axis=-1) + 1
print(result)


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 80)                1310800   
                                                                 
 dense_10 (Dense)            (None, 50)                4050      
                                                                 
 dense_11 (Dense)            (None, 6)                 306       
                                                                 
Total params: 1,315,156
Trainable params: 1,315,156
Non-trainable params: 0
_________________________________________________________________



------------------------------------------------------------------
[[5.4295820e-06 1.0355825e-09 1.1050500e-12 1.2913298e-06 9.9999166e-01
  1.6818009e-06]
 [6.4401995e-09 1.0000000e+00 1.3589282e-18 7.4523461e-24 8.4879861e-17
  9.8136676e-21]
 [5.1949609e-09 1.0000000e+00 1.1868981e-18 6.1522008e-24 5.

In [15]:
# CNN classifier: three stages of (Conv2D x2 -> MaxPool2D -> Dropout) with 32,
# 64 and 128 filters, then Flatten -> Dense(128) -> Dropout -> 6-way softmax.
# NOTE(review): Keras input_shape excludes the batch axis, so (50, 1320, 4**7)
# declares 50 x 1320 inputs with 4**7 channels, and 50 equals the batch_size
# passed to CNN_model.fit. Confirm this is the intended per-sample shape.
cnn_layers = []
for stage, filters in enumerate((32, 64, 128)):
  # Only the very first layer of the model carries the input shape.
  first_layer_kwargs = {'input_shape': (50, 1320, 4**7)} if stage == 0 else {}
  cnn_layers += [
      Conv2D(filters, (3, 3), padding='same', kernel_initializer='he_uniform', activation='relu', **first_layer_kwargs),
      Conv2D(filters, (3, 3), padding='same', kernel_initializer='he_uniform', activation='relu'),
      MaxPool2D((2, 2)),
      Dropout(0.15),
  ]

cnn_layers += [
    Flatten(),
    Dense(128, kernel_initializer='he_uniform', activation='relu'),
    Dropout(0.15),
    Dense(6, kernel_initializer='he_uniform', activation='softmax'),
]

CNN_model = Sequential(cnn_layers)

In [None]:
# Train the CNN with Adam on categorical cross-entropy, tracking accuracy.
# NOTE(review): train_data is not defined in the visible cells; confirm it is
# created before this cell runs. This also rebinds `history`.
CNN_model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = CNN_model.fit(
    train_data,
    Y_train,
    batch_size=50,
    epochs=20,
)