In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE

In [None]:
def datasetLoad(posFile, negFile):
  """Load two .npy arrays and stack them along axis 0.

  posFile and negFile are handed to np.load unchanged. The rows read from
  posFile come first in the returned array, followed by those from negFile.
  """
  positive_rows = np.load(posFile)
  negative_rows = np.load(negFile)
  stacked = np.concatenate([positive_rows, negative_rows], axis=0)
  return stacked

In [None]:
# Feature matrices, one per encoding. datasetLoad stacks the positive file's
# rows above the negative file's rows.
kmer_nac_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/kmer1/A.thaliana5289_pos_kmer_nac_1.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/kmer1/A.thaliana5289_neg_kmer_nac.npy",
)
kmer_dnc_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/kmer2/A.thaliana5289_pos_kmer_dnc_2.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/kmer2/A.thaliana5289_neg_kmer_dnc.npy",
)
kmer_tnc_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/kmer3/A.thaliana5289_pos_kmer_tnc_3.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/kmer3/A.thaliana5289_neg_kmer_tnc.npy",
)
enac_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/enac/A.thaliana5289_pos_enac_5.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/enac/A.thaliana5289_neg_enac_5.npy",
)
pseknc_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/pseknc/A.thaliana5289_pos_pseknc.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/pseknc/A.thaliana5289_neg_pseknc.npy",
)
# NOTE(review): the positive file name is spelled "pspsednc" while the folder and
# the negative file use "pcpsednc" -- kept as-is; confirm against the file on disk.
pcpsednc_x = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_pos/pcpsednc/A.thaliana5289_pos_pspsednc.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana5289_neg/pcpsednc/A.thaliana5289_neg_pcpsednc.npy",
)

# Labels in the same row order as the matrices: 5289 ones, then 5289 zeros.
positive_labels = np.tile(1, 5289)
negative_labels = np.tile(0, 5289)
y_train = np.concatenate([positive_labels, negative_labels])

In [None]:
# Print the shape of each feature matrix, then of the label vector.
for arr in (kmer_nac_x, kmer_dnc_x, kmer_tnc_x, enac_x, pseknc_x, pcpsednc_x, y_train):
  print(arr.shape)

(10578, 4)
(10578, 16)
(10578, 64)
(10578, 148)
(10578, 66)
(10578, 18)
(10578,)


In [None]:
# Join the six encodings column-wise in a fixed order:
# nac, dnc, tnc, enac, pseknc, pcpsednc.
x_train = np.concatenate(
  (kmer_nac_x, kmer_dnc_x, kmer_tnc_x, enac_x, pseknc_x, pcpsednc_x),
  axis=1,
)

print(x_train.shape)

(10578, 316)


In [None]:
# Per-encoding counter of selected features; every group starts at zero.
feature_group = dict.fromkeys(
  ["nac", "dnc", "tnc", "enac", "pseknc", "pcpsednc"],
  0,
)

In [None]:
# Chi-squared ranking: keep the 32 best-scoring columns of the concatenated matrix.
# NOTE: the name X_64_best is kept for compatibility; the selector is fitted with k=32.
X_64_best = SelectKBest(chi2, k=32).fit(x_train, y_train)
mask = X_64_best.get_support()  # boolean vector, True for each kept column

# Column blocks of x_train in concatenation order. Widths are read from the
# source arrays instead of hard-coding the index ranges.
group_widths = [
  ("nac", kmer_nac_x.shape[1]),
  ("dnc", kmer_dnc_x.shape[1]),
  ("tnc", kmer_tnc_x.shape[1]),
  ("enac", enac_x.shape[1]),
  ("pseknc", pseknc_x.shape[1]),
  ("pcpsednc", pcpsednc_x.shape[1]),
]

# x_train is reused for other matrices later in the notebook; fail loudly if this
# cell is run while x_train is not the full six-encoding concatenation.
assert mask.shape[0] == sum(width for _, width in group_widths), \
  "x_train does not match the six concatenated encodings"

# Assign (rather than increment) the counts so re-running the cell does not
# accumulate on top of a previous run.
start = 0
for group_name, width in group_widths:
  feature_group[group_name] = int(mask[start:start + width].sum())
  start += width

print(feature_group)
sorted_feature_group = sorted(feature_group.items(), key=lambda item: item[1], reverse=True)
print(sorted_feature_group)

{'nac': 0, 'dnc': 4, 'tnc': 7, 'enac': 9, 'pseknc': 8, 'pcpsednc': 4}
[('enac', 9), ('pseknc', 8), ('tnc', 7), ('dnc', 4), ('pcpsednc', 4), ('nac', 0)]


In [None]:
# Part 2: rebuild x_train from the enac and pseknc encodings only (enac columns first).
x_train = np.concatenate([enac_x, pseknc_x], axis=1)
for arr in (x_train, y_train):
  print(arr.shape)

(10578, 214)
(10578,)


In [None]:
# Zeroed per-encoding counters, one dict per RFE target size.
feature_group_71 = dict(enac=0, pseknc=0)  # 1/3 of 214 features
feature_group_43 = dict(enac=0, pseknc=0)  # 1/5 of 214 features
feature_group_31 = dict(enac=0, pseknc=0)  # 1/7 of 214 features

In [None]:
# Recursive feature elimination down to 71 features, ranked by a random forest.
estimator = RandomForestClassifier(random_state=42)
# n_features_to_select must be passed by keyword: current scikit-learn rejects
# it as a positional argument.
selector = RFE(estimator, n_features_to_select=71, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean vector, True for each kept column

# x_train here is enac columns followed by pseknc columns, so split the mask at
# the enac width. Counts are assigned, not incremented, so re-running this cell
# does not accumulate on top of a previous run.
n_enac = enac_x.shape[1]
feature_group_71["enac"] = int(rfe_mask[:n_enac].sum())
feature_group_71["pseknc"] = int(rfe_mask[n_enac:].sum())

print(feature_group_71)
sorted_feature_group_71 = sorted(feature_group_71.items(), key=lambda item: item[1], reverse=True)
print(sorted_feature_group_71)

{'enac': 50, 'pseknc': 92}
[('pseknc', 92), ('enac', 50)]


In [None]:
# Recursive feature elimination down to 43 features, ranked by a random forest.
estimator = RandomForestClassifier(random_state=42)
# n_features_to_select must be passed by keyword: current scikit-learn rejects
# it as a positional argument.
selector = RFE(estimator, n_features_to_select=43, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean vector, True for each kept column

# x_train here is enac columns followed by pseknc columns, so split the mask at
# the enac width. Counts are assigned, not incremented, so re-running this cell
# does not accumulate on top of a previous run.
n_enac = enac_x.shape[1]
feature_group_43["enac"] = int(rfe_mask[:n_enac].sum())
feature_group_43["pseknc"] = int(rfe_mask[n_enac:].sum())

print(feature_group_43)
sorted_feature_group_43 = sorted(feature_group_43.items(), key=lambda item: item[1], reverse=True)
print(sorted_feature_group_43)

{'enac': 14, 'pseknc': 29}
[('pseknc', 29), ('enac', 14)]


In [None]:
# Recursive feature elimination down to 31 features, ranked by a random forest.
estimator = RandomForestClassifier(random_state=42)
# n_features_to_select must be passed by keyword: current scikit-learn rejects
# it as a positional argument.
selector = RFE(estimator, n_features_to_select=31, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean vector, True for each kept column

# x_train here is enac columns followed by pseknc columns, so split the mask at
# the enac width. Counts are assigned, not incremented, so re-running this cell
# does not accumulate on top of a previous run.
n_enac = enac_x.shape[1]
feature_group_31["enac"] = int(rfe_mask[:n_enac].sum())
feature_group_31["pseknc"] = int(rfe_mask[n_enac:].sum())

print(feature_group_31)
sorted_feature_group_31 = sorted(feature_group_31.items(), key=lambda item: item[1], reverse=True)
print(sorted_feature_group_31)

{'enac': 10, 'pseknc': 21}
[('pseknc', 21), ('enac', 10)]


In [None]:
# SVM cross-validation data: train on the pseknc encoding alone and load the
# independent pseknc set (positive rows first, then negative) for testing.
x_train = pseknc_x
x_test = datasetLoad(
  "/content/drive/My Drive/fasta dataset/A.thaliana1000indep_pos/pseknc/A.thaliana1000indep_pos_pseknc.npy",
  "/content/drive/My Drive/fasta dataset/A.thaliana1000indep_neg/pseknc/A.thaliana1000indep_neg_pseknc.npy",
)
# Test labels: 1000 ones followed by 1000 zeros.
test_positive_labels = np.tile(1, 1000)
test_negative_labels = np.tile(0, 1000)
y_test = np.concatenate([test_positive_labels, test_negative_labels])

for arr in (x_train, y_train, x_test, y_test):
  print(arr.shape)

(10578, 66)
(10578,)
(2000, 66)
(2000,)


In [None]:
# One seed shared by numpy's global RNG, the fold shuffling and the classifier.
seed = 40
np.random.seed(seed)

# 5-fold splitter; rows are shuffled before splitting, reproducibly via the seed.
kf = KFold(
  n_splits=5,
  shuffle=True,
  random_state=seed,
)

In [None]:
# Sweep the SVC regularisation strength C with 5-fold cross-validation.
# C starts at 0.001 and grows by 0.001 per step (100 steps). The loop variable
# c (1, 101, 201, ...) is only a step label and is NOT the value given to SVC.
test_avg_list = []   # mean test accuracy per C
val_avg_list = []    # mean validation accuracy per C
c_values = []        # the C actually used at each step
C = 0
for c in range(1, 10000, 100):
  C = C + 0.001
  c_values.append(C)
  # Report the value that is really passed to SVC, alongside the step label.
  print(f'for C = {C:.3f} (step label c = {c}):')

  clf = SVC(C=C, random_state=seed)
  val_accuracy_list = []
  train_accuracy_list = []
  test_accuracy_list = []

  for train_index, val_index in kf.split(x_train):

    X_train, X_val = x_train[train_index], x_train[val_index]
    Y_train, Y_val = y_train[train_index], y_train[val_index]

    # fit() retrains the estimator, so the same object is reused for every fold.
    clf.fit(X_train, Y_train)

    val_accuracy_list.append(accuracy_score(Y_val, clf.predict(X_val)))
    train_accuracy_list.append(accuracy_score(Y_train, clf.predict(X_train)))
    test_accuracy_list.append(accuracy_score(y_test, clf.predict(x_test)))

  test_avg = sum(test_accuracy_list) / len(test_accuracy_list)
  test_avg_list.append(test_avg)
  val_avg = sum(val_accuracy_list) / len(val_accuracy_list)
  val_avg_list.append(val_avg)

  for train_accuracy, val_accuracy, test_accuracy in zip(train_accuracy_list, val_accuracy_list, test_accuracy_list):
    print("train_accuracy : ", train_accuracy, "\tVal accuracy : ", val_accuracy, "\tTest accuracy : ", test_accuracy)
  print(f'Avg test accuracy for that c is = {test_avg}')
  print()
print(max(test_avg_list))

# Picking C by its test accuracy uses the held-out set for model selection, so
# also report the C chosen by mean validation accuracy and its test accuracy.
best_index = val_avg_list.index(max(val_avg_list))
print(f'best C by mean validation accuracy = {c_values[best_index]:.3f}, avg test accuracy at that C = {test_avg_list[best_index]}')

for c = 1:
train_accuracy :  0.5011817537225243 	Val accuracy :  0.4952741020793951 	Test accuracy :  1.0
train_accuracy :  0.5003545261167572 	Val accuracy :  0.4985822306238185 	Test accuracy :  1.0
train_accuracy :  0.5033089104230678 	Val accuracy :  0.4867674858223062 	Test accuracy :  1.0
train_accuracy :  0.5053763440860215 	Val accuracy :  0.47848699763593383 	Test accuracy :  0.0
train_accuracy :  0.500531726338178 	Val accuracy :  0.4978723404255319 	Test accuracy :  1.0
Avg test accuracy for that c is = 0.8

for c = 101:
train_accuracy :  0.5011817537225243 	Val accuracy :  0.4952741020793951 	Test accuracy :  1.0
train_accuracy :  0.5003545261167572 	Val accuracy :  0.4985822306238185 	Test accuracy :  1.0
train_accuracy :  0.5033089104230678 	Val accuracy :  0.4867674858223062 	Test accuracy :  1.0
train_accuracy :  0.5053763440860215 	Val accuracy :  0.47848699763593383 	Test accuracy :  0.0
train_accuracy :  0.500531726338178 	Val accuracy :  0.4978723404255319 	Test acc