# **SVM Model**

In [None]:
import pandas as pd
import numpy as np
import joblib
import gc
from google.colab import drive
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive at /content/drive; every dataset and model path used
# below lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
def clear_variable(var_list):
    """Remove named variables from the global namespace, then garbage-collect.

    Args:
        var_list: A single variable name (``str``) or an iterable of variable
            names. Names that are not currently defined are silently skipped.
    """
    # A bare string is itself iterable: without this guard 'read_data' would be
    # walked character by character, leaving `read_data` in place while popping
    # unrelated one-letter globals ('r', 'e', 'a', ...).
    if isinstance(var_list, str):
        var_list = [var_list]
    for var in var_list:
        globals().pop(var, None)  # Remove from global scope
    gc.collect() # Run garbage collection

In [None]:
# Twenty subset file names, indexed 0-19.
# NOTE(review): the loading cells only use len(file_list) and build their own
# paths with i+1 (Shuffled_Subset1..20) — confirm which numbering is intended.
file_list = [f'Shuffled_Subset{i}.h5' for i in range(20)]

In [None]:
# Restore the four previously saved PCA objects (one per component count)
# from Google Drive, in the order 32, 64, 128, 256.
# NOTE: joblib.load unpickles its input — only load files you trust.
loaded_pca_32, loaded_pca_64, loaded_pca_128, loaded_pca_256 = (
    joblib.load(pkl_path)
    for pkl_path in (
        '/content/drive/MyDrive/ML_Models/Inc_PCA_n32_Covid.pkl',
        '/content/drive/MyDrive/ML_Models/Inc_PCA_n64_Covid.pkl',
        '/content/drive/MyDrive/ML_Models/Inc_PCA_n128_Covid.pkl',
        '/content/drive/MyDrive/ML_Models/Inc_PCA_n256_Covid.pkl',
    )
)

## **Gridsearch**

### **PCA 32**

In [None]:
# Build the train/test split: from every subset file the first 800 rows go to
# the training set and the remaining rows to the test set.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

for i in range(len(file_list)):
  # Files are addressed as Shuffled_Subset1..N (i+1), not by the names in file_list.
  file_path = '/content/drive/MyDrive/ML_DL_Datasets/DNA_Datasets/Shuffled_Datasets/Covid_Shuffled_Balanced/Shuffled_Subset{file_counter}.h5'.format(file_counter = i+1)
  read_data = pd.read_hdf(file_path) # Read the current dataset

  data_reshaped = np.array(read_data.drop('Class', axis=1))  # features only
  data_labels = read_data['Class']
  # clear_variable iterates over its argument, so names are passed in a list;
  # a bare string would be walked character by character and free nothing.
  clear_variable(['read_data'])

  X_train_list.append(data_reshaped[:800])
  X_test_list.append(data_reshaped[800:])
  clear_variable(['data_reshaped'])

  y_train_list.append(data_labels[:800])
  y_test_list.append(data_labels[800:])
  clear_variable(['data_labels'])

# Stack the per-file pieces and drop each temporary list right after use.
X_train = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])

X_test = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

y_train = np.concatenate(y_train_list, axis=0)
clear_variable(['y_train_list'])

y_test = np.concatenate(y_test_list, axis=0)
clear_variable(['y_test_list'])

In [None]:
# Project train and test features with loaded_pca_32, 1000 rows at a time.
X_train_list = []
X_test_list = []
batch_size = 1000

for start in range(0, len(X_train), batch_size):
  X_train_batch = X_train[start : start + batch_size]
  X_train_list.append(loaded_pca_32.transform(X_train_batch)) # Transform the X_train with PCA

X_train_pca = np.concatenate(X_train_list, axis=0)
# X_train_batch is a slice of X_train; while it stays bound the full array
# cannot be released, so it is cleared together with X_train.
clear_variable(['X_train_list', 'X_train_batch', 'X_train'])

for start in range(0, len(X_test), batch_size):
  X_test_batch = X_test[start : start + batch_size]
  X_test_list.append(loaded_pca_32.transform(X_test_batch)) # Transform the X_test with PCA

X_test_pca = np.concatenate(X_test_list, axis=0)
# Same reasoning as above: the last batch slice would otherwise pin X_test.
clear_variable(['X_test_list', 'X_test_batch', 'X_test'])

In [None]:
# Candidate hyper-parameters: every (C, gamma) combination is evaluated.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000, 7500, 10000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003],
)

# 5-fold cross-validated search over a default SVC, with progress logging on.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, verbose=3)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the model selected by the grid search on the held-out test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Turn the classification report dict into a table (one row per report entry),
# rounded to 4 decimals, and print it in full.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())

### **PCA 64**

In [None]:
# Build the train/test split: from every subset file the first 800 rows go to
# the training set and the remaining rows to the test set.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

for i in range(len(file_list)):
  # Files are addressed as Shuffled_Subset1..N (i+1), not by the names in file_list.
  file_path = '/content/drive/MyDrive/ML_DL_Datasets/DNA_Datasets/Shuffled_Datasets/Covid_Shuffled_Balanced/Shuffled_Subset{file_counter}.h5'.format(file_counter = i+1)
  read_data = pd.read_hdf(file_path) # Read the current dataset

  data_reshaped = np.array(read_data.drop('Class', axis=1))  # features only
  data_labels = read_data['Class']
  # clear_variable iterates over its argument, so names are passed in a list;
  # a bare string would be walked character by character and free nothing.
  clear_variable(['read_data'])

  X_train_list.append(data_reshaped[:800])
  X_test_list.append(data_reshaped[800:])
  clear_variable(['data_reshaped'])

  y_train_list.append(data_labels[:800])
  y_test_list.append(data_labels[800:])
  clear_variable(['data_labels'])

# Stack the per-file pieces and drop each temporary list right after use.
X_train = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])

X_test = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

y_train = np.concatenate(y_train_list, axis=0)
clear_variable(['y_train_list'])

y_test = np.concatenate(y_test_list, axis=0)
clear_variable(['y_test_list'])

In [None]:
# Project train and test features with loaded_pca_64, 1000 rows at a time.
X_train_list = []
X_test_list = []
batch_size = 1000

for start in range(0, len(X_train), batch_size):
  X_train_batch = X_train[start : start + batch_size]
  X_train_list.append(loaded_pca_64.transform(X_train_batch)) # Transform the X_train with PCA

X_train_pca = np.concatenate(X_train_list, axis=0)
# X_train_batch is a slice of X_train; while it stays bound the full array
# cannot be released, so it is cleared together with X_train.
clear_variable(['X_train_list', 'X_train_batch', 'X_train'])

for start in range(0, len(X_test), batch_size):
  X_test_batch = X_test[start : start + batch_size]
  X_test_list.append(loaded_pca_64.transform(X_test_batch)) # Transform the X_test with PCA

X_test_pca = np.concatenate(X_test_list, axis=0)
# Same reasoning as above: the last batch slice would otherwise pin X_test.
clear_variable(['X_test_list', 'X_test_batch', 'X_test'])

In [None]:
# Candidate hyper-parameters: every (C, gamma) combination is evaluated.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000, 7500, 10000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003],
)

# 5-fold cross-validated search over a default SVC, with progress logging on.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, verbose=3)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the model selected by the grid search on the held-out test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Turn the classification report dict into a table (one row per report entry),
# rounded to 4 decimals, and print it in full.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())

### **PCA 128**

In [None]:
# Build the train/test split: from every subset file the first 800 rows go to
# the training set and the remaining rows to the test set.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

for i in range(len(file_list)):
  # Files are addressed as Shuffled_Subset1..N (i+1), not by the names in file_list.
  file_path = '/content/drive/MyDrive/ML_DL_Datasets/DNA_Datasets/Shuffled_Datasets/Covid_Shuffled_Balanced/Shuffled_Subset{file_counter}.h5'.format(file_counter = i+1)
  read_data = pd.read_hdf(file_path) # Read the current dataset

  data_reshaped = np.array(read_data.drop('Class', axis=1))  # features only
  data_labels = read_data['Class']
  # clear_variable iterates over its argument, so names are passed in a list;
  # a bare string would be walked character by character and free nothing.
  clear_variable(['read_data'])

  X_train_list.append(data_reshaped[:800])
  X_test_list.append(data_reshaped[800:])
  clear_variable(['data_reshaped'])

  y_train_list.append(data_labels[:800])
  y_test_list.append(data_labels[800:])
  clear_variable(['data_labels'])

# Stack the per-file pieces and drop each temporary list right after use.
X_train = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])

X_test = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

y_train = np.concatenate(y_train_list, axis=0)
clear_variable(['y_train_list'])

y_test = np.concatenate(y_test_list, axis=0)
clear_variable(['y_test_list'])

In [None]:
# Project train and test features with loaded_pca_128, 1000 rows at a time.
X_train_list = []
X_test_list = []
batch_size = 1000

for start in range(0, len(X_train), batch_size):
  X_train_batch = X_train[start : start + batch_size]
  X_train_list.append(loaded_pca_128.transform(X_train_batch)) # Transform the X_train with PCA

X_train_pca = np.concatenate(X_train_list, axis=0)
# X_train_batch is a slice of X_train; while it stays bound the full array
# cannot be released, so it is cleared together with X_train.
clear_variable(['X_train_list', 'X_train_batch', 'X_train'])

for start in range(0, len(X_test), batch_size):
  X_test_batch = X_test[start : start + batch_size]
  X_test_list.append(loaded_pca_128.transform(X_test_batch)) # Transform the X_test with PCA

X_test_pca = np.concatenate(X_test_list, axis=0)
# Same reasoning as above: the last batch slice would otherwise pin X_test.
clear_variable(['X_test_list', 'X_test_batch', 'X_test'])

In [None]:
# Candidate hyper-parameters: every (C, gamma) combination is evaluated.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000, 7500, 10000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003],
)

# 5-fold cross-validated search over a default SVC, with progress logging on.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, verbose=3)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the model selected by the grid search on the held-out test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Turn the classification report dict into a table (one row per report entry),
# rounded to 4 decimals, and print it in full.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())

### **PCA 256**

In [None]:
# Build the train/test split: from every subset file the first 800 rows go to
# the training set and the remaining rows to the test set.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

for i in range(len(file_list)):
  # Files are addressed as Shuffled_Subset1..N (i+1), not by the names in file_list.
  file_path = '/content/drive/MyDrive/ML_DL_Datasets/DNA_Datasets/Shuffled_Datasets/Covid_Shuffled_Balanced/Shuffled_Subset{file_counter}.h5'.format(file_counter = i+1)
  read_data = pd.read_hdf(file_path) # Read the current dataset

  data_reshaped = np.array(read_data.drop('Class', axis=1))  # features only
  data_labels = read_data['Class']
  # clear_variable iterates over its argument, so names are passed in a list;
  # a bare string would be walked character by character and free nothing.
  clear_variable(['read_data'])

  X_train_list.append(data_reshaped[:800])
  X_test_list.append(data_reshaped[800:])
  clear_variable(['data_reshaped'])

  y_train_list.append(data_labels[:800])
  y_test_list.append(data_labels[800:])
  clear_variable(['data_labels'])

# Stack the per-file pieces and drop each temporary list right after use.
X_train = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])

X_test = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

y_train = np.concatenate(y_train_list, axis=0)
clear_variable(['y_train_list'])

y_test = np.concatenate(y_test_list, axis=0)
clear_variable(['y_test_list'])

In [None]:
# Project train and test features with loaded_pca_256, 1000 rows at a time.
X_train_list = []
X_test_list = []
batch_size = 1000

for start in range(0, len(X_train), batch_size):
  X_train_batch = X_train[start : start + batch_size]
  X_train_list.append(loaded_pca_256.transform(X_train_batch)) # Transform the X_train with PCA

X_train_pca = np.concatenate(X_train_list, axis=0)
# X_train_batch is a slice of X_train; while it stays bound the full array
# cannot be released, so it is cleared together with X_train.
clear_variable(['X_train_list', 'X_train_batch', 'X_train'])

for start in range(0, len(X_test), batch_size):
  X_test_batch = X_test[start : start + batch_size]
  X_test_list.append(loaded_pca_256.transform(X_test_batch)) # Transform the X_test with PCA

X_test_pca = np.concatenate(X_test_list, axis=0)
# Same reasoning as above: the last batch slice would otherwise pin X_test.
clear_variable(['X_test_list', 'X_test_batch', 'X_test'])

In [None]:
# Candidate hyper-parameters: every (C, gamma) combination is evaluated.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000, 7500, 10000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003],
)

# 5-fold cross-validated search over a default SVC, with progress logging on.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, verbose=3)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the model selected by the grid search on the held-out test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Turn the classification report dict into a table (one row per report entry),
# rounded to 4 decimals, and print it in full.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())