In [None]:
import pandas as pd
import numpy as np
import joblib
import gc
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
def clear_variable(var_list):
    """Remove the named variables from this module's globals, then run the GC.

    Args:
        var_list: Iterable of variable names. Names that are not currently
            defined at module level are silently skipped.

    Returns:
        None.
    """
    module_namespace = globals()
    for name in var_list:
        if name in module_namespace:
            del module_namespace[name]
    # Run a full garbage-collection pass after the references are dropped.
    gc.collect()

## **Training SVM**

In [None]:
# Restore the four saved transformers whose .transform() is applied in the
# sections below. The file names suggest PCA models with 32/64/128/256
# components on 3-mer features -- TODO confirm against the notebook that
# produced them.
# NOTE(review): joblib.load unpickles its input, so only load trusted files.
loaded_pca_32, loaded_pca_64, loaded_pca_128, loaded_pca_256 = (
    joblib.load(model_path)
    for model_path in (
        'models/PCA_n32_3-mer_Covid.pkl',
        'models/PCA_n64_3-mer_Covid.pkl',
        'models/PCA_n128_3-mer_Covid.pkl',
        'models/PCA_n256_3-mer_Covid.pkl',
    )
)

In [None]:
# Read the train/test feature arrays and their label arrays from .npy files
# in the working directory. The file names suggest the features were already
# standard-scaled -- TODO confirm against the preprocessing notebook.
X_train, X_test, y_train, y_test = (
    np.load(array_path)
    for array_path in (
        'PCA_X_train_std_scaled.npy',
        'PCA_X_test_std_scaled.npy',
        'PCA_y_train.npy',
        'PCA_y_test.npy',
    )
)

## **SVM Grid Search**

### PCA32

In [None]:
batch_size = 1000

# Push the training rows through loaded_pca_32 in consecutive slices of
# `batch_size` rows (presumably to bound peak memory), then stitch the
# per-slice outputs back together in their original row order.
X_train_list = [
    loaded_pca_32.transform(X_train[offset : offset + batch_size])
    for offset in range(0, len(X_train), batch_size)
]
X_train_pca = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])  # the per-slice outputs are no longer needed

# Same procedure for the test rows.
X_test_list = [
    loaded_pca_32.transform(X_test[offset : offset + batch_size])
    for offset in range(0, len(X_test), batch_size)
]
X_test_pca = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

In [None]:
# Candidate values for the SVC hyper-parameters explored by the search.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003, 0.001],
)

# Exhaustive search over every (C, gamma) pair with 5-fold cross-validation;
# verbose=3 logs each fit as it runs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Build the report as a dict, then tabulate it with one row per report entry
# and every value rounded to four decimals.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count test-set predictions per (true, predicted) label pair.
conf_matrix = confusion_matrix(y_test, y_pred)

# Both axes are ticked 1..8.
# NOTE(review): this hard-codes eight classes -- confirm it matches the labels.
class_ticks = np.arange(1, 9)

# Draw the counts as an annotated heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

### PCA64

In [None]:
batch_size = 1000

# Transform X_train with loaded_pca_64, one slice of `batch_size` rows at a
# time (presumably to bound peak memory), and rebuild the full matrix by
# concatenating the slices in row order.
train_offsets = range(0, len(X_train), batch_size)
X_train_list = [
    loaded_pca_64.transform(X_train[offset : offset + batch_size])
    for offset in train_offsets
]
X_train_pca = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])  # the per-slice outputs are no longer needed

# Transform X_test the same way.
test_offsets = range(0, len(X_test), batch_size)
X_test_list = [
    loaded_pca_64.transform(X_test[offset : offset + batch_size])
    for offset in test_offsets
]
X_test_pca = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

In [None]:
# Candidate values for the SVC hyper-parameters explored by the search.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003, 0.001],
)

# Exhaustive search over every (C, gamma) pair with 5-fold cross-validation;
# verbose=3 logs each fit as it runs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Build the report as a dict, then tabulate it with one row per report entry
# and every value rounded to four decimals.
class_report = classification_report(y_test, y_pred, output_dict=True)
class_report_df = pd.DataFrame(class_report).T.round(4)

print("Classification Report:")
print(class_report_df.to_string())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the test-set predictions.
# The predictions live in `y_pred` (set by the evaluation cell above); the
# previously referenced name `predictions` is never defined in this notebook
# and raised a NameError on a fresh kernel.
conf_matrix = confusion_matrix(y_test, y_pred)

# Human-readable tick labels, one per class.
# NOTE(review): the order must match the sorted label values that
# confusion_matrix uses for its rows/columns -- confirm against the label
# encoding used when the .npy label files were created.
conf_mat_labels= ['Alpha', 'BA.2.12.1', 'Delta', 'BQ.1.1', 'BA.1.1', 'Gamma', 'BA.5.4', 'BA.4.6']

# Plot the matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=conf_mat_labels[:len(conf_matrix)],  # trim to the matrix size
            yticklabels=conf_mat_labels[:len(conf_matrix)])  # trim to the matrix size
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')

# Keep the y-tick labels horizontal so the variant names stay readable.
plt.yticks(rotation=0)
plt.show()

### PCA128

In [None]:
batch_size = 1000

# Push the training rows through loaded_pca_128 in consecutive slices of
# `batch_size` rows (presumably to bound peak memory), then stitch the
# per-slice outputs back together in their original row order.
X_train_list = [
    loaded_pca_128.transform(X_train[offset : offset + batch_size])
    for offset in range(0, len(X_train), batch_size)
]
X_train_pca = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])  # the per-slice outputs are no longer needed

# Same procedure for the test rows.
X_test_list = [
    loaded_pca_128.transform(X_test[offset : offset + batch_size])
    for offset in range(0, len(X_test), batch_size)
]
X_test_pca = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

In [None]:
# Candidate values for the SVC hyper-parameters explored by the search.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003, 0.001],
)

# Exhaustive search over every (C, gamma) pair with 5-fold cross-validation;
# verbose=3 logs each fit as it runs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count test-set predictions per (true, predicted) label pair.
conf_matrix = confusion_matrix(y_test, y_pred)

# Both axes are ticked 1..8.
# NOTE(review): this hard-codes eight classes -- confirm it matches the labels.
class_ticks = np.arange(1, 9)

# Draw the counts as an annotated heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

### PCA256

In [None]:
batch_size = 1000

# Transform X_train with loaded_pca_256, one slice of `batch_size` rows at a
# time (presumably to bound peak memory), and rebuild the full matrix by
# concatenating the slices in row order.
train_offsets = range(0, len(X_train), batch_size)
X_train_list = [
    loaded_pca_256.transform(X_train[offset : offset + batch_size])
    for offset in train_offsets
]
X_train_pca = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])  # the per-slice outputs are no longer needed

# Transform X_test the same way.
test_offsets = range(0, len(X_test), batch_size)
X_test_list = [
    loaded_pca_256.transform(X_test[offset : offset + batch_size])
    for offset in test_offsets
]
X_test_pca = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

In [None]:
# Candidate values for the SVC hyper-parameters explored by the search.
param_grid = dict(
    C=[800, 1200, 1800, 2500, 5000],
    gamma=[0.3, 0.1, 0.07, 0.03, 0.01, 0.003, 0.001],
)

# Exhaustive search over every (C, gamma) pair with 5-fold cross-validation;
# verbose=3 logs each fit as it runs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

grid.fit(X_train_pca, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count test-set predictions per (true, predicted) label pair.
conf_matrix = confusion_matrix(y_test, y_pred)

# Both axes are ticked 1..8.
# NOTE(review): this hard-codes eight classes -- confirm it matches the labels.
class_ticks = np.arange(1, 9)

# Draw the counts as an annotated heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()