In [None]:
import gc

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
def clear_variable(var_list):
    """Drop the named variables from this module's globals, then run the GC.

    Args:
        var_list: Iterable of variable names (strings). Names that are not
            currently defined are ignored.

    Returns:
        None.
    """
    namespace = globals()
    for name in var_list:
        # pop() with a default so a missing name is a no-op, not a KeyError.
        namespace.pop(name, None)
    gc.collect()

In [None]:
# Load the four persisted PCA objects (n32 / n64 / n128 / n256 per their file
# names); later cells call `.transform` on them.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
(loaded_pca_32,
 loaded_pca_64,
 loaded_pca_128,
 loaded_pca_256) = map(joblib.load, (
    'models/PCA_n32_3-mer_Covid.pkl',
    'models/PCA_n64_3-mer_Covid.pkl',
    'models/PCA_n128_3-mer_Covid.pkl',
    'models/PCA_n256_3-mer_Covid.pkl',
))

In [None]:
# Read the saved feature matrices and label vectors from disk.
# Presumably the features are already standardized ("std_scaled" in the file
# names) — TODO confirm against the notebook that produced them.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        'PCA_X_train_std_scaled.npy',
        'PCA_X_test_std_scaled.npy',
        'PCA_y_train.npy',
        'PCA_y_test.npy',
    )
)

## Random Forest Grid Search

### PCA32

In [None]:
# Project the train/test features with `loaded_pca_32`, `batch_size` rows at a
# time (presumably to bound the size of the temporaries created by
# `transform`), and stitch the per-batch results back together in row order.
# The batch lists are built inline, so no scratch variables are left behind in
# the notebook namespace and no by-name cleanup is needed.
batch_size = 1000

X_train_pca = np.concatenate(
    [loaded_pca_32.transform(X_train[start:start + batch_size])
     for start in range(0, len(X_train), batch_size)],
    axis=0,
)

X_test_pca = np.concatenate(
    [loaded_pca_32.transform(X_test[start:start + batch_size])
     for start in range(0, len(X_test), batch_size)],
    axis=0,
)

# Every projected row must still line up with its label.
assert X_train_pca.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_pca.shape[0] == len(y_test), "test features/labels are misaligned"

In [None]:
# Hyper-parameter grid for the random forest. The search is exhaustive:
# 2 * 2 * 3 * 3 * 2 * 1 = 72 candidates, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False]
}

# Seed the forest so the search is reproducible: without a fixed random_state
# the per-split feature sampling differs on every run, so both the CV scores
# and the selected "best" parameters can change between executions.
grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                    param_grid=param_grid,
                    cv=3,
                    verbose=3)

# Fit the grid search to the PCA-projected training data
grid.fit(X_train_pca, y_train)

In [None]:
# Take the best estimator found by the grid search and score it on the
# test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning hyper-parameters followed by the test accuracy.
print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Confusion matrix of the test-set predictions.
# The class labels are taken from the values actually present in y_test and
# y_pred (sorted) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels always agree, whatever the number of
# classes or their encoding.
class_labels = np.unique(np.concatenate((y_test, y_pred)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

### PCA64

In [None]:
# Project the train/test features with `loaded_pca_64`, `batch_size` rows at a
# time (presumably to bound the size of the temporaries created by
# `transform`), and stitch the per-batch results back together in row order.
# The batch lists are built inline, so no scratch variables are left behind in
# the notebook namespace and no by-name cleanup is needed.
batch_size = 1000

X_train_pca = np.concatenate(
    [loaded_pca_64.transform(X_train[start:start + batch_size])
     for start in range(0, len(X_train), batch_size)],
    axis=0,
)

X_test_pca = np.concatenate(
    [loaded_pca_64.transform(X_test[start:start + batch_size])
     for start in range(0, len(X_test), batch_size)],
    axis=0,
)

# Every projected row must still line up with its label.
assert X_train_pca.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_pca.shape[0] == len(y_test), "test features/labels are misaligned"

In [None]:
# Hyper-parameter grid for the random forest. The search is exhaustive:
# 2 * 2 * 3 * 3 * 2 * 1 = 72 candidates, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False]
}

# Seed the forest so the search is reproducible: without a fixed random_state
# the per-split feature sampling differs on every run, so both the CV scores
# and the selected "best" parameters can change between executions.
grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                    param_grid=param_grid,
                    cv=3,
                    verbose=3)

# Fit the grid search to the PCA-projected training data
grid.fit(X_train_pca, y_train)

In [None]:
# Take the best estimator found by the grid search and score it on the
# test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning hyper-parameters followed by the test accuracy.
print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Confusion matrix of the test-set predictions.
# The class labels are taken from the values actually present in y_test and
# y_pred (sorted) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels always agree, whatever the number of
# classes or their encoding.
class_labels = np.unique(np.concatenate((y_test, y_pred)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

### PCA128

In [None]:
# Project the train/test features with `loaded_pca_128`, `batch_size` rows at a
# time (presumably to bound the size of the temporaries created by
# `transform`), and stitch the per-batch results back together in row order.
# The batch lists are built inline, so no scratch variables are left behind in
# the notebook namespace and no by-name cleanup is needed.
batch_size = 1000

X_train_pca = np.concatenate(
    [loaded_pca_128.transform(X_train[start:start + batch_size])
     for start in range(0, len(X_train), batch_size)],
    axis=0,
)

X_test_pca = np.concatenate(
    [loaded_pca_128.transform(X_test[start:start + batch_size])
     for start in range(0, len(X_test), batch_size)],
    axis=0,
)

# Every projected row must still line up with its label.
assert X_train_pca.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_pca.shape[0] == len(y_test), "test features/labels are misaligned"

In [None]:
# Hyper-parameter grid for the random forest. The search is exhaustive:
# 2 * 2 * 3 * 3 * 2 * 1 = 72 candidates, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False]
}

# Seed the forest so the search is reproducible: without a fixed random_state
# the per-split feature sampling differs on every run, so both the CV scores
# and the selected "best" parameters can change between executions.
grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                    param_grid=param_grid,
                    cv=3,
                    verbose=3)

# Fit the grid search to the PCA-projected training data
grid.fit(X_train_pca, y_train)

In [None]:
# Take the best estimator found by the grid search and score it on the
# test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning hyper-parameters followed by the test accuracy.
print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Confusion matrix of the test-set predictions.
# The class labels are taken from the values actually present in y_test and
# y_pred (sorted) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels always agree, whatever the number of
# classes or their encoding.
class_labels = np.unique(np.concatenate((y_test, y_pred)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

### PCA256

In [None]:
# Project the train/test features with `loaded_pca_256`, `batch_size` rows at a
# time (presumably to bound the size of the temporaries created by
# `transform`), and stitch the per-batch results back together in row order.
# The batch lists are built inline, so no scratch variables are left behind in
# the notebook namespace and no by-name cleanup is needed.
batch_size = 1000

X_train_pca = np.concatenate(
    [loaded_pca_256.transform(X_train[start:start + batch_size])
     for start in range(0, len(X_train), batch_size)],
    axis=0,
)

X_test_pca = np.concatenate(
    [loaded_pca_256.transform(X_test[start:start + batch_size])
     for start in range(0, len(X_test), batch_size)],
    axis=0,
)

# Every projected row must still line up with its label.
assert X_train_pca.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_pca.shape[0] == len(y_test), "test features/labels are misaligned"

In [None]:
# Hyper-parameter grid for the random forest. The search is exhaustive:
# 2 * 2 * 3 * 3 * 2 * 1 = 72 candidates, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False]
}

# Seed the forest so the search is reproducible: without a fixed random_state
# the per-split feature sampling differs on every run, so both the CV scores
# and the selected "best" parameters can change between executions.
grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                    param_grid=param_grid,
                    cv=3,
                    verbose=3)

# Fit the grid search to the PCA-projected training data
grid.fit(X_train_pca, y_train)

In [None]:
# Take the best estimator found by the grid search and score it on the
# test split.
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning hyper-parameters followed by the test accuracy.
print('Best parameters: {x}'.format(x=grid.best_params_))
print("Test Set Accuracy:", accuracy)

In [None]:
# Confusion matrix of the test-set predictions.
# The class labels are taken from the values actually present in y_test and
# y_pred (sorted) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels always agree, whatever the number of
# classes or their encoding.
class_labels = np.unique(np.concatenate((y_test, y_pred)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()