#### Librairies

In [1]:
# External 
import numpy as np 
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
from tqdm import tqdm
import gc

# Internal 
import utils as u 
from kernel_methods import Kernel_Ridge, Kernel_SVM, Kernel_Logistic
from kernels import spectrum_kernel, weighted_degree_kernel
from mismatch_kernel import mismatch_kernel
from data_augmentation import augment_dataset
from cross_validation import cross_val_score_with_precomputed_kernel, cross_val_predict_with_precomputed_kernel, accuracy_score, mode
from meta_models import Bagging_Kernel, Hierarchical_Bagging_Kernel, EnsembleKernel, NLCK
%load_ext autoreload
%autoreload 2

  from tqdm.autonotebook import tqdm


### Pre-Processing : 

In [2]:
# Dataset 1
dataset1 = u.load_and_merge("data/Xtr0.csv", "data/Ytr0.csv")
print("Dataset 1 shape:", dataset1.shape)

# Dataset 2
dataset2 = u.load_and_merge("data/Xtr1.csv", "data/Ytr1.csv")
print("\nDataset 2 shape:", dataset2.shape)

# Dataset 3 
dataset3 = u.load_and_merge("data/Xtr2.csv", "data/Ytr2.csv")
print("\nDataset 3 shape:", dataset3.shape)

print("\n")
print(dataset1.head())


# Training/Validation Split
X_train1, X_test1, y_train1, y_test1 = u.split_data(dataset1, test_size=0., random_state=42, shuffle = False)
X_train2, X_test2, y_train2, y_test2 = u.split_data(dataset2, test_size=0., random_state=42, shuffle = False)
X_train3, X_test3, y_train3, y_test3 = u.split_data(dataset3, test_size=0., random_state=42, shuffle = False)


# Data Augmentation

X_train_aug1, y_train_aug1 = augment_dataset(X_train1, y_train1)
X_train_aug2, y_train_aug2 = augment_dataset(X_train2, y_train2)
X_train_aug3, y_train_aug3 = augment_dataset(X_train3, y_train3)


Dataset 1 shape: (2000, 3)

Dataset 2 shape: (2000, 3)

Dataset 3 shape: (2000, 3)


   Id  Bound                                                seq
0   0      0  GGAGAATCATTTGAACCCGGGAGGTGGAGGTTGCCGTGAGCTGAGA...
1   1      1  ACCCTGCCTACACCGCGGCGGGGACAGGTGGAGGTTTCAACCCCTG...
2   2      1  TGCAAATCTGTAAGCATTTCTCAGGCAATGAATTATGTCAACACAA...
3   3      0  GCGGGACGTGGGCGTCGAGGGTAAGGATATCTGCAGAAGTACTGTC...
4   4      1  GGAGAATAGCATGTATCCGAGAGGTGGAGCTGGCAGTGAGCCGAGA...


### Pre-Computing Kernels

#### Mismatch

In [None]:

datasets = [
    ("aug1", X_train_aug1),
    ("aug2", X_train_aug2),
    ("aug3", X_train_aug3)
]


tasks = [
    (k, m, name, X_aug)
    for k in range(5, 21)
    for m in [1,2]
    for name, X_aug in datasets
]


with tqdm_joblib(tqdm(desc="Computing kernels", total=len(tasks))) as progress_bar:
    results = Parallel(n_jobs=1)(
        delayed(u.compute_and_save_kernel)(mismatch_kernel, name, X_aug, k=k, m=m, normalize=True)
        for k, m, name, X_aug in tasks
    )





  0%|          | 0/39 [00:00<?, ?it/s]

#### Spectrum

In [None]:
datasets = [
    ("aug1", X_train_aug1),
    ("aug2", X_train_aug2),
    ("aug3", X_train_aug3)
]

tasks = [
    (k, name, X_aug)
    for k in range(5, 21)
    for name, X_aug in datasets
]

with tqdm_joblib(tqdm(total=len(tasks), desc="Computing spectrum kernels", position=1)) as progress_bar:
    results = Parallel(n_jobs=12)(
        delayed(u.compute_and_save_kernel)(spectrum_kernel, name, X_aug, k=k)
        for k, name, X_aug in tasks
    )



  0%|          | 0/48 [00:00<?, ?it/s]

#### Weighted Degree Kernel

In [None]:
datasets = [
    ("aug1", X_train_aug1),
    ("aug2", X_train_aug2),
    ("aug3", X_train_aug3)
]

tasks = [
    (d, name, X_aug)
    for d in range(3, 13)
    for name, X_aug in datasets
]

with tqdm_joblib(tqdm(total=len(tasks), desc="Computing weighted degree kernels", position=1)) as progress_bar:
    results = Parallel(n_jobs=12)(
        delayed(u.compute_and_save_kernel)(weighted_degree_kernel, name, X_aug, d=d, normalize=True)
        for d, name, X_aug in tasks
    )



  0%|          | 0/6 [00:00<?, ?it/s]

### Hyper-parameter Tuning 
#### Stage one : Optimize individual Kernel Parameters with C fixed

#### Dataset 1 

Mismatch Kernel

In [None]:
k_values = range(5, 21)  
m_values = [1, 2]
normalize = True
best_score = -np.inf
best_params = {}

for k in k_values:
    for m in m_values:
        C = 1  

        kernel_file = f"precomputed_kernels/mismatch_kernel_aug1_k{k}_m{m}_normalize{normalize}.npy"
        K = np.load(kernel_file)
        
        estimator = Kernel_SVM(kernel='precomputed', C=1)
        
        scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train1, cv=5, n_jobs=12, n_iter=5)
        mean_score = np.mean(scores)
        
        print(f"Parameters: k = {k}, m = {m}, Mean CV Accuracy: {mean_score:.4f}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'm': m}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5, m = 1, Mean CV Accuracy: 0.6060
Parameters: k = 5, m = 2, Mean CV Accuracy: 0.5884
Parameters: k = 6, m = 1, Mean CV Accuracy: 0.6205
Parameters: k = 6, m = 2, Mean CV Accuracy: 0.6031
Parameters: k = 7, m = 1, Mean CV Accuracy: 0.6242
Parameters: k = 7, m = 2, Mean CV Accuracy: 0.6085
Parameters: k = 8, m = 1, Mean CV Accuracy: 0.6299
Parameters: k = 8, m = 2, Mean CV Accuracy: 0.6265
Parameters: k = 9, m = 1, Mean CV Accuracy: 0.6342
Parameters: k = 9, m = 2, Mean CV Accuracy: 0.6330
Parameters: k = 10, m = 1, Mean CV Accuracy: 0.6393
Parameters: k = 10, m = 2, Mean CV Accuracy: 0.6430
Parameters: k = 11, m = 1, Mean CV Accuracy: 0.6411
Parameters: k = 11, m = 2, Mean CV Accuracy: 0.6490
Parameters: k = 12, m = 1, Mean CV Accuracy: 0.6320
Parameters: k = 12, m = 2, Mean CV Accuracy: 0.6444
Parameters: k = 13, m = 1, Mean CV Accuracy: 0.6165
Parameters: k = 13, m = 2, Mean CV Accuracy: 0.6346
Parameters: k = 14, m = 1, Mean CV Accuracy: 0.5935
Parameters: k = 14, m 

Spectrum Kernel

In [None]:
k_values = range(5, 20)

best_score = -np.inf
best_params = {}

for k in k_values:
    C = 1

    kernel_file = f"precomputed_kernels/spectrum_kernel_aug1_k{k}.npy"
    K = np.load(kernel_file)
    
    estimator = Kernel_SVM(kernel='precomputed', C=C)
    
    scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train1, cv=5, n_jobs=12, n_iter=5)
    mean_score = np.mean(scores)
    
    print(f"Parameters: k = {k}  Mean CV Accuracy: {mean_score:.4f}")
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'k': k}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5  Mean CV Accuracy: 0.6214
Parameters: k = 6  Mean CV Accuracy: 0.6232
Parameters: k = 7  Mean CV Accuracy: 0.6180
Parameters: k = 8  Mean CV Accuracy: 0.6255
Parameters: k = 9  Mean CV Accuracy: 0.6159
Parameters: k = 10  Mean CV Accuracy: 0.6085
Parameters: k = 11  Mean CV Accuracy: 0.5957
Parameters: k = 12  Mean CV Accuracy: 0.5896
Parameters: k = 13  Mean CV Accuracy: 0.5854
Parameters: k = 14  Mean CV Accuracy: 0.5833
Parameters: k = 15  Mean CV Accuracy: 0.5823
Parameters: k = 16  Mean CV Accuracy: 0.5825
Parameters: k = 17  Mean CV Accuracy: 0.5815
Parameters: k = 18  Mean CV Accuracy: 0.5807
Parameters: k = 19  Mean CV Accuracy: 0.5799
Best parameters: {'k': 8}
Best cross-validation accuracy: 0.6255


Weighted Degree Kernels

In [None]:
k_values = range(3, 11)

best_score = -np.inf
best_params = {}

for k in k_values:
    C = 1
    kernel_file = f"precomputed_kernels/weighted_degree_kernel_aug1_d{k}_normalizeTrue.npy"
    K = np.load(kernel_file)
    
    estimator = Kernel_SVM(kernel='precomputed', C=C)
    
    scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train1, cv=5, n_jobs=12, n_iter=1)
    mean_score = np.mean(scores)
    
    print(f"Parameters: k = {k}  Mean CV Accuracy: {mean_score:.4f}")
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'k': k}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 3  Mean CV Accuracy: 0.5820
Parameters: k = 4  Mean CV Accuracy: 0.5885
Parameters: k = 5  Mean CV Accuracy: 0.5890
Parameters: k = 6  Mean CV Accuracy: 0.5875
Parameters: k = 7  Mean CV Accuracy: 0.5870
Parameters: k = 8  Mean CV Accuracy: 0.5895
Parameters: k = 9  Mean CV Accuracy: 0.5910
Parameters: k = 10  Mean CV Accuracy: 0.5890
Best parameters: {'k': 9}
Best cross-validation accuracy: 0.591


#### Dataset 2

##### Mismatch Kernel

In [None]:
k_values = range(5, 21)  
m_values = [1, 2]
normalize = True

best_score = -np.inf
best_params = {}

for k in k_values:
    for m in m_values:
        C = 1  
        kernel_file = f"precomputed_kernels/mismatch_kernel_aug2_k{k}_m{m}_normalize{normalize}.npy"
        K = np.load(kernel_file)
        
        estimator = Kernel_SVM(kernel='precomputed', C=C)
        
        scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train2, cv=5, n_jobs=12, n_iter=5)
        mean_score = np.mean(scores)
        
        print(f"Parameters: k = {k}, m = {m}, Mean CV Accuracy: {mean_score:.4f}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'm': m}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5, m = 1, Mean CV Accuracy: 0.6907
Parameters: k = 5, m = 2, Mean CV Accuracy: 0.6569
Parameters: k = 6, m = 1, Mean CV Accuracy: 0.7056
Parameters: k = 6, m = 2, Mean CV Accuracy: 0.6767
Parameters: k = 7, m = 1, Mean CV Accuracy: 0.7257
Parameters: k = 7, m = 2, Mean CV Accuracy: 0.6996
Parameters: k = 8, m = 1, Mean CV Accuracy: 0.7476
Parameters: k = 8, m = 2, Mean CV Accuracy: 0.7189
Parameters: k = 9, m = 1, Mean CV Accuracy: 0.7669
Parameters: k = 9, m = 2, Mean CV Accuracy: 0.7445
Parameters: k = 10, m = 1, Mean CV Accuracy: 0.7749
Parameters: k = 10, m = 2, Mean CV Accuracy: 0.7556
Parameters: k = 11, m = 1, Mean CV Accuracy: 0.7636
Parameters: k = 11, m = 2, Mean CV Accuracy: 0.7684
Parameters: k = 12, m = 1, Mean CV Accuracy: 0.7553
Parameters: k = 12, m = 2, Mean CV Accuracy: 0.7648
Parameters: k = 13, m = 1, Mean CV Accuracy: 0.7507
Parameters: k = 13, m = 2, Mean CV Accuracy: 0.7584
Parameters: k = 14, m = 1, Mean CV Accuracy: 0.7386
Parameters: k = 14, m 

Spectrum Kernel

In [None]:
k_values = range(5, 20)

best_score = -np.inf
best_params = {}

for k in k_values:
    C = 1
    kernel_file = f"precomputed_kernels/spectrum_kernel_aug2_k{k}.npy"
    K = np.load(kernel_file)
    
    estimator = Kernel_SVM(kernel='precomputed', C=C)
    
    scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train2, cv=5, n_jobs=12, n_iter=5)
    mean_score = np.mean(scores)
    
    print(f"Parameters: k = {k}  Mean CV Accuracy: {mean_score:.4f}")
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'k': k}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5  Mean CV Accuracy: 0.6884
Parameters: k = 6  Mean CV Accuracy: 0.7101
Parameters: k = 7  Mean CV Accuracy: 0.7364
Parameters: k = 8  Mean CV Accuracy: 0.7549
Parameters: k = 9  Mean CV Accuracy: 0.7606
Parameters: k = 10  Mean CV Accuracy: 0.7531
Parameters: k = 11  Mean CV Accuracy: 0.7359
Parameters: k = 12  Mean CV Accuracy: 0.7215
Parameters: k = 13  Mean CV Accuracy: 0.7064
Parameters: k = 14  Mean CV Accuracy: 0.7003
Parameters: k = 15  Mean CV Accuracy: 0.6985
Parameters: k = 16  Mean CV Accuracy: 0.6984
Parameters: k = 17  Mean CV Accuracy: 0.6979
Parameters: k = 18  Mean CV Accuracy: 0.6971
Parameters: k = 19  Mean CV Accuracy: 0.6967
Best parameters: {'k': 9}
Best cross-validation accuracy: 0.7606


#### Dataset 3


##### Mismatch Kernel

In [None]:
k_values = range(5, 21)  
m_values = [1, 2]
normalize = True

best_score = -np.inf
best_params = {}

for k in k_values:
    for m in m_values:
        C = 1  
        kernel_file = f"precomputed_kernels/mismatch_kernel_aug3_k{k}_m{m}_normalize{normalize}.npy"
        K = np.load(kernel_file)
        
        estimator = Kernel_SVM(kernel='precomputed', C=C)
        
        scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train3, cv=5, n_jobs=12, n_iter=5)
        mean_score = np.mean(scores)
        
        print(f"Parameters: k = {k}, m = {m}, Mean CV Accuracy: {mean_score:.4f}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'm': m}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5, m = 1, Mean CV Accuracy: 0.6236
Parameters: k = 5, m = 2, Mean CV Accuracy: 0.6058
Parameters: k = 6, m = 1, Mean CV Accuracy: 0.6215
Parameters: k = 6, m = 2, Mean CV Accuracy: 0.6155
Parameters: k = 7, m = 1, Mean CV Accuracy: 0.6266
Parameters: k = 7, m = 2, Mean CV Accuracy: 0.6284
Parameters: k = 8, m = 1, Mean CV Accuracy: 0.6375
Parameters: k = 8, m = 2, Mean CV Accuracy: 0.6340
Parameters: k = 9, m = 1, Mean CV Accuracy: 0.6476
Parameters: k = 9, m = 2, Mean CV Accuracy: 0.6328
Parameters: k = 10, m = 1, Mean CV Accuracy: 0.6541
Parameters: k = 10, m = 2, Mean CV Accuracy: 0.6482
Parameters: k = 11, m = 1, Mean CV Accuracy: 0.6599
Parameters: k = 11, m = 2, Mean CV Accuracy: 0.6513
Parameters: k = 12, m = 1, Mean CV Accuracy: 0.6584
Parameters: k = 12, m = 2, Mean CV Accuracy: 0.6485
Parameters: k = 13, m = 1, Mean CV Accuracy: 0.6511
Parameters: k = 13, m = 2, Mean CV Accuracy: 0.6574
Parameters: k = 14, m = 1, Mean CV Accuracy: 0.6641
Parameters: k = 14, m 

Spectrum Kernel

In [None]:
k_values = range(5, 20)

best_score = -np.inf
best_params = {}

for k in k_values:
    C = 1
    kernel_file = f"precomputed_kernels/spectrum_kernel_aug3_k{k}.npy"
    K = np.load(kernel_file)
    
    estimator = Kernel_SVM(kernel='precomputed', C=C)
    
    scores = cross_val_score_with_precomputed_kernel(estimator, K, y_train3, cv=5, n_jobs=12, n_iter=5)
    mean_score = np.mean(scores)
    
    print(f"Parameters: k = {k}  Mean CV Accuracy: {mean_score:.4f}")
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'k': k}

print("Best parameters:", best_params)
print("Best cross-validation accuracy:", best_score)


Parameters: k = 5  Mean CV Accuracy: 0.6167
Parameters: k = 6  Mean CV Accuracy: 0.6119
Parameters: k = 7  Mean CV Accuracy: 0.6318
Parameters: k = 8  Mean CV Accuracy: 0.6515
Parameters: k = 9  Mean CV Accuracy: 0.6621
Parameters: k = 10  Mean CV Accuracy: 0.6738
Parameters: k = 11  Mean CV Accuracy: 0.6795
Parameters: k = 12  Mean CV Accuracy: 0.6846
Parameters: k = 13  Mean CV Accuracy: 0.6836
Parameters: k = 14  Mean CV Accuracy: 0.6802
Parameters: k = 15  Mean CV Accuracy: 0.6768
Parameters: k = 16  Mean CV Accuracy: 0.6729
Parameters: k = 17  Mean CV Accuracy: 0.6721
Parameters: k = 18  Mean CV Accuracy: 0.6693
Parameters: k = 19  Mean CV Accuracy: 0.6672
Best parameters: {'k': 12}
Best cross-validation accuracy: 0.6846000000000001


### Hyper-parameter Tuning 
#### Stage two : Find the best Meta Model


#### Dataset 1 :


Bagging : 

In [None]:
C = 1 
K = np.load("precomputed_kernels/mismatch_kernel_aug1_k11_m2_normalizeTrue.npy")  

candidate_n_estimators = np.arange(5, 11)
results = {}


model = Kernel_SVM(kernel='precomputed', C=C)
base_accuracy = cross_val_score_with_precomputed_kernel(
    model, K, y_train1, cv=5, n_jobs=13, n_iter=5)

print("Base accuracy:", base_accuracy)
for n_est in candidate_n_estimators:
    
    hierarchical_clf = Hierarchical_Bagging_Kernel(
        base_estimator=Kernel_SVM(kernel='precomputed', C=C),
        n_estimators=n_est,
        bootstrap=True,
        augment_data=True,
        random_state=None,  
        n_jobs=1
    )

    accuracy = cross_val_score_with_precomputed_kernel(
        hierarchical_clf, K, y_train1, cv=5, n_jobs=13, n_iter=2
    )
    
    results[n_est] = accuracy
    print(f"n_estimators = {n_est}: Mean Accuracy = {accuracy:.4f}")

best_n = max(results, key=results.get)
print("\n Best n_estimators:", best_n, "with mean accuracy:", results[best_n])



Base accuracy: 0.649
n_estimators = 5: Mean Accuracy = 0.6425
n_estimators = 6: Mean Accuracy = 0.6450
n_estimators = 7: Mean Accuracy = 0.6472
n_estimators = 8: Mean Accuracy = 0.6500
n_estimators = 9: Mean Accuracy = 0.6413
n_estimators = 10: Mean Accuracy = 0.6447

 Best n_estimators: 8 with mean accuracy: 0.65


Sum of Kernels

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 1, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 1, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")

# Load weighted degree kernels for d = 3...12.
for d in range(3, 13):
    try:
        K = u.load_kernel('weighted_degree', 1, d=d)
        info = {'type': 'weighted_degree', 'd': d}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load weighted degree kernel: d={d}. Error: {e}")

# Define the starting kernel 
start_kernel_specs = [
    {'type': 'mismatch', 'k': 11, 'm': 2},
    {'type': 'mismatch', 'k': 20, 'm': 1},
    {'type': 'mismatch', 'k': 12, 'm': 2},
    {'type': 'mismatch', 'k': 5, 'm': 1}
]

# Select the fixed starting kernels.
start_kernels = []
for spec in start_kernel_specs:
    found = False
    for idx, candidate in enumerate(candidate_kernels):
        info = candidate[1]
        if info['type'] == spec['type'] and info.get('k') == spec['k'] and info.get('m') == spec['m']:
            start_kernels.append(candidate)
            # Remove the candidate from the list.
            candidate_kernels.pop(idx)
            found = True
            break
    if not found:
        raise ValueError(f"Mismatch kernel with k={spec['k']}, m={spec['m']} was not found!")

selected_kernels = start_kernels.copy()

# Evaluate baseline CV score using the combined starting kernels.
combined_start_kernel = np.sum([sk[0] for sk in start_kernels], axis=0) / len(start_kernels)
base_score = cross_val_score_with_precomputed_kernel(
    estimator=Kernel_SVM(kernel='precomputed', C=1),
    K=combined_start_kernel,
    y=y_train1,
    n_jobs=12,
    n_iter=2
)
print(f"\nStarting with fixed kernels {[sk[1] for sk in start_kernels]} CV score: {base_score:.4f}")

# Greedy forward selection to build an ensemble of 10 kernels.
while len(selected_kernels) < 10:
    best_candidate = None
    best_candidate_score = -np.inf

    for candidate in tqdm(candidate_kernels, desc="Evaluating candidate kernels", leave=False):
        # Combine current selected kernels with the candidate by summing and averaging.
        current_kernels = [sk[0] for sk in selected_kernels] + [candidate[0]]
        combined_kernel = np.sum(current_kernels, axis=0) / len(current_kernels)

        score = cross_val_score_with_precomputed_kernel(
            estimator=Kernel_SVM(kernel='precomputed', C=1),
            K=combined_kernel,
            y=y_train1,
            n_jobs=12,
            n_iter=2
        )
        if score > best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate

    # Add the best candidate to the ensemble.
    selected_kernels.append(best_candidate)
    # Remove the best candidate from the candidate list.
    candidate_kernels = [c for c in candidate_kernels if c[1] != best_candidate[1]]
    print(f"Selected kernel {best_candidate[1]}; combination size: {len(selected_kernels)}; CV score: {best_candidate_score:.4f}\n")

print("Final selected combination of 10 kernels:")
for idx, sk in enumerate(selected_kernels):
    print(f"Kernel {idx+1}: {sk[1]}")

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")



Starting with fixed kernels [{'type': 'mismatch', 'k': 11, 'm': 2}, {'type': 'mismatch', 'k': 20, 'm': 1}, {'type': 'mismatch', 'k': 12, 'm': 2}, {'type': 'mismatch', 'k': 5, 'm': 1}] CV score: 0.6585


                                                                             

Selected kernel {'type': 'mismatch', 'k': 11, 'm': 1}; combination size: 5; CV score: 0.6600



                                                                             

Selected kernel {'type': 'mismatch', 'k': 17, 'm': 1}; combination size: 6; CV score: 0.6623



                                                                             

Selected kernel {'type': 'mismatch', 'k': 10, 'm': 1}; combination size: 7; CV score: 0.6620



                                                                             

Selected kernel {'type': 'spectrum', 'k': 11}; combination size: 8; CV score: 0.6640



                                                                             

Selected kernel {'type': 'mismatch', 'k': 5, 'm': 2}; combination size: 9; CV score: 0.6620



                                                                             

Selected kernel {'type': 'mismatch', 'k': 10, 'm': 2}; combination size: 10; CV score: 0.6632

Final selected combination of 10 kernels:
Kernel 1: {'type': 'mismatch', 'k': 11, 'm': 2}
Kernel 2: {'type': 'mismatch', 'k': 20, 'm': 1}
Kernel 3: {'type': 'mismatch', 'k': 12, 'm': 2}
Kernel 4: {'type': 'mismatch', 'k': 5, 'm': 1}
Kernel 5: {'type': 'mismatch', 'k': 11, 'm': 1}
Kernel 6: {'type': 'mismatch', 'k': 17, 'm': 1}
Kernel 7: {'type': 'mismatch', 'k': 10, 'm': 1}
Kernel 8: {'type': 'spectrum', 'k': 11}
Kernel 9: {'type': 'mismatch', 'k': 5, 'm': 2}
Kernel 10: {'type': 'mismatch', 'k': 10, 'm': 2}
Candidate kernels have been unloaded from memory.


Ensemble

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 1, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 1, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")


# Precompute predictions using the first 1000 labels.
candidate_kernels_with_preds = u.precompute_cv_predictions(candidate_kernels, y_train1)

start_kernel = None
start_idx = None
for idx, candidate in enumerate(candidate_kernels_with_preds):
    K, info, y_pred = candidate
    if info['type'] == 'mismatch' and info['k'] == 11 and info.get('m') == 2:
        start_kernel = candidate
        start_idx = idx
        break
if start_kernel is None:
    raise ValueError("Mismatch kernel with k=10, m=1 was not found!")

# Initialize the ensemble with the starting kernel.
selected_kernels = [start_kernel]
del candidate_kernels_with_preds[start_idx]

# Evaluate and print starting kernel accuracy
base_score = accuracy_score(y_train1, start_kernel[2])
print(f"Starting kernel accuracy: {base_score:.4f}")

while len(selected_kernels) < 12 and candidate_kernels_with_preds:
    if selected_kernels:
        current_preds_list = [sk[2] for sk in selected_kernels]
        current_stacked_preds = np.vstack(current_preds_list)
        current_ensemble_preds, _ = mode(current_stacked_preds, axis=0)
        current_ensemble_preds = current_ensemble_preds.ravel()
        current_score = accuracy_score(y_train1, current_ensemble_preds)
    else:
        current_score = 0

    best_candidate = None
    best_candidate_score = -1  

    # Evaluate each candidate kernel.
    for candidate in candidate_kernels_with_preds:
        preds_list = ([sk[2] for sk in selected_kernels] + [candidate[2]]) if selected_kernels else [candidate[2]]
        stacked_preds = np.vstack(preds_list)
        ensemble_preds, _ = mode(stacked_preds, axis=0)
        ensemble_preds = ensemble_preds.ravel()
        score = accuracy_score(y_train1, ensemble_preds)
        
        if score >= best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate


    # Add the best candidate to the selected kernels.
    selected_kernels.append(best_candidate)
    # Remove the selected candidate from further consideration.
    candidate_kernels_with_preds = [c for c in candidate_kernels_with_preds if c[1] != best_candidate[1]]
    
    print(f"Ensemble accuracy with {len(selected_kernels)} kernels: {best_candidate_score:.4f}")

# Evaluate final ensemble accuracy.
final_preds_list = [sk[2] for sk in selected_kernels]
final_stacked_preds = np.vstack(final_preds_list)
final_ensemble_preds, _ = mode(final_stacked_preds, axis=0)
final_ensemble_preds = final_ensemble_preds.ravel()
final_score = accuracy_score(y_train1, final_ensemble_preds)

print(f"\nFinal ensemble accuracy: {final_score:.4f}")
print("Final selected ensemble kernels:")
for sk in selected_kernels:
    print(sk[1])

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")


Precomputing CV predictions: 100%|██████████| 48/48 [21:02<00:00, 26.29s/it]


Starting kernel accuracy: 0.6505
Ensemble accuracy with 2 kernels: 0.6500
Ensemble accuracy with 3 kernels: 0.6495
Ensemble accuracy with 4 kernels: 0.6550
Ensemble accuracy with 5 kernels: 0.6595
Ensemble accuracy with 6 kernels: 0.6615
Ensemble accuracy with 7 kernels: 0.6580
Ensemble accuracy with 8 kernels: 0.6625

Final ensemble accuracy: 0.6625
Final selected ensemble kernels:
{'type': 'mismatch', 'k': 11, 'm': 2}
{'type': 'spectrum', 'k': 12}
{'type': 'mismatch', 'k': 11, 'm': 1}
{'type': 'spectrum', 'k': 5}
{'type': 'spectrum', 'k': 8}
{'type': 'mismatch', 'k': 18, 'm': 2}
{'type': 'mismatch', 'k': 12, 'm': 2}
{'type': 'mismatch', 'k': 13, 'm': 1}
Candidate kernels have been unloaded from memory.


#### Dataset 2 :


Bagging : 

In [None]:
C = 1  
K = np.load("precomputed_kernels/mismatch_kernel_aug2_k10_m1_normalizeTrue.npy")  #

candidate_n_estimators = np.arange(5, 11)
results = {}


model = Kernel_SVM(kernel='precomputed', C=C)
base_accuracy = cross_val_score_with_precomputed_kernel(
    model, K, y_train2, cv=5, n_jobs=13, n_iter=2)

print("Base accuracy:", base_accuracy)
print("\n")

for n_est in candidate_n_estimators:
    
    hierarchical_clf = Hierarchical_Bagging_Kernel(
        base_estimator=Kernel_SVM(kernel='precomputed', C=C),
        n_estimators=n_est,
        bootstrap=True,
        augment_data=True,
        random_state=None,  
        n_jobs=1
    )

    accuracy = cross_val_score_with_precomputed_kernel(
        hierarchical_clf, K, y_train2, cv=5, n_jobs=13, n_iter=2
    )
    
    results[n_est] = accuracy
    print(f"n_estimators = {n_est}: Mean Accuracy = {accuracy:.4f}")

# Determine best candidate based on average accuracy.
best_n = max(results, key=results.get)
print("\n Best n_estimators:", best_n, "with mean accuracy:", results[best_n])



Base accuracy: 0.7734999999999999


n_estimators = 5: Mean Accuracy = 0.7560
n_estimators = 6: Mean Accuracy = 0.7568
n_estimators = 7: Mean Accuracy = 0.7543
n_estimators = 8: Mean Accuracy = 0.7612
n_estimators = 9: Mean Accuracy = 0.7532
n_estimators = 10: Mean Accuracy = 0.7490

 Best n_estimators: 8 with mean accuracy: 0.76125


Sum of Kernels

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 2, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 2, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")

# Load weighted degree kernels for d = 3...12.
for d in range(3, 13):
    try:
        K = u.load_kernel('weighted_degree', 2, d=d)
        info = {'type': 'weighted_degree', 'd': d}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load weighted degree kernel: d={d}. Error: {e}")

start_kernel_specs = [
    {'type': 'mismatch', 'k': 10, 'm': 1},
    {'type': 'mismatch', 'k': 17, 'm': 1},
    {'type': 'mismatch', 'k': 5, 'm': 2},
    {'type': 'mismatch', 'k': 11, 'm': 1},
    {'type': 'spectrum', 'k': 13},   
    {'type': 'mismatch', 'k': 9, 'm': 1},  # Meilleur combinaison ici 
    {'type': 'spectrum', 'k': 16},
    {'type': 'spectrum', 'k': 12},
    {'type': 'spectrum', 'k': 11},
    {'type': 'mismatch', 'k': 10, 'm': 2},
]

# Select the fixed starting kernels.
start_kernels = []
for spec in start_kernel_specs:
    found = False
    for idx, candidate in enumerate(candidate_kernels):
        info = candidate[1]
        if spec['type'] == info['type']:
            if spec['type'] == 'mismatch':
                if info.get('k') == spec.get('k') and info.get('m') == spec.get('m'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
            elif spec['type'] == 'spectrum':
                if info.get('k') == spec.get('k'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
            elif spec['type'] == 'weighted_degree':
                if info.get('d') == spec.get('d'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
    if not found:
        if spec['type'] == 'mismatch':
            raise ValueError(f"Mismatch kernel with k={spec['k']}, m={spec['m']} was not found!")
        elif spec['type'] == 'spectrum':
            raise ValueError(f"Spectrum kernel with k={spec['k']} was not found!")
        elif spec['type'] == 'weighted_degree':
            raise ValueError(f"Weighted degree kernel with d={spec['d']} was not found!")
        else:
            raise ValueError("Kernel specification not recognized!")

# Initialize the selected kernels with the starting kernels.
selected_kernels = start_kernels.copy()

# Evaluate baseline CV score using the combined starting kernels.
combined_start_kernel = np.sum([sk[0] for sk in start_kernels], axis=0) / len(start_kernels)
base_score = cross_val_score_with_precomputed_kernel(
    estimator=Kernel_SVM(kernel='precomputed', C=1),
    K=combined_start_kernel,
    y=y_train2,
    n_jobs=12,
    n_iter=2
)
print(f"\nStarting with fixed kernels {[sk[1] for sk in start_kernels]} CV score: {base_score:.4f}")

# Greedy forward selection to build an ensemble of 10 kernels.
while len(selected_kernels) < 14:
    best_candidate = None
    best_candidate_score = -np.inf

    for candidate in tqdm(candidate_kernels, desc="Evaluating candidate kernels", leave=False):
        # Combine current selected kernels with the candidate by summing and averaging.
        current_kernels = [sk[0] for sk in selected_kernels] + [candidate[0]]
        combined_kernel = np.sum(current_kernels, axis=0) / len(current_kernels)

        score = cross_val_score_with_precomputed_kernel(
            estimator=Kernel_SVM(kernel='precomputed', C=1),
            K=combined_kernel,
            y=y_train2,
            n_jobs=12,
            n_iter=2
        )
        if score > best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate

    # Add the best candidate to the ensemble.
    selected_kernels.append(best_candidate)
    # Remove the best candidate from the candidate list.
    candidate_kernels = [c for c in candidate_kernels if c[1] != best_candidate[1]]
    print(f"Selected kernel {best_candidate[1]}; combination size: {len(selected_kernels)}; CV score: {best_candidate_score:.4f}\n")

print("Final selected combination of 10 kernels:")
for idx, sk in enumerate(selected_kernels):
    print(f"Kernel {idx+1}: {sk[1]}")

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")



Starting with fixed kernels [{'type': 'mismatch', 'k': 10, 'm': 1}, {'type': 'mismatch', 'k': 17, 'm': 1}, {'type': 'mismatch', 'k': 5, 'm': 2}, {'type': 'mismatch', 'k': 11, 'm': 1}, {'type': 'spectrum', 'k': 13}, {'type': 'mismatch', 'k': 9, 'm': 1}, {'type': 'spectrum', 'k': 16}, {'type': 'spectrum', 'k': 12}, {'type': 'spectrum', 'k': 11}, {'type': 'mismatch', 'k': 10, 'm': 2}] CV score: 0.7880


                                                                             

Selected kernel {'type': 'weighted_degree', 'd': 12}; combination size: 11; CV score: 0.7883



                                                                             

Selected kernel {'type': 'spectrum', 'k': 10}; combination size: 12; CV score: 0.7880



                                                                    

KeyboardInterrupt: 

Ensemble

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 2, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 2, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")


candidate_kernels_with_preds = u.precompute_cv_predictions(candidate_kernels, y_train2)

start_kernel = None
start_idx = None
for idx, candidate in enumerate(candidate_kernels_with_preds):
    K, info, y_pred = candidate
    if info['type'] == 'mismatch' and info['k'] == 10 and info.get('m') == 1:
        start_kernel = candidate
        start_idx = idx
        break
if start_kernel is None:
    raise ValueError("Mismatch kernel with k=10, m=1 was not found!")

# Initialize the ensemble with the starting kernel.
selected_kernels = [start_kernel]
del candidate_kernels_with_preds[start_idx]

# Evaluate and print starting kernel accuracy
base_score = accuracy_score(y_train2, start_kernel[2])
print(f"Starting kernel accuracy: {base_score:.4f}")

# Greedy forward selection to build an ensemble of up to 8 kernels,
while len(selected_kernels) < 12 and candidate_kernels_with_preds:
    if selected_kernels:
        current_preds_list = [sk[2] for sk in selected_kernels]
        current_stacked_preds = np.vstack(current_preds_list)
        current_ensemble_preds, _ = mode(current_stacked_preds, axis=0)
        current_ensemble_preds = current_ensemble_preds.ravel()
        current_score = accuracy_score(y_train2, current_ensemble_preds)
    else:
        current_score = 0

    best_candidate = None
    best_candidate_score = -1  

    # Evaluate each candidate kernel.
    for candidate in candidate_kernels_with_preds:
        preds_list = ([sk[2] for sk in selected_kernels] + [candidate[2]]) if selected_kernels else [candidate[2]]
        stacked_preds = np.vstack(preds_list)
        ensemble_preds, _ = mode(stacked_preds, axis=0)
        ensemble_preds = ensemble_preds.ravel()
        score = accuracy_score(y_train2, ensemble_preds)
        
        if score >= best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate


    # Add the best candidate to the selected kernels.
    selected_kernels.append(best_candidate)
    # Remove the selected candidate from further consideration.
    candidate_kernels_with_preds = [c for c in candidate_kernels_with_preds if c[1] != best_candidate[1]]
    
    print(f"Ensemble accuracy with {len(selected_kernels)} kernels: {best_candidate_score:.4f}")

# Evaluate final ensemble accuracy.
final_preds_list = [sk[2] for sk in selected_kernels]
final_stacked_preds = np.vstack(final_preds_list)
final_ensemble_preds, _ = mode(final_stacked_preds, axis=0)
final_ensemble_preds = final_ensemble_preds.ravel()
final_score = accuracy_score(y_train2, final_ensemble_preds)

print(f"\nFinal ensemble accuracy: {final_score:.4f}")
print("Final selected ensemble kernels:")
for sk in selected_kernels:
    print(sk[1])

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")


Precomputing CV predictions: 100%|██████████| 48/48 [20:02<00:00, 25.06s/it]


Starting kernel accuracy: 0.7740
Ensemble accuracy with 2 kernels: 0.7755
Ensemble accuracy with 3 kernels: 0.7790
Ensemble accuracy with 4 kernels: 0.7765
Ensemble accuracy with 5 kernels: 0.7805
Ensemble accuracy with 6 kernels: 0.7810
Ensemble accuracy with 7 kernels: 0.7860
Ensemble accuracy with 8 kernels: 0.7850
Ensemble accuracy with 9 kernels: 0.7885
Ensemble accuracy with 10 kernels: 0.7855
Ensemble accuracy with 11 kernels: 0.7880
Ensemble accuracy with 12 kernels: 0.7855

Final ensemble accuracy: 0.7855
Final selected ensemble kernels:
{'type': 'mismatch', 'k': 10, 'm': 1}
{'type': 'mismatch', 'k': 11, 'm': 2}
{'type': 'spectrum', 'k': 13}
{'type': 'mismatch', 'k': 10, 'm': 2}
{'type': 'spectrum', 'k': 9}
{'type': 'spectrum', 'k': 5}
{'type': 'spectrum', 'k': 8}
{'type': 'mismatch', 'k': 6, 'm': 2}
{'type': 'mismatch', 'k': 18, 'm': 1}
{'type': 'mismatch', 'k': 9, 'm': 1}
{'type': 'mismatch', 'k': 17, 'm': 1}
{'type': 'mismatch', 'k': 5, 'm': 2}
Candidate kernels have been u

#### Dataset 3 :


Bagging : 

In [None]:
C = 1  
K = np.load("precomputed_kernels/mismatch_kernel_aug3_k17_m1_normalizeTrue.npy")  

candidate_n_estimators = np.arange(5, 11)
results = {}


model = Kernel_SVM(kernel='precomputed', C=C)
base_accuracy = cross_val_score_with_precomputed_kernel(
    model, K, y_train3, cv=5, n_jobs=13, n_iter=2)

print("Base accuracy:", base_accuracy)
print("\n")
for n_est in candidate_n_estimators:
    
    hierarchical_clf = Hierarchical_Bagging_Kernel(
        base_estimator=Kernel_SVM(kernel='precomputed', C=C),
        n_estimators=n_est,
        bootstrap=True,
        augment_data=True,
        random_state=None,  
        n_jobs=1
    )

    accuracy = cross_val_score_with_precomputed_kernel(
        hierarchical_clf, K, y_train3, cv=5, n_jobs=13, n_iter=2
    )
    
    results[n_est] = accuracy
    print(f"n_estimators = {n_est}: Mean Accuracy = {accuracy:.4f}")

# Determine best candidate based on average accuracy.
best_n = max(results, key=results.get)
print("\n Best n_estimators:", best_n, "with mean accuracy:", results[best_n])



Base accuracy: 0.68225


n_estimators = 5: Mean Accuracy = 0.6485
n_estimators = 6: Mean Accuracy = 0.6175
n_estimators = 7: Mean Accuracy = 0.6337
n_estimators = 8: Mean Accuracy = 0.6085
n_estimators = 9: Mean Accuracy = 0.6030
n_estimators = 10: Mean Accuracy = 0.6010

 Best n_estimators: 5 with mean accuracy: 0.6485


Sum of Kernels

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 3, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 3, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")

# Load weighted degree kernels for d = 3...12.
for d in range(3, 13):
    try:
        K = u.load_kernel('weighted_degree', 3, d=d)
        info = {'type': 'weighted_degree', 'd': d}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load weighted degree kernel: d={d}. Error: {e}")

start_kernel_specs = [
    {'type': 'mismatch', 'k': 17, 'm': 1},
    {'type': 'spectrum', 'k': 12},
    {'type': 'spectrum', 'k': 13},
    {'type': 'mismatch', 'k': 14, 'm': 2},
]

# Select the fixed starting kernels.
start_kernels = []
for spec in start_kernel_specs:
    found = False
    for idx, candidate in enumerate(candidate_kernels):
        info = candidate[1]
        if spec['type'] == info['type']:
            if spec['type'] == 'mismatch':
                if info.get('k') == spec.get('k') and info.get('m') == spec.get('m'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
            elif spec['type'] == 'spectrum':
                if info.get('k') == spec.get('k'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
            elif spec['type'] == 'weighted_degree':
                if info.get('d') == spec.get('d'):
                    start_kernels.append(candidate)
                    candidate_kernels.pop(idx)
                    found = True
                    break
    if not found:
        if spec['type'] == 'mismatch':
            raise ValueError(f"Mismatch kernel with k={spec['k']}, m={spec['m']} was not found!")
        elif spec['type'] == 'spectrum':
            raise ValueError(f"Spectrum kernel with k={spec['k']} was not found!")
        elif spec['type'] == 'weighted_degree':
            raise ValueError(f"Weighted degree kernel with d={spec['d']} was not found!")
        else:
            raise ValueError("Kernel specification not recognized!")

# Initialize the selected kernels with the starting kernels.
selected_kernels = start_kernels.copy()

# Evaluate baseline CV score using the combined starting kernels.
combined_start_kernel = np.sum([sk[0] for sk in start_kernels], axis=0) / len(start_kernels)
base_score = cross_val_score_with_precomputed_kernel(
    estimator=Kernel_SVM(kernel='precomputed', C=1),
    K=combined_start_kernel,
    y=y_train3,
    n_jobs=12,
    n_iter=2
)
print(f"\nStarting with fixed kernels {[sk[1] for sk in start_kernels]} CV score: {base_score:.4f}")

# Greedy forward selection to build an ensemble of 10 kernels.
while len(selected_kernels) < 14:
    best_candidate = None
    best_candidate_score = -np.inf
    
    for candidate in tqdm(candidate_kernels, desc="Evaluating candidate kernels", leave=False):
        # Combine current selected kernels with the candidate by summing and averaging.
        current_kernels = [sk[0] for sk in selected_kernels] + [candidate[0]]
        combined_kernel = np.sum(current_kernels, axis=0) / len(current_kernels)

        score = cross_val_score_with_precomputed_kernel(
            estimator=Kernel_SVM(kernel='precomputed', C=1),
            K=combined_kernel,
            y=y_train3,
            n_jobs=12,
            n_iter=2
        )
        if score > best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate

    # Add the best candidate to the ensemble.
    selected_kernels.append(best_candidate)
    # Remove the best candidate from the candidate list.
    candidate_kernels = [c for c in candidate_kernels if c[1] != best_candidate[1]]
    print(f"Selected kernel {best_candidate[1]}; combination size: {len(selected_kernels)}; CV score: {best_candidate_score:.4f}\n")

print("Final selected combination of 10 kernels:")
for idx, sk in enumerate(selected_kernels):
    print(f"Kernel {idx+1}: {sk[1]}")

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")



Starting with fixed kernels [{'type': 'mismatch', 'k': 17, 'm': 1}, {'type': 'spectrum', 'k': 12}, {'type': 'spectrum', 'k': 13}, {'type': 'mismatch', 'k': 14, 'm': 2}] CV score: 0.6875


                                                                             

Selected kernel {'type': 'mismatch', 'k': 19, 'm': 1}; combination size: 5; CV score: 0.6875



                                                                             

Selected kernel {'type': 'mismatch', 'k': 15, 'm': 2}; combination size: 6; CV score: 0.6885



                                                                             

Selected kernel {'type': 'mismatch', 'k': 16, 'm': 1}; combination size: 7; CV score: 0.6883



                                                                              

Selected kernel {'type': 'weighted_degree', 'd': 12}; combination size: 8; CV score: 0.6885



                                                                            

KeyboardInterrupt: 

Ensemble

In [None]:
candidate_kernels = []

# Load mismatch kernels for k = 5...20 and m = 1,2.
for k in range(5, 21):
    for m in [1, 2]:
        try:
            K = u.load_kernel('mismatch', 3, k, m)
            info = {'type': 'mismatch', 'k': k, 'm': m}
            candidate_kernels.append((K, info))
        except Exception as e:
            print(f"Could not load mismatch kernel: k={k}, m={m}. Error: {e}")

# Load spectrum kernels for k = 5...20.
for k in range(5, 21):
    try:
        K = u.load_kernel('spectrum', 3, k)
        info = {'type': 'spectrum', 'k': k}
        candidate_kernels.append((K, info))
    except Exception as e:
        print(f"Could not load spectrum kernel: k={k}. Error: {e}")


candidate_kernels_with_preds = u.precompute_cv_predictions(candidate_kernels, y_train3)

start_kernel = None
start_idx = None
for idx, candidate in enumerate(candidate_kernels_with_preds):
    K, info, y_pred = candidate
    if info['type'] == 'mismatch' and info['k'] == 17 and info.get('m') == 1:
        start_kernel = candidate
        start_idx = idx
        break
if start_kernel is None:
    raise ValueError("Mismatch kernel with k=10, m=1 was not found!")

# Initialize the ensemble with the starting kernel.
selected_kernels = [start_kernel]
del candidate_kernels_with_preds[start_idx]

# Evaluate and print starting kernel accuracy
base_score = accuracy_score(y_train3, start_kernel[2])
print(f"Starting kernel accuracy: {base_score:.4f}")

# Greedy forward selection to build an ensemble of up to 8 kernels,
while len(selected_kernels) < 12 and candidate_kernels_with_preds:
    if selected_kernels:
        current_preds_list = [sk[2] for sk in selected_kernels]
        current_stacked_preds = np.vstack(current_preds_list)
        current_ensemble_preds, _ = mode(current_stacked_preds, axis=0)
        current_ensemble_preds = current_ensemble_preds.ravel()
        current_score = accuracy_score(y_train3, current_ensemble_preds)
    else:
        current_score = 0

    best_candidate = None
    best_candidate_score = -1  

    # Evaluate each candidate kernel.
    for candidate in candidate_kernels_with_preds:
        preds_list = ([sk[2] for sk in selected_kernels] + [candidate[2]]) if selected_kernels else [candidate[2]]
        stacked_preds = np.vstack(preds_list)
        ensemble_preds, _ = mode(stacked_preds, axis=0)
        ensemble_preds = ensemble_preds.ravel()
        score = accuracy_score(y_train3, ensemble_preds)
        
        if score >= best_candidate_score:
            best_candidate_score = score
            best_candidate = candidate


    # Add the best candidate to the selected kernels.
    selected_kernels.append(best_candidate)
    # Remove the selected candidate from further consideration.
    candidate_kernels_with_preds = [c for c in candidate_kernels_with_preds if c[1] != best_candidate[1]]
    
    print(f"Ensemble accuracy with {len(selected_kernels)} kernels: {best_candidate_score:.4f}")

# Evaluate final ensemble accuracy.
final_preds_list = [sk[2] for sk in selected_kernels]
final_stacked_preds = np.vstack(final_preds_list)
final_ensemble_preds, _ = mode(final_stacked_preds, axis=0)
final_ensemble_preds = final_ensemble_preds.ravel()
final_score = accuracy_score(y_train3, final_ensemble_preds)

print(f"\nFinal ensemble accuracy: {final_score:.4f}")
print("Final selected ensemble kernels:")
for sk in selected_kernels:
    print(sk[1])

del candidate_kernels
gc.collect()
print("Candidate kernels have been unloaded from memory.")


Precomputing CV predictions: 100%|██████████| 48/48 [21:47<00:00, 27.24s/it]


Starting kernel accuracy: 0.6745
Ensemble accuracy with 2 kernels: 0.6755
Ensemble accuracy with 3 kernels: 0.6800
Ensemble accuracy with 4 kernels: 0.6795
Ensemble accuracy with 5 kernels: 0.6805
Ensemble accuracy with 6 kernels: 0.6810
Ensemble accuracy with 7 kernels: 0.6820
Ensemble accuracy with 8 kernels: 0.6815
Ensemble accuracy with 9 kernels: 0.6825
Ensemble accuracy with 10 kernels: 0.6825
Ensemble accuracy with 11 kernels: 0.6835
Ensemble accuracy with 12 kernels: 0.6835

Final ensemble accuracy: 0.6835
Final selected ensemble kernels:
{'type': 'mismatch', 'k': 17, 'm': 1}
{'type': 'spectrum', 'k': 12}
{'type': 'spectrum', 'k': 7}
{'type': 'spectrum', 'k': 11}
{'type': 'spectrum', 'k': 20}
{'type': 'mismatch', 'k': 7, 'm': 2}
{'type': 'spectrum', 'k': 19}
{'type': 'mismatch', 'k': 12, 'm': 1}
{'type': 'mismatch', 'k': 5, 'm': 2}
{'type': 'spectrum', 'k': 10}
{'type': 'spectrum', 'k': 15}
{'type': 'mismatch', 'k': 7, 'm': 1}
Candidate kernels have been unloaded from memory.


### Hyper-parameter Tuning

#### Stage three : Optimizing C  

In [None]:
K1 = np.load("precomputed_kernels/mismatch_kernel_aug1_k11_m2_normalizeTrue.npy")
K2 = np.load("precomputed_kernels/mismatch_kernel_aug1_k20_m1_normalizeTrue.npy")
K3 = np.load("precomputed_kernels/mismatch_kernel_aug1_k12_m2_normalizeTrue.npy")
K4 = np.load("precomputed_kernels/mismatch_kernel_aug1_k5_m1_normalizeTrue.npy")
K5 = np.load("precomputed_kernels/mismatch_kernel_aug1_k11_m1_normalizeTrue.npy")
K6 = np.load("precomputed_kernels/mismatch_kernel_aug1_k17_m1_normalizeTrue.npy")
K7 = np.load("precomputed_kernels/mismatch_kernel_aug1_k10_m1_normalizeTrue.npy")
K8 = np.load("precomputed_kernels/spectrum_kernel_aug1_k11.npy")

Kernel_list = [K1, K2, K3, K4, K5, K6, K7, K8]

K = np.sum(Kernel_list, axis=0) / len(Kernel_list)

C_values = np.arange(1.1, 1.3, 0.01)

best_C = None
best_accuracy = 0


for C in C_values:
    cv_predictions = []
    
    acc = cross_val_score_with_precomputed_kernel(
        Kernel_SVM(C=C, kernel='precomputed'), K, y_train1, cv=5, n_jobs=13, n_iter=5
    ) 
   
    print(f"C = {C}: Ensemble cross-validated accuracy = {acc:.4f}")
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_C = C

print(f"\nBest C value is {best_C} with an ensemble accuracy of {best_accuracy:.4f}")

C = 1.1: Ensemble cross-validated accuracy = 0.6595
C = 1.11: Ensemble cross-validated accuracy = 0.6595
C = 1.12: Ensemble cross-validated accuracy = 0.6598
C = 1.1300000000000001: Ensemble cross-validated accuracy = 0.6601
C = 1.1400000000000001: Ensemble cross-validated accuracy = 0.6602
C = 1.1500000000000001: Ensemble cross-validated accuracy = 0.6609
C = 1.1600000000000001: Ensemble cross-validated accuracy = 0.6606
C = 1.1700000000000002: Ensemble cross-validated accuracy = 0.6606
C = 1.1800000000000002: Ensemble cross-validated accuracy = 0.6604
C = 1.1900000000000002: Ensemble cross-validated accuracy = 0.6610
C = 1.2000000000000002: Ensemble cross-validated accuracy = 0.6603
C = 1.2100000000000002: Ensemble cross-validated accuracy = 0.6600
C = 1.2200000000000002: Ensemble cross-validated accuracy = 0.6602
C = 1.2300000000000002: Ensemble cross-validated accuracy = 0.6602
C = 1.2400000000000002: Ensemble cross-validated accuracy = 0.6600
C = 1.2500000000000002: Ensemble cross

In [None]:
K1 = np.load("precomputed_kernels/mismatch_kernel_aug2_k10_m1_normalizeTrue.npy")
K2 = np.load("precomputed_kernels/mismatch_kernel_aug2_k17_m1_normalizeTrue.npy")
K3 = np.load("precomputed_kernels/mismatch_kernel_aug2_k5_m2_normalizeTrue.npy")
K4 = np.load("precomputed_kernels/mismatch_kernel_aug2_k11_m1_normalizeTrue.npy")
K5 = np.load("precomputed_kernels/mismatch_kernel_aug2_k9_m1_normalizeTrue.npy")
K6 = np.load("precomputed_kernels/spectrum_kernel_aug2_k13.npy")

Kernel_list = [K1, K2, K3, K4, K5, K6]

K = np.sum(Kernel_list, axis=0) / len(Kernel_list)

C_values = np.arange(1, 1.2, 0.01)

best_C = None
best_accuracy = 0


for C in C_values:
    cv_predictions = []
    
    acc = cross_val_score_with_precomputed_kernel(
        Kernel_SVM(C=C, kernel='precomputed'), K, y_train2, cv=5, n_jobs=13, n_iter=5
    ) 
   
    print(f"C = {C}: Ensemble cross-validated accuracy = {acc:.4f}")
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_C = C

print(f"\nBest C value is {best_C} with an ensemble accuracy of {best_accuracy:.4f}")

C = 1.0: Ensemble cross-validated accuracy = 0.7880
C = 1.01: Ensemble cross-validated accuracy = 0.7878
C = 1.02: Ensemble cross-validated accuracy = 0.7876
C = 1.03: Ensemble cross-validated accuracy = 0.7882
C = 1.04: Ensemble cross-validated accuracy = 0.7880
C = 1.05: Ensemble cross-validated accuracy = 0.7883
C = 1.06: Ensemble cross-validated accuracy = 0.7877
C = 1.07: Ensemble cross-validated accuracy = 0.7883
C = 1.08: Ensemble cross-validated accuracy = 0.7883
C = 1.09: Ensemble cross-validated accuracy = 0.7879
C = 1.1: Ensemble cross-validated accuracy = 0.7882
C = 1.11: Ensemble cross-validated accuracy = 0.7879
C = 1.12: Ensemble cross-validated accuracy = 0.7878
C = 1.1300000000000001: Ensemble cross-validated accuracy = 0.7876
C = 1.1400000000000001: Ensemble cross-validated accuracy = 0.7875
C = 1.1500000000000001: Ensemble cross-validated accuracy = 0.7881
C = 1.1600000000000001: Ensemble cross-validated accuracy = 0.7877
C = 1.1700000000000002: Ensemble cross-valida

In [None]:
K1 = np.load("precomputed_kernels/mismatch_kernel_aug3_k17_m1_normalizeTrue.npy")
K2 = np.load("precomputed_kernels/spectrum_kernel_aug3_k12.npy")
K3 = np.load("precomputed_kernels/spectrum_kernel_aug3_k13.npy")
K4 = np.load("precomputed_kernels/mismatch_kernel_aug3_k14_m2_normalizeTrue.npy")
K5 = np.load("precomputed_kernels/mismatch_kernel_aug3_k19_m1_normalizeTrue.npy")
K6 = np.load("precomputed_kernels/mismatch_kernel_aug3_k15_m2_normalizeTrue.npy")

Kernel_list = [K1, K2, K3, K4, K5, K6]

K = np.sum(Kernel_list, axis=0) / len(Kernel_list)

C_values = np.arange(0.9, 1.1, 0.01)

best_C = None
best_accuracy = 0

for C in C_values:
    cv_predictions = []
    
    acc = cross_val_score_with_precomputed_kernel(
        Kernel_SVM(C=C, kernel='precomputed'), K, y_train3, cv=5, n_jobs=13, n_iter=5
    ) 
   
    print(f"C = {C}: Ensemble cross-validated accuracy = {acc:.4f}")

    if acc > best_accuracy:
        best_accuracy = acc
        best_C = C

print(f"\nBest C value is {best_C} with an ensemble accuracy of {best_accuracy:.4f}")

C = 0.9: Ensemble cross-validated accuracy = 0.6826
C = 0.91: Ensemble cross-validated accuracy = 0.6833
C = 0.92: Ensemble cross-validated accuracy = 0.6842
C = 0.93: Ensemble cross-validated accuracy = 0.6847
C = 0.9400000000000001: Ensemble cross-validated accuracy = 0.6853
C = 0.9500000000000001: Ensemble cross-validated accuracy = 0.6852
C = 0.9600000000000001: Ensemble cross-validated accuracy = 0.6857
C = 0.9700000000000001: Ensemble cross-validated accuracy = 0.6869
C = 0.9800000000000001: Ensemble cross-validated accuracy = 0.6873
C = 0.9900000000000001: Ensemble cross-validated accuracy = 0.6871
C = 1.0: Ensemble cross-validated accuracy = 0.6873
C = 1.0100000000000002: Ensemble cross-validated accuracy = 0.6856
C = 1.02: Ensemble cross-validated accuracy = 0.6844
C = 1.0300000000000002: Ensemble cross-validated accuracy = 0.6830
C = 1.04: Ensemble cross-validated accuracy = 0.6825
C = 1.0500000000000003: Ensemble cross-validated accuracy = 0.6826
C = 1.06: Ensemble cross-val