In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from collections import Counter

2025-03-31 19:36:03.779925: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load binary feature matrix for each histone mark
'''Replace tsv file with "H3K27me3_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv", 
"H3K36me3_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv", "H3K4me1_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv",
"H3K4me3_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv", "H3K9me3_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv"
based on the histone mark you want to do the analysis on.
'''
data = pd.read_csv("H3K27ac_CUTOFF10000_CUTOFF10000_metapeak_counts_binary.tsv", sep = "\t")
data = data.transpose()
data.reset_index(drop = True, inplace = True)
data

Unnamed: 0,chr1_840917_844971_n_1,chr1_999222_1001990_n_2,chr1_1024892_1028188_n_3,chr1_1135735_1138195_n_4,chr1_1217473_1220087_n_5,chr1_1297695_1303428_n_6,chr1_1356115_1362473_n_7,chr1_1398275_1400662_n_8,chr1_1418435_1435676_n_9,chr1_1539204_1541760_n_10,...,chrY_11063329_11066391_n_19763,chrY_12661354_12663911_n_19764,chrY_13477999_13480655_n_19765,chrY_14523175_14526479_n_19766,chrY_19075455_19078099_n_19767,chrY_21072393_21083950_n_19768,chrY_21135972_21142739_n_19769,chrY_21173280_21177987_n_19770,chrY_21238808_21243547_n_19771,chrY_21247559_21259006_n_19772
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,1,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1561,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1562,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1563,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Different encoding dimensions and L1 regularization values to test
encoding_dims = [5, 10, 20, 50, 100, 500]  # Bottleneck sizes (autoencoder latent dimensions)
l1_values = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]  # L1 regularization strengths
n_splits = 5  # Number of folds in cross-validation
feature_selection_thresholds = [0.6, 0.7, 0.8, 0.9]  # Thresholds to test for feature selection

In [4]:
# Initialize dictionaries for storing results
selected_features_per_dim = {} # Final selected features for each encoding dimension
results = {} # Store results per encoding_dim and L1 value
all_loss_histories = {}  # Store average loss curves for each (encoding_dim, L1) combo

In [5]:
# Cross-validation setup
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [6]:
# Perform K-Fold cross-validation
# Iterate over combinations of encoding dimension and L1 regularization strength
for encoding_dim in encoding_dims:
    for l1_value in l1_values:
        print(f"\nProcessing Encoding Dimension: {encoding_dim} L1 value: {l1_value}")

        # Dictionary to store the best threshold chosen in each fold
        fold_best_thresholds = []  
        fold_selected_features = []
        fold_feature_importances = []
        fold_train_losses = []  # Store training loss per fold
        fold_val_losses = []  # Store validation loss per fold
        fold_histories = []  # Store loss history for each fold
        all_train_losses = []
        all_val_losses = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
            print(f"  Fold {fold + 1}/{n_splits}")
            train_data, val_data = data.iloc[train_idx], data.iloc[val_idx]

            # Build autoencoder model
            input_dim = data.shape[1]
            input_layer = keras.Input(shape=(input_dim,))

            # Encoder with L1 regularization
            encoded = layers.Dense(512, activation="relu", activity_regularizer=regularizers.l1(l1_value))(input_layer)
            encoded = layers.Dense(256, activation="relu")(encoded)
            encoded = layers.Dense(encoding_dim, activation="sigmoid")(encoded)

            # Decoder
            decoded = layers.Dense(256, activation="relu")(encoded)
            decoded = layers.Dense(512, activation="relu")(decoded)
            decoded = layers.Dense(input_dim, activation="sigmoid")(decoded)

            autoencoder = keras.Model(input_layer, decoded)
            autoencoder.compile(optimizer="adam", loss="mse")

            # Early stopping to prevent overfitting
            early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            # Train autoencoder
            history = autoencoder.fit(
                train_data, train_data,
                epochs=100, batch_size=32, verbose=0,
                validation_data=(val_data, val_data),
                callbacks=[early_stopping]
            )

            # Store loss history for graphing later
            fold_histories.append(history.history) # Store per-fold history

            # Store last training and validation loss
            fold_train_losses.append(history.history['loss'][-1])
            fold_val_losses.append(history.history['val_loss'][-1])

            # Store loss histories
            all_train_losses.append(history.history['loss']) # All epoch train losses
            all_val_losses.append(history.history['val_loss']) # All epoch val losses

            # Extract encoder model and weights for feature importance
            encoder = keras.Model(input_layer, encoded)
            weights = encoder.layers[1].get_weights()[0]  # Get first encoder layer weights
            feature_importance = np.mean(np.abs(weights), axis=1)  # Mean absolute weight per feature
            
            # Assume feature_importance is already calculated (from encoder weights)
            feature_importance = np.array(feature_importance)  # Convert to NumPy array

            # Normalize feature importance values (scale between 0 and 1)
            feature_importance = (feature_importance - feature_importance.min()) / (feature_importance.max() - feature_importance.min())
            
            fold_feature_importances.append(feature_importance)  # Store for later us
            
            # Find best feature selection threshold
            target_num_features = 100
            best_threshold, best_num_features = None, float('inf')
            best_selected_features = []
            
            # Find best threshold that keeps number of features 
            for threshold in feature_selection_thresholds:
                selected_indices = np.where(feature_importance > threshold)[0]
                num_selected = len(selected_indices)

                if num_selected == 0:
                    continue  # Skip if no features selected

                if num_selected < target_num_features:
                    best_threshold = threshold
                    best_num_features = num_selected
                    best_selected_features = selected_indices
                    break

            print(f"Best Threshold for Fold {fold + 1}: {best_threshold}, Features: {best_num_features}")

            # Store best threshold and selected features for this fold
            fold_best_thresholds.append(best_threshold)
            fold_selected_features.append(best_selected_features)

         # **Padding Losses to the Same Length**
        max_epochs = max(len(loss) for loss in all_train_losses)  # Find the longest history

        def pad_loss(loss, max_len):
            return np.pad(loss, (0, max_len - len(loss)), mode='edge')  # Repeat last value

        all_train_losses = np.array([pad_loss(loss, max_epochs) for loss in all_train_losses])
        all_val_losses = np.array([pad_loss(loss, max_epochs) for loss in all_val_losses])

        # Average loss across folds
        avg_train_losses = np.mean(all_train_losses, axis=0)
        avg_val_losses = np.mean(all_val_losses, axis=0)
        all_loss_histories[f"EncDim={encoding_dim} L1value={l1_value}"] = (avg_train_losses, avg_val_losses)

        # Compute average losses over folds
        avg_train_loss = np.mean(fold_train_losses)
        avg_val_loss = np.mean(fold_val_losses)
        print(f"\nFinal Avg Training Loss: {avg_train_loss:.6f}")
        print(f"Final Avg Validation Loss: {avg_val_loss:.6f}")

        # Determine most frequent threshold across folds
        threshold_counter = Counter(fold_best_thresholds)
        best_global_threshold = threshold_counter.most_common(1)[0][0]

        print(f"\nBest Global Threshold for Encoding Dimension {encoding_dim} and L1 value {l1_value}: {best_global_threshold}")

        # **Step 2: Select Features Across All Folds Using Best Global Threshold (Without Recomputing)**
        global_selected_features = set()

        for feature_importance in fold_feature_importances:
            selected_indices = np.where(feature_importance > best_global_threshold)[0]
            global_selected_features.update(selected_indices)

        print(f"\nFinal Selected Features for Encoding Dimension {encoding_dim} and L1 value {l1_value}: {len(global_selected_features)}")

        # Store final selected features for this encoding dimension
        selected_features_per_dim[encoding_dim] = global_selected_features

        # Final reduced dataset
        final_selected_features = list(global_selected_features)
        reduced_data = data.iloc[:, final_selected_features]  # Final reduced dataset

        # Store results
        key = f"EncDim={encoding_dim} L1value={l1_value}"
        results[key] = {
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "selected_features": reduced_data.columns,
            "reduced_dataset": reduced_data
        }

        # Plot loss curves for each fold
        plt.figure(figsize=(10, 6))
        for i, history in enumerate(fold_histories):
            plt.plot(history['loss'], label=f"Train Fold {i+1}", linestyle='dashed', alpha=0.7)
            plt.plot(history['val_loss'], label=f"Val Fold {i+1}", linestyle='dotted', alpha=0.7)

        plt.xlabel("Epochs")
        plt.ylabel("Loss (MSE)")
        plt.title(f"Training vs. Validation Loss per Fold (EncDim={encoding_dim}, L1={l1_value})")
        plt.legend()

        # Save the plot
        plot_filename = f"H3K27ac_folds_loss_EncDim{encoding_dim}, L1={l1_value}.png" #Replace "H3K27ac" with the histone mark you select
        plt.savefig(plot_filename)
        plt.close()


Processing Encoding Dimension: 5 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 31
  Fold 2/5
Best Threshold for Fold 2: 0.7, Features: 99
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 54
  Fold 4/5
Best Threshold for Fold 4: 0.7, Features: 96
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 70

Final Avg Training Loss: 0.042047
Final Avg Validation Loss: 0.046267

Best Global Threshold for Encoding Dimension 5 and L1 value 1e-06: 0.8

Final Selected Features for Encoding Dimension 5 and L1 value 1e-06: 171


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 5 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 83
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 18
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 38
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 12
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 88

Final Avg Training Loss: 0.070134
Final Avg Validation Loss: 0.072134

Best Global Threshold for Encoding Dimension 5 and L1 value 1e-05: 0.9

Final Selected Features for Encoding Dimension 5 and L1 value 1e-05: 88


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 5 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 20
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 90
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 64
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 51
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 99

Final Avg Training Loss: 0.098868
Final Avg Validation Loss: 0.099070

Best Global Threshold for Encoding Dimension 5 and L1 value 0.0001: 0.8

Final Selected Features for Encoding Dimension 5 and L1 value 0.0001: 455


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 5 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 12
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 19
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 27
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 21
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 14

Final Avg Training Loss: 0.106063
Final Avg Validation Loss: 0.106132

Best Global Threshold for Encoding Dimension 5 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 5 and L1 value 0.001: 93


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 5 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 57
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 39
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 27
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 17
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 28

Final Avg Training Loss: 0.106039
Final Avg Validation Loss: 0.105971

Best Global Threshold for Encoding Dimension 5 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 5 and L1 value 0.01: 114


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 10 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.7, Features: 52
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 39
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 24
  Fold 4/5
Best Threshold for Fold 4: 0.7, Features: 47
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 18

Final Avg Training Loss: 0.037506
Final Avg Validation Loss: 0.043915

Best Global Threshold for Encoding Dimension 10 and L1 value 1e-06: 0.7

Final Selected Features for Encoding Dimension 10 and L1 value 1e-06: 1249


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 10 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 21
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 25
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 91
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 19
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 73

Final Avg Training Loss: 0.051675
Final Avg Validation Loss: 0.054706

Best Global Threshold for Encoding Dimension 10 and L1 value 1e-05: 0.9

Final Selected Features for Encoding Dimension 10 and L1 value 1e-05: 84


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 10 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 87
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 8
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 8
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 43
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 21

Final Avg Training Loss: 0.098911
Final Avg Validation Loss: 0.098850

Best Global Threshold for Encoding Dimension 10 and L1 value 0.0001: 0.9

Final Selected Features for Encoding Dimension 10 and L1 value 0.0001: 44


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 10 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 8
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 9
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 34
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 49
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 72

Final Avg Training Loss: 0.106087
Final Avg Validation Loss: 0.106048

Best Global Threshold for Encoding Dimension 10 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 10 and L1 value 0.001: 104


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 10 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 29
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 10
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 14
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 15
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 44

Final Avg Training Loss: 0.106036
Final Avg Validation Loss: 0.105960

Best Global Threshold for Encoding Dimension 10 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 10 and L1 value 0.01: 111


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 20 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 80
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 86
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 14
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 24
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 18

Final Avg Training Loss: 0.035497
Final Avg Validation Loss: 0.043268

Best Global Threshold for Encoding Dimension 20 and L1 value 1e-06: 0.8

Final Selected Features for Encoding Dimension 20 and L1 value 1e-06: 422


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 20 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 31
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 52
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 18
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 14
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 94

Final Avg Training Loss: 0.047335
Final Avg Validation Loss: 0.051632

Best Global Threshold for Encoding Dimension 20 and L1 value 1e-05: 0.8

Final Selected Features for Encoding Dimension 20 and L1 value 1e-05: 581


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 20 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 19
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 15
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 26
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 25
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 20

Final Avg Training Loss: 0.106027
Final Avg Validation Loss: 0.105970

Best Global Threshold for Encoding Dimension 20 and L1 value 0.0001: 0.9

Final Selected Features for Encoding Dimension 20 and L1 value 0.0001: 105


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 20 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 35
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 11
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 28
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 29
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 23

Final Avg Training Loss: 0.106054
Final Avg Validation Loss: 0.106112

Best Global Threshold for Encoding Dimension 20 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 20 and L1 value 0.001: 125


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 20 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 24
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 17
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 41
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 40
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 40

Final Avg Training Loss: 0.106107
Final Avg Validation Loss: 0.106069

Best Global Threshold for Encoding Dimension 20 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 20 and L1 value 0.01: 161


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 50 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 14
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 25
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 77
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 18
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 18

Final Avg Training Loss: 0.033660
Final Avg Validation Loss: 0.042546

Best Global Threshold for Encoding Dimension 50 and L1 value 1e-06: 0.8

Final Selected Features for Encoding Dimension 50 and L1 value 1e-06: 223


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 50 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 15
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 18
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 52
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 8
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 14

Final Avg Training Loss: 0.041602
Final Avg Validation Loss: 0.047720

Best Global Threshold for Encoding Dimension 50 and L1 value 1e-05: 0.9

Final Selected Features for Encoding Dimension 50 and L1 value 1e-05: 47


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 50 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 25
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 29
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 64
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 19
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 27

Final Avg Training Loss: 0.098036
Final Avg Validation Loss: 0.098434

Best Global Threshold for Encoding Dimension 50 and L1 value 0.0001: 0.8

Final Selected Features for Encoding Dimension 50 and L1 value 0.0001: 427


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 50 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 26
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 16
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 32
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 28
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 7

Final Avg Training Loss: 0.106172
Final Avg Validation Loss: 0.106023

Best Global Threshold for Encoding Dimension 50 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 50 and L1 value 0.001: 84


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 50 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 9
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 32
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 6
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 26
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 23

Final Avg Training Loss: 0.106052
Final Avg Validation Loss: 0.106041

Best Global Threshold for Encoding Dimension 50 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 50 and L1 value 0.01: 95


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 100 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 55
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 90
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 80
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 78
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 23

Final Avg Training Loss: 0.032943
Final Avg Validation Loss: 0.042386

Best Global Threshold for Encoding Dimension 100 and L1 value 1e-06: 0.8

Final Selected Features for Encoding Dimension 100 and L1 value 1e-06: 292


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 100 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.8, Features: 16
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 13
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 13
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 6
  Fold 5/5
Best Threshold for Fold 5: 0.7, Features: 65

Final Avg Training Loss: 0.042032
Final Avg Validation Loss: 0.048167

Best Global Threshold for Encoding Dimension 100 and L1 value 1e-05: 0.8

Final Selected Features for Encoding Dimension 100 and L1 value 1e-05: 250


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 100 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 43
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 10
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 18
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 96
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 23

Final Avg Training Loss: 0.098063
Final Avg Validation Loss: 0.098266

Best Global Threshold for Encoding Dimension 100 and L1 value 0.0001: 0.9

Final Selected Features for Encoding Dimension 100 and L1 value 0.0001: 105


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 100 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 42
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 11
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 29
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 9
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 51

Final Avg Training Loss: 0.106050
Final Avg Validation Loss: 0.106003

Best Global Threshold for Encoding Dimension 100 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 100 and L1 value 0.001: 140


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 100 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 19
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 32
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 25
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 14
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 50

Final Avg Training Loss: 0.106104
Final Avg Validation Loss: 0.106073

Best Global Threshold for Encoding Dimension 100 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 100 and L1 value 0.01: 140


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 500 L1 value: 1e-06
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 19
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 36
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 15
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 46
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 37

Final Avg Training Loss: 0.032846
Final Avg Validation Loss: 0.042621

Best Global Threshold for Encoding Dimension 500 and L1 value 1e-06: 0.8

Final Selected Features for Encoding Dimension 500 and L1 value 1e-06: 319


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 500 L1 value: 1e-05
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 15
  Fold 2/5
Best Threshold for Fold 2: 0.8, Features: 81
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 9
  Fold 4/5
Best Threshold for Fold 4: 0.8, Features: 33
  Fold 5/5
Best Threshold for Fold 5: 0.7, Features: 29

Final Avg Training Loss: 0.044037
Final Avg Validation Loss: 0.050938

Best Global Threshold for Encoding Dimension 500 and L1 value 1e-05: 0.9

Final Selected Features for Encoding Dimension 500 and L1 value 1e-05: 37


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 500 L1 value: 0.0001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 58
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 16
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 75
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 10
  Fold 5/5
Best Threshold for Fold 5: 0.8, Features: 18

Final Avg Training Loss: 0.106019
Final Avg Validation Loss: 0.106089

Best Global Threshold for Encoding Dimension 500 and L1 value 0.0001: 0.9

Final Selected Features for Encoding Dimension 500 and L1 value 0.0001: 92


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 500 L1 value: 0.001
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 11
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 33
  Fold 3/5
Best Threshold for Fold 3: 0.8, Features: 20
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 24
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 8

Final Avg Training Loss: 0.106124
Final Avg Validation Loss: 0.106083

Best Global Threshold for Encoding Dimension 500 and L1 value 0.001: 0.9

Final Selected Features for Encoding Dimension 500 and L1 value 0.001: 77


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)



Processing Encoding Dimension: 500 L1 value: 0.01
  Fold 1/5
Best Threshold for Fold 1: 0.9, Features: 26
  Fold 2/5
Best Threshold for Fold 2: 0.9, Features: 30
  Fold 3/5
Best Threshold for Fold 3: 0.9, Features: 11
  Fold 4/5
Best Threshold for Fold 4: 0.9, Features: 40
  Fold 5/5
Best Threshold for Fold 5: 0.9, Features: 17

Final Avg Training Loss: 0.106092
Final Avg Validation Loss: 0.106103

Best Global Threshold for Encoding Dimension 500 and L1 value 0.01: 0.9

Final Selected Features for Encoding Dimension 500 and L1 value 0.01: 123


  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)
  plt.savefig(plot_filename)


In [7]:
results

{'EncDim=5 L1value=1e-06': {'train_loss': 0.04204738661646843,
  'val_loss': 0.04626746922731399,
  'selected_features': Index(['chr16_85994517_85997209_n_7687', 'chr2_31217928_31220819_n_10248',
         'chr10_7469914_7472961_n_2067', 'chr19_932187_935654_n_9238',
         'chr22_46659525_46662977_n_12828', 'chr17_42365825_42368555_n_8223',
         'chr3_59992714_59997423_n_13347', 'chr17_42891380_42895676_n_8230',
         'chr3_63932544_63936182_n_13358', 'chr3_64341753_64346095_n_13365',
         ...
         'chr22_42297470_42301861_n_12773', 'chr7_127075_130288_n_16868',
         'chr15_67023465_67026882_n_6635', 'chr9_93082136_93084751_n_18929',
         'chr10_80525765_80530432_n_2545', 'chr14_22835733_22839911_n_5618',
         'chr6_85448834_85452100_n_16373', 'chr3_53700458_53704657_n_13302',
         'chr11_131736165_131740391_n_4093', 'chr20_36783901_36787136_n_11776'],
        dtype='object', length=171),
  'reduced_dataset':       chr16_85994517_85997209_n_7687  chr2_3

In [8]:
for i in results:
    print(i)
    print(f"Training Loss: {results[i]['train_loss']}")
    print(f"Validation Loss: {results[i]['val_loss']}")
    print('\n')

EncDim=5 L1value=1e-06
Training Loss: 0.04204738661646843
Validation Loss: 0.04626746922731399


EncDim=5 L1value=1e-05
Training Loss: 0.07013392895460129
Validation Loss: 0.07213355228304863


EncDim=5 L1value=0.0001
Training Loss: 0.09886766672134399
Validation Loss: 0.09906955808401108


EncDim=5 L1value=0.001
Training Loss: 0.106062513589859
Validation Loss: 0.10613162219524383


EncDim=5 L1value=0.01
Training Loss: 0.10603883266448974
Validation Loss: 0.10597103238105773


EncDim=10 L1value=1e-06
Training Loss: 0.03750603795051575
Validation Loss: 0.04391475021839142


EncDim=10 L1value=1e-05
Training Loss: 0.051674749702215195
Validation Loss: 0.054706064611673356


EncDim=10 L1value=0.0001
Training Loss: 0.09891100972890854
Validation Loss: 0.09884980469942092


EncDim=10 L1value=0.001
Training Loss: 0.10608667284250259
Validation Loss: 0.10604813992977143


EncDim=10 L1value=0.01
Training Loss: 0.10603626370429993
Validation Loss: 0.10595978945493698


EncDim=20 L1value=1e-06
T

In [10]:
all_loss_histories

{'EncDim=5 L1value=1e-06': (array([0.15393648, 0.1042318 , 0.09798113, 0.08539773, 0.07848006,
         0.07165575, 0.06638201, 0.06324643, 0.06086325, 0.05892287,
         0.05769101, 0.0565534 , 0.05560579, 0.05494611, 0.05421328,
         0.05353459, 0.05314449, 0.05255316, 0.05227755, 0.05188038,
         0.05163102, 0.05106128, 0.0508298 , 0.05061612, 0.05016169,
         0.05002362, 0.04971667, 0.04946263, 0.04917823, 0.04887372,
         0.04885867, 0.04870335, 0.04837318, 0.04802857, 0.04792997,
         0.0479123 , 0.04754253, 0.04729857, 0.04723345, 0.04714087,
         0.04700895, 0.0469832 , 0.04663874, 0.04646644, 0.04630832,
         0.0462205 , 0.04608575, 0.04613702, 0.04620341, 0.04602993,
         0.04583198, 0.04554809, 0.0455455 , 0.04523227, 0.04526787,
         0.04563089, 0.04537615, 0.04499817, 0.04494148, 0.04484205,
         0.04475423, 0.04460264, 0.04464785, 0.04436766, 0.04438452,
         0.04441081, 0.04419943, 0.04388903, 0.04381075, 0.04399755,
        