#### settings

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/test2

/content/drive/MyDrive/test2


In [None]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_selection import mutual_info_regression

from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from kerastuner import HyperModel, RandomSearch

  from kerastuner import HyperModel, RandomSearch


## 11.20 - res - standard scaler (method 1)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product
from sklearn.feature_selection import mutual_info_regression
from kerastuner import HyperModel, RandomSearch

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []


# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 8, Encoding dimension: 5, Neurons: 16, Test loss: 0.4799385964870453


Hidden layers: 6, Encoding dimension: 4, Neurons: 16, Test loss: 0.48775815963745117


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=21, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband


tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning1__0__0', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning2__0__0', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 29s]
val_loss: 0.4627886712551117

Best val_loss So Far: 0.3832147717475891
Total elapsed time: 00h 27m 28s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 42s]
val_loss: 0.9605245590209961

Best val_loss So Far: 0.410219669342041
Total elapsed time: 00h 47m 40s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 12, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0068'}

Test Loss: 0.3832147717475891
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step

Latent Feature Correlations:
[[ 1.          0.30797033  0.54945184  0.0214523   0.25289563]
 [ 0.30797033  1.          0.58627533  0.14808727  0.30258413]
 [ 0.54945184  0.58627533  1.          0.02314082  0.02965746]
 [ 0.0214523   0.14808727  0.02314082  1.         -0.34512566]
 [ 0.25289563  0.30258413  0.02965746 -0.34512566  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7954
Latent Dimension 2: 0.8117
Latent Dimension 3: 0.7506
Latent Dimension 4: 0.8516
Latent Dimension 5: 0.7908


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1989
Latent Dimension 2: 0.2029
Latent Dimension 3: 0.1876
Latent Dimension 4: 0.2129
Latent Dimension 5: 0.1977
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.19885344280211373,
 0.20292241596737712,
 0.18764130512713662,
 0.21288798019149158,
 0.197694855911881]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.19885344, 0.20292242, 0.18764131, 0.21288798, 0.19769486])

In [None]:
normalized_mi_scores

array([[0.07002899, 0.00512558, 0.01160491, 0.01829804, 0.0590255 ],
       [0.        , 0.02895349, 0.01594705, 0.03066119, 0.02398362],
       [0.03184994, 0.01501085, 0.01285349, 0.01868057, 0.08619057],
       [0.05063481, 0.07142526, 0.05096472, 0.0548023 , 0.09137574],
       [0.02449121, 0.02435147, 0.02120332, 0.05046647, 0.04469847],
       [0.03257792, 0.06110832, 0.02130724, 0.0336327 , 0.04084748],
       [0.01230042, 0.00709941, 0.02275787, 0.06576877, 0.02594564],
       [0.11578865, 0.07002974, 0.10147225, 0.0382213 , 0.00994035],
       [0.09259996, 0.11402428, 0.11376249, 0.08644002, 0.07510392],
       [0.04747113, 0.05511851, 0.0917666 , 0.03027567, 0.00827904],
       [0.03668028, 0.04834172, 0.03867838, 0.09723794, 0.03907018],
       [0.01205423, 0.03218533, 0.05426946, 0.0656523 , 0.01717193],
       [0.05172083, 0.03377742, 0.04068113, 0.01923531, 0.01101608],
       [0.02902567, 0.01327519, 0.03033655, 0.07552114, 0.01370998],
       [0.10427439, 0.09261184, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config1_1215_with_weights_res_ss.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 4, 'neurons': 18, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'relu', 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

Test Loss: 0.410219669342041
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step

Latent Feature Correlations:
[[ 1.          0.29696144 -0.1367382   0.52200028]
 [ 0.29696144  1.         -0.16077114  0.74088074]
 [-0.1367382  -0.16077114  1.         -0.21152383]
 [ 0.52200028  0.74088074 -0.21152383  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8363
Latent Dimension 2: 0.7287
Latent Dimension 3: 0.7776
Latent Dimension 4: 0.6574


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2788
Latent Dimension 2: 0.2429
Latent Dimension 3: 0.2592
Latent Dimension 4: 0.2191
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)  # Make sure this is normalized

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 4)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config2_1208_with_weights_res_ss.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## res - minmax scaler  (method 1)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []


# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_1_001_0_0', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_2_001_0_0', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 11s]
val_loss: 0.04261470586061478

Best val_loss So Far: 0.015086108818650246
Total elapsed time: 00h 11m 21s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 88 Complete [00h 00m 19s]
val_loss: 0.039806704968214035

Best val_loss So Far: 0.013620617799460888
Total elapsed time: 00h 14m 42s

Search: Running Trial #89

Value             |Best Value So Far |Hyperparameter
4                 |5                 |encoding_dim
12                |21                |neurons
0.0001            |0.01              |learning_rate
32                |32                |batch_size
relu              |tanh              |activation
50                |50                |tuner/epochs
0                 |17                |tuner/initial_epoch
0                 |3                 |tuner/bracket
0                 |3                 |tuner/round

Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 554ms/step - loss: 0.4000 - val_loss: 0.4131
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - loss: 0.3959 - val_loss: 0.4123
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - l

#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 12, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0049'}

Test Loss: 0.015086108818650246
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step

Latent Feature Correlations:
[[ 1.         -0.16989383 -0.40071922 -0.31207698 -0.08475396]
 [-0.16989383  1.          0.50516099  0.37197808 -0.05713703]
 [-0.40071922  0.50516099  1.          0.87811357 -0.23301824]
 [-0.31207698  0.37197808  0.87811357  1.         -0.11183346]
 [-0.08475396 -0.05713703 -0.23301824 -0.11183346  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")



[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 216ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7761
Latent Dimension 2: 0.8138
Latent Dimension 3: 0.7645
Latent Dimension 4: 0.8158
Latent Dimension 5: 0.8297


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1940
Latent Dimension 2: 0.2035
Latent Dimension 3: 0.1911
Latent Dimension 4: 0.2040
Latent Dimension 5: 0.2074
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.19403405904532117,
 0.20345870938985638,
 0.1911372112227145,
 0.2039520857023381,
 0.20741793463976987]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.19403406, 0.20345871, 0.19113721, 0.20395209, 0.20741793])

In [None]:
normalized_mi_scores

array([[0.05455457, 0.02020102, 0.02092671, 0.02279347, 0.07840452],
       [0.0094563 , 0.00288066, 0.01862264, 0.03112839, 0.03553254],
       [0.05116817, 0.02504367, 0.01868611, 0.01812341, 0.02813357],
       [0.13456826, 0.06264247, 0.03143379, 0.03352727, 0.03829521],
       [0.00183345, 0.04526236, 0.01010196, 0.00733088, 0.02467917],
       [0.01498898, 0.04531716, 0.04054242, 0.01223486, 0.03305241],
       [0.01799518, 0.01839382, 0.04540517, 0.0485257 , 0.03162676],
       [0.03910827, 0.        , 0.07709839, 0.13970399, 0.00541697],
       [0.07066447, 0.05032809, 0.13204969, 0.1717924 , 0.10998619],
       [0.04184259, 0.01340253, 0.03213333, 0.04228886, 0.06556817],
       [0.01984796, 0.03739627, 0.07937835, 0.05664472, 0.10804157],
       [0.02921321, 0.02480861, 0.07088309, 0.0564372 , 0.04079494],
       [0.04505863, 0.01419747, 0.03541828, 0.03484762, 0.02077405],
       [0.01396048, 0.02919417, 0.0515004 , 0.04382218, 0.03613565],
       [0.12917578, 0.01509085, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config1_1215_with_weights_res_mm.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}

Test Loss: 0.013620617799460888
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step

Latent Feature Correlations:
[[ 1.          0.33023841 -0.27611846 -0.39997989  0.13483466]
 [ 0.33023841  1.         -0.27150449 -0.67782445 -0.13697377]
 [-0.27611846 -0.27150449  1.          0.49294685 -0.21667334]
 [-0.39997989 -0.67782445  0.49294685  1.          0.05789381]
 [ 0.13483466 -0.13697377 -0.21667334  0.05789381  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7740
Latent Dimension 2: 0.8088
Latent Dimension 3: 0.7952
Latent Dimension 4: 0.8121
Latent Dimension 5: 0.8098


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1935
Latent Dimension 2: 0.2022
Latent Dimension 3: 0.1988
Latent Dimension 4: 0.2030
Latent Dimension 5: 0.2025
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config2_1215_with_weights_res_mm.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

##  mod - standard scaler (method 1)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []


# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 5, Neurons: 16, Test loss: 0.03019469417631626


Hidden layers: 8, Encoding dimension: 4, Neurons: 16, Test loss: 0.03101118840277195


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=21, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam__tun_1_00_001', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam__tuni2_00_001', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 14s]
val_loss: 0.12762096524238586

Best val_loss So Far: 0.12762096524238586
Total elapsed time: 00h 12m 36s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 20s]
val_loss: 0.5696251392364502

Best val_loss So Far: 0.1425689309835434
Total elapsed time: 00h 19m 47s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 15, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

Test Loss: 0.12762096524238586
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step

Latent Feature Correlations:
[[ 1.         -0.42522991 -0.55274599  0.71814241 -0.67488728]
 [-0.42522991  1.         -0.0051193  -0.51272477  0.24704451]
 [-0.55274599 -0.0051193   1.         -0.52047143  0.61996209]
 [ 0.71814241 -0.51272477 -0.52047143  1.         -0.56127025]
 [-0.67488728  0.24704451  0.61996209 -0.56127025  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8027
Latent Dimension 2: 0.8090
Latent Dimension 3: 0.7978
Latent Dimension 4: 0.7832
Latent Dimension 5: 0.8073


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

In [None]:
# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2007
Latent Dimension 2: 0.2023
Latent Dimension 3: 0.1995
Latent Dimension 4: 0.1958
Latent Dimension 5: 0.2018
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.20068282212358485,
 0.20225030018813622,
 0.19946112743781164,
 0.19579165287560096,
 0.2018140973748663]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.20068282, 0.2022503 , 0.19946113, 0.19579165, 0.2018141 ])

In [None]:
normalized_mi_scores

array([[0.04506836, 0.0442941 , 0.03821024, 0.05480771, 0.04213401],
       [0.03732239, 0.0443199 , 0.03801271, 0.03653718, 0.03931993],
       [0.03767802, 0.05149054, 0.03826431, 0.04143186, 0.03620406],
       [0.0621992 , 0.04103238, 0.04394444, 0.03890111, 0.03970796],
       [0.0565258 , 0.04171799, 0.03636303, 0.03285133, 0.04440906],
       [0.0380685 , 0.05298172, 0.04176086, 0.04295108, 0.04307252],
       [0.03529352, 0.04076955, 0.03538346, 0.04083153, 0.05097274],
       [0.04234658, 0.03983225, 0.03658004, 0.04786812, 0.04423753],
       [0.05817101, 0.04866389, 0.04843819, 0.04970414, 0.06481478],
       [0.05393561, 0.0419879 , 0.05222198, 0.04296094, 0.04250453],
       [0.0389426 , 0.04162132, 0.05199782, 0.04226989, 0.05834244],
       [0.00302001, 0.01051616, 0.01493002, 0.01740761, 0.02912733],
       [0.03808041, 0.04728073, 0.04326012, 0.05573281, 0.03475857],
       [0.037277  , 0.04498025, 0.04421609, 0.04141471, 0.04149851],
       [0.06231611, 0.04327458, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config1_1208_with_weights_mod_ss.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 3, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}

Test Loss: 0.1425689309835434
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step

Latent Feature Correlations:
[[ 1.          0.24885691  0.02348047]
 [ 0.24885691  1.         -0.54123424]
 [ 0.02348047 -0.54123424  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.6339
Latent Dimension 2: 0.7015
Latent Dimension 3: 0.6645


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.3170
Latent Dimension 2: 0.3508
Latent Dimension 3: 0.3323
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 3)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config2_1215_with_weights_mod_ss.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## mod - minmax scaler (method 1)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split the data into training and test sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []


# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 5, Neurons: 16, Test loss: 0.03019469417631626


Hidden layers: 8, Encoding dimension: 4, Neurons: 16, Test loss: 0.03101118840277195


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tun_1_01', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=2, hidden_layers_after=2),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuni_2_01', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 33s]
val_loss: 0.1942932903766632

Best val_loss So Far: 0.008733450435101986
Total elapsed time: 00h 32m 35s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 30s]
val_loss: 0.024620352312922478

Best val_loss So Far: 0.008665623143315315
Total elapsed time: 00h 33m 54s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}

Test Loss: 0.008733450435101986
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

Latent Feature Correlations:
[[ 1.          0.27341915  0.5457976  -0.61524119 -0.55066915]
 [ 0.27341915  1.         -0.1533339  -0.29539084 -0.0314978 ]
 [ 0.5457976  -0.1533339   1.         -0.48274557 -0.38345682]
 [-0.61524119 -0.29539084 -0.48274557  1.          0.8591797 ]
 [-0.55066915 -0.0314978  -0.38345682  0.8591797   1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7679
Latent Dimension 2: 0.7455
Latent Dimension 3: 0.7569
Latent Dimension 4: 0.8751
Latent Dimension 5: 0.8545


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1920
Latent Dimension 2: 0.1864
Latent Dimension 3: 0.1892
Latent Dimension 4: 0.2188
Latent Dimension 5: 0.2136
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.19197492579607112,
 0.1863806563451278,
 0.1892238559408974,
 0.21878465384800216,
 0.21363590806990151]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.19197493, 0.18638066, 0.18922386, 0.21878465, 0.21363591])

In [None]:
normalized_mi_scores

array([[0.04218445, 0.03078654, 0.08440457, 0.03912206, 0.04635877],
       [0.03773566, 0.03483614, 0.03606974, 0.04068296, 0.04374035],
       [0.04748639, 0.03079026, 0.04163986, 0.03885202, 0.04129998],
       [0.04901207, 0.03740175, 0.05388761, 0.04287349, 0.03937062],
       [0.03882233, 0.02890048, 0.03574005, 0.03554469, 0.05043972],
       [0.04205442, 0.04417397, 0.04279947, 0.04223007, 0.04296923],
       [0.04711199, 0.04431725, 0.0378048 , 0.0408701 , 0.0430245 ],
       [0.03438571, 0.05573018, 0.04352765, 0.04014563, 0.03666145],
       [0.05761794, 0.08619784, 0.05250345, 0.06101956, 0.04347579],
       [0.04581677, 0.04777077, 0.05362266, 0.03607465, 0.04587461],
       [0.05226176, 0.05667895, 0.054738  , 0.05831268, 0.043774  ],
       [0.01581146, 0.03530167, 0.01370284, 0.02208394, 0.01011167],
       [0.02638561, 0.04146324, 0.03683826, 0.03817221, 0.03910067],
       [0.04795623, 0.04528865, 0.0380058 , 0.04363853, 0.04156155],
       [0.04916894, 0.04200865, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config1_1208_with_weights_mod_mm.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 18, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}

Test Loss: 0.008665623143315315
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step

Latent Feature Correlations:
[[ 1.          0.90913048 -0.62404316 -0.25591497  0.40747206]
 [ 0.90913048  1.         -0.73806515 -0.43251889  0.26207923]
 [-0.62404316 -0.73806515  1.          0.50456251 -0.28659933]
 [-0.25591497 -0.43251889  0.50456251  1.          0.08879892]
 [ 0.40747206  0.26207923 -0.28659933  0.08879892  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8262
Latent Dimension 2: 0.8966
Latent Dimension 3: 0.7111
Latent Dimension 4: 0.7962
Latent Dimension 5: 0.7699


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2066
Latent Dimension 2: 0.2242
Latent Dimension 3: 0.1778
Latent Dimension 4: 0.1991
Latent Dimension 5: 0.1925
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)  # Make sure this is normalized

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_org_se_AUTOENCODER_best_auto_config2_1208_with_weights_mod_mm.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## res / only one encoding dimension - ss (method 2)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product
from sklearn.feature_selection import mutual_info_regression
from kerastuner import HyperModel, RandomSearch

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [6, 8, 10, 12, 14, 16]
encoding_dims_options = [1]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 8, Encoding dimension: 1, Neurons: 14, Test loss: 0.6968121528625488


Hidden layers: 6, Encoding dimension: 1, Neurons: 12, Test loss: 0.7262334227561951


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Choice('encoding_dim', [1]) # Only one encoding dimension
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning__1__o_n_l_y1', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning__2__o_n_ly_1', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 17s]
val_loss: 0.9786759614944458

Best val_loss So Far: 0.5522105693817139
Total elapsed time: 00h 20m 02s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 22s]
val_loss: 0.9045199155807495

Best val_loss So Far: 0.5610660910606384
Total elapsed time: 00h 19m 51s


#### check

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 15, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0067'}

Test Loss: 0.5522105693817139
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step

Latent Feature Correlations:
1.0


In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'relu', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0049'}

Test Loss: 0.3288326859474182
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step

Latent Feature Correlations:
1.0


#### summary

In [None]:
# Display the structure of the best model for configuration 1
print("Best Model Structure for Configuration 1:")
best_model_config_1.summary()

# Display the structure of the best model for configuration 2
print("\nBest Model Structure for Configuration 2:")
best_model_config_2.summary()

Best Model Structure for Configuration 1:



Best Model Structure for Configuration 2:


In [None]:
# Print details of each layer for the best model in configuration 1
print("Best Model for Configuration 1:")
for layer in best_model_config_1.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

# Print details of each layer for the best model in configuration 2
print("\nBest Model for Configuration 2:")
for layer in best_model_config_2.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

Best Model for Configuration 1:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 375
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: dense_encoder_4, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: bottleneck, Type: Dense, Output Shape: N/A, Parameters: 16
Layer: dense_decoder_1, Type: Dense, Output Shape: N/A, Parameters: 30
Layer: dense_decoder_2, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: dense_decoder_3, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: dense_decoder_4, Type: Dense, Output Shape: N/A, Parameters: 240
Layer: output_layer, Type: Dense, Output Shape: N/A, Parameters: 384

Best Model for Configuration 2:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 525
Layer: dens

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


In [None]:
bottleneck_output_1.shape

(197, 1)

In [None]:
bottleneck_output_2.shape

(197, 1)

In [None]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np
import pandas as pd

In [None]:
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.44647139, 0.2722489 , 0.26068733, 0.66430636, 0.22727116,
        0.3290235 , 0.34749443, 0.77091991, 1.11296172, 0.39529593,
        0.60572365, 0.47222822, 0.3359241 , 0.32804701, 0.62093041,
        0.72574801, 0.48289276, 0.10986328, 0.23489059, 0.75706334,
        0.61790683, 0.13459049, 0.20046701, 0.26493196])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,-1.599647,0.172556,-1.154431,-0.956162,-0.585631,0.832135,-3.356507,-0.410742,0.905227,0.011890,...,-1.017613,-0.148817,0.360666,0.042791,-0.379008,-0.068907,1.502159,-4.163087,-0.211794,-0.392761
1,-1.794071,-0.205150,-0.943991,-0.544680,-0.444571,-0.198315,-2.424866,-0.684253,1.049859,-0.032068,...,2.431666,-0.699047,-1.095301,-0.832000,-0.080621,0.054052,0.712911,-1.823570,-1.478565,-0.101165
2,-2.024714,-0.205150,0.150299,1.401948,0.973080,-0.858740,0.447692,1.942893,0.543645,-4.773193,...,2.760938,0.109103,0.360666,-0.752473,-0.407137,-0.271194,0.524487,0.250965,0.641962,0.310064
3,-0.390051,-1.338270,-0.060141,0.394344,-0.194190,0.063981,-1.687318,-1.098118,1.070521,0.143762,...,0.293883,0.143493,0.360666,-0.195788,-0.463947,-0.442543,-0.310978,-1.084408,-0.686137,-0.016347
4,-1.564282,-0.205150,-1.049211,-1.594488,-1.290930,1.661179,-0.406312,-0.633869,1.323629,-0.496761,...,-1.339430,-0.991356,-3.716043,-1.547737,1.602150,1.462129,0.609811,0.146367,0.478247,-0.232693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,-1.530454,0.172556,0.381784,-0.966713,-0.807800,-0.235786,0.797057,1.467848,-1.690415,0.834522,...,-0.556943,-0.441126,0.360666,0.679002,-0.751854,-0.576608,-0.158106,0.449702,1.028725,-0.284990
193,-1.825507,0.172556,-0.396846,-1.299064,-0.906542,-1.074198,0.719420,1.950091,-1.984846,1.010352,...,-0.741460,-0.870994,0.360666,0.679002,-0.727586,-0.561536,-0.111889,0.285832,-0.287298,-0.430713
194,-1.615023,0.927970,0.234476,-0.971989,-0.966492,-0.558973,0.797057,2.223602,-2.400666,1.054309,...,-0.912308,-0.578684,0.360666,0.679002,-0.793220,-0.607546,-0.670050,0.425296,0.753066,-0.367009
195,-1.992766,-0.582857,-0.165361,-1.030018,-0.977072,-1.434855,0.874694,2.119236,-2.039084,0.985233,...,-0.775629,-1.369639,0.360666,0.679002,-0.793220,-0.601993,-0.321644,0.320698,0.308336,-0.452352


In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config1_res_onedim_ss_1215.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


In [None]:
mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.45192428, 0.27543286, 0.35131272, 0.66897774, 0.26529562,
        0.26748981, 0.46601931, 0.77323608, 1.23567364, 0.41755628,
        0.81497429, 0.60826916, 0.3553879 , 0.35670747, 0.66300461,
        0.70071738, 0.4564292 , 0.06154863, 0.14696738, 0.7589443 ,
        0.57723572, 0.1882379 , 0.28577697, 0.26066358])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,-1.599647,0.172556,-1.154431,-0.956162,-0.585631,0.832135,-3.356507,-0.410742,0.905227,0.011890,...,-1.017613,-0.148817,0.360666,0.042791,-0.379008,-0.068907,1.502159,-4.163087,-0.211794,-0.439530
1,-1.794071,-0.205150,-0.943991,-0.544680,-0.444571,-0.198315,-2.424866,-0.684253,1.049859,-0.032068,...,2.431666,-0.699047,-1.095301,-0.832000,-0.080621,0.054052,0.712911,-1.823570,-1.478565,-0.116126
2,-2.024714,-0.205150,0.150299,1.401948,0.973080,-0.858740,0.447692,1.942893,0.543645,-4.773193,...,2.760938,0.109103,0.360666,-0.752473,-0.407137,-0.271194,0.524487,0.250965,0.641962,0.290670
3,-0.390051,-1.338270,-0.060141,0.394344,-0.194190,0.063981,-1.687318,-1.098118,1.070521,0.143762,...,0.293883,0.143493,0.360666,-0.195788,-0.463947,-0.442543,-0.310978,-1.084408,-0.686137,-0.019724
4,-1.564282,-0.205150,-1.049211,-1.594488,-1.290930,1.661179,-0.406312,-0.633869,1.323629,-0.496761,...,-1.339430,-0.991356,-3.716043,-1.547737,1.602150,1.462129,0.609811,0.146367,0.478247,-0.222960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,-1.530454,0.172556,0.381784,-0.966713,-0.807800,-0.235786,0.797057,1.467848,-1.690415,0.834522,...,-0.556943,-0.441126,0.360666,0.679002,-0.751854,-0.576608,-0.158106,0.449702,1.028725,-0.270772
193,-1.825507,0.172556,-0.396846,-1.299064,-0.906542,-1.074198,0.719420,1.950091,-1.984846,1.010352,...,-0.741460,-0.870994,0.360666,0.679002,-0.727586,-0.561536,-0.111889,0.285832,-0.287298,-0.413305
194,-1.615023,0.927970,0.234476,-0.971989,-0.966492,-0.558973,0.797057,2.223602,-2.400666,1.054309,...,-0.912308,-0.578684,0.360666,0.679002,-0.793220,-0.607546,-0.670050,0.425296,0.753066,-0.357296
195,-1.992766,-0.582857,-0.165361,-1.030018,-0.977072,-1.434855,0.874694,2.119236,-2.039084,0.985233,...,-0.775629,-1.369639,0.360666,0.679002,-0.793220,-0.601993,-0.321644,0.320698,0.308336,-0.428658


In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config2_res_onedim_ss_1215.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


## res / only one encoding dimension - minmax (method 2)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product
from sklearn.feature_selection import mutual_info_regression
from kerastuner import HyperModel, RandomSearch

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [6, 8, 10, 12, 14, 16]
encoding_dims_options = [1]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 1, Neurons: 12, Test loss: 0.03908932954072952


Hidden layers: 4, Encoding dimension: 1, Neurons: 14, Test loss: 0.03944898396730423


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Choice('encoding_dim', [1]) # 1D encoding dimension
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_1_only1___minma__x', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=2, hidden_layers_after=2),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_2_only1___minma__x', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 12s]
val_loss: 0.04257829114794731

Best val_loss So Far: 0.024443302303552628
Total elapsed time: 00h 09m 28s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 14s]
val_loss: 0.24080458283424377

Best val_loss So Far: 0.02636290155351162
Total elapsed time: 00h 10m 56s


#### check

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 18, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0067'}

Test Loss: 0.024443302303552628
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step

Latent Feature Correlations:
1.0


In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 12, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0068'}

Test Loss: 0.02636290155351162
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Latent Feature Correlations:
1.0


#### past

In [None]:
# best model for configuration 1
print("Best Model Structure for Configuration 1:")
best_model_config_1.summary()

# best model for configuration 2
print("\nBest Model Structure for Configuration 2:")
best_model_config_2.summary()

Best Model Structure for Configuration 1:



Best Model Structure for Configuration 2:


In [None]:
# the best model in configuration 1
print("Best Model for Configuration 1:")
for layer in best_model_config_1.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

# best model in configuration 2
print("\nBest Model for Configuration 2:")
for layer in best_model_config_2.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

Best Model for Configuration 1:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 450
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 342
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 342
Layer: bottleneck, Type: Dense, Output Shape: N/A, Parameters: 19
Layer: dense_decoder_1, Type: Dense, Output Shape: N/A, Parameters: 36
Layer: dense_decoder_2, Type: Dense, Output Shape: N/A, Parameters: 342
Layer: dense_decoder_3, Type: Dense, Output Shape: N/A, Parameters: 342
Layer: output_layer, Type: Dense, Output Shape: N/A, Parameters: 456

Best Model for Configuration 2:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 300
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 156
Layer: bottleneck, Type: Dense, Output Shape: N/A, Parameters: 13
Layer: dense_deco

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)



[1m1/7[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 120ms/step



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


In [None]:
bottleneck_output_1.shape

(197, 1)

In [None]:
bottleneck_output_2.shape

(197, 1)

In [None]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np
import pandas as pd

In [None]:
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.45808009, 0.16703029, 0.18016476, 0.52806386, 0.22005145,
        0.23444151, 0.43214195, 0.91923872, 1.48457639, 0.44000686,
        0.97619557, 0.75268892, 0.32334002, 0.37013004, 0.49771532,
        0.60622577, 0.41354815, 0.08761954, 0.15684266, 0.69981308,
        0.57170542, 0.07656843, 0.14610777, 0.1472826 ])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,0.131654,0.8750,0.120482,0.154095,0.182022,0.674157,0.192593,0.291705,0.877313,0.821121,...,0.090142,0.274691,1.000000,0.873016,0.097033,0.108458,0.846,0.409878,0.706241,0.402076
1,0.071436,0.8125,0.150602,0.238147,0.211985,0.449438,0.370370,0.222425,0.915696,0.813578,...,0.800026,0.175926,0.761905,0.698413,0.160974,0.132647,0.624,0.693119,0.472230,0.457849
2,0.000000,0.8125,0.307229,0.635776,0.513109,0.305414,0.918519,0.887876,0.781357,0.000000,...,0.867792,0.320988,1.000000,0.714286,0.091006,0.068664,0.571,0.944280,0.863955,0.516496
3,0.506297,0.6250,0.277108,0.429957,0.265169,0.506639,0.511111,0.117593,0.921179,0.843750,...,0.360056,0.327160,1.000000,0.825397,0.078832,0.034956,0.336,0.782609,0.618615,0.478956
4,0.142608,0.8125,0.135542,0.023707,0.032210,0.854954,0.755556,0.235187,0.988348,0.733836,...,0.023910,0.123457,0.333333,0.555556,0.521570,0.409644,0.595,0.931617,0.833712,0.436697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,0.153085,0.8750,0.340361,0.151940,0.134831,0.441267,0.985185,0.767548,0.188485,0.962284,...,0.184951,0.222222,1.000000,1.000000,0.017137,0.008583,0.379,0.968341,0.935401,0.413225
193,0.061700,0.8750,0.228916,0.084052,0.113858,0.258427,0.970370,0.889699,0.110350,0.992457,...,0.146976,0.145062,1.000000,1.000000,0.022338,0.011548,0.392,0.948501,0.692293,0.390596
194,0.126892,1.0000,0.319277,0.150862,0.101124,0.370787,0.985185,0.958979,0.000000,1.000000,...,0.111814,0.197531,1.000000,1.000000,0.008273,0.002497,0.235,0.965386,0.884479,0.392526
195,0.009895,0.7500,0.262048,0.139009,0.098876,0.179775,1.000000,0.932543,0.095956,0.988147,...,0.139944,0.055556,1.000000,1.000000,0.008273,0.003589,0.333,0.952723,0.802324,0.386895


In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config1_res_onedim_minmax_1214.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


In [None]:
##Bottleneckoutptu2
# Assuming X and bottleneck_output are defined
mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.30704494, 0.24233307, 0.15630951, 0.39416776, 0.17011601,
        0.29315746, 0.44271335, 0.83639234, 1.49782745, 0.4992256 ,
        0.88166262, 0.67910652, 0.30196234, 0.3594874 , 0.51056676,
        0.50107978, 0.34023062, 0.10819891, 0.22398809, 0.78150078,
        0.67750546, 0.18444399, 0.0884831 , 0.12193709])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,0.131654,0.8750,0.120482,0.154095,0.182022,0.674157,0.192593,0.291705,0.877313,0.821121,...,0.090142,0.274691,1.000000,0.873016,0.097033,0.108458,0.846,0.409878,0.706241,0.425867
1,0.071436,0.8125,0.150602,0.238147,0.211985,0.449438,0.370370,0.222425,0.915696,0.813578,...,0.800026,0.175926,0.761905,0.698413,0.160974,0.132647,0.624,0.693119,0.472230,0.470448
2,0.000000,0.8125,0.307229,0.635776,0.513109,0.305414,0.918519,0.887876,0.781357,0.000000,...,0.867792,0.320988,1.000000,0.714286,0.091006,0.068664,0.571,0.944280,0.863955,0.514145
3,0.506297,0.6250,0.277108,0.429957,0.265169,0.506639,0.511111,0.117593,0.921179,0.843750,...,0.360056,0.327160,1.000000,0.825397,0.078832,0.034956,0.336,0.782609,0.618615,0.481875
4,0.142608,0.8125,0.135542,0.023707,0.032210,0.854954,0.755556,0.235187,0.988348,0.733836,...,0.023910,0.123457,0.333333,0.555556,0.521570,0.409644,0.595,0.931617,0.833712,0.465191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,0.153085,0.8750,0.340361,0.151940,0.134831,0.441267,0.985185,0.767548,0.188485,0.962284,...,0.184951,0.222222,1.000000,1.000000,0.017137,0.008583,0.379,0.968341,0.935401,0.419950
193,0.061700,0.8750,0.228916,0.084052,0.113858,0.258427,0.970370,0.889699,0.110350,0.992457,...,0.146976,0.145062,1.000000,1.000000,0.022338,0.011548,0.392,0.948501,0.692293,0.399011
194,0.126892,1.0000,0.319277,0.150862,0.101124,0.370787,0.985185,0.958979,0.000000,1.000000,...,0.111814,0.197531,1.000000,1.000000,0.008273,0.002497,0.235,0.965386,0.884479,0.397563
195,0.009895,0.7500,0.262048,0.139009,0.098876,0.179775,1.000000,0.932543,0.095956,0.988147,...,0.139944,0.055556,1.000000,1.000000,0.008273,0.003589,0.333,0.952723,0.802324,0.393167


In [None]:
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config2_res_onedim_minmax_1214.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


## mod / only one encoding dimension (method 2)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product
from sklearn.feature_selection import mutual_info_regression
from kerastuner import HyperModel, RandomSearch

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [6, 8, 10, 12, 14, 16]
encoding_dims_options = [1]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 1, Neurons: 16, Test loss: 0.31799641251564026


Hidden layers: 8, Encoding dimension: 1, Neurons: 16, Test loss: 0.33045417070388794


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Choice('encoding_dim', [1]) # 1D encoding dimension
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_1__only1____mod', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_2__only1____mod', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 10s]
val_loss: 1.1033384799957275

Best val_loss So Far: 0.21279674768447876
Total elapsed time: 00h 09m 50s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 14s]
val_loss: 0.5080286860466003

Best val_loss So Far: 0.20329490303993225
Total elapsed time: 00h 13m 23s


#### check

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'relu', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}

Test Loss: 0.21279674768447876
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step

Latent Feature Correlations:
1.0


In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 12, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0069'}

Test Loss: 0.20329490303993225
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

Latent Feature Correlations:
1.0


In [None]:
# Display the structure of the best model for configuration 1
print("Best Model Structure for Configuration 1:")
best_model_config_1.summary()

# Display the structure of the best model for configuration 2
print("\nBest Model Structure for Configuration 2:")
best_model_config_2.summary()

Best Model Structure for Configuration 1:



Best Model Structure for Configuration 2:


#### past

In [None]:
# Print details of each layer for the best model in configuration 1
print("Best Model for Configuration 1:")
for layer in best_model_config_1.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

# Print details of each layer for the best model in configuration 2
print("\nBest Model for Configuration 2:")
for layer in best_model_config_2.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

Best Model for Configuration 1:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 525
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 462
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 462
Layer: bottleneck, Type: Dense, Output Shape: N/A, Parameters: 22
Layer: dense_decoder_1, Type: Dense, Output Shape: N/A, Parameters: 42
Layer: dense_decoder_2, Type: Dense, Output Shape: N/A, Parameters: 462
Layer: dense_decoder_3, Type: Dense, Output Shape: N/A, Parameters: 462
Layer: output_layer, Type: Dense, Output Shape: N/A, Parameters: 528

Best Model for Configuration 2:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 300
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 156
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 156
Layer: dens

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)



[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [None]:
bottleneck_output_1.shape

(262, 1)

In [None]:
bottleneck_output_2.shape

(262, 1)

In [None]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np
import pandas as pd

In [None]:
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.84952326, 0.73110931, 0.72687123, 1.14945   , 0.73138349,
        0.79717025, 0.71058062, 0.88761242, 1.13779121, 0.86185777,
        0.99076894, 0.41458153, 0.70845884, 0.78328307, 1.164205  ,
        1.12770526, 0.87781595, 0.60395874, 0.63380565, 0.89272986,
        0.73549511, 0.54699236, 0.60108138, 0.6055076 ])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,-0.698641,0.607407,-0.385408,-0.466819,-0.188799,1.028644,-1.095383,-0.021052,1.110346,0.534798,...,-0.612939,0.339178,0.679367,0.541898,-0.183930,0.093623,1.730565,-0.678899,0.405460,0.110698
1,-0.838198,0.448266,-0.233494,-0.105609,-0.059436,0.330437,-0.646493,-0.269981,1.217350,0.515143,...,2.607521,-0.059374,0.126247,0.117625,0.133974,0.229106,1.002373,0.014575,-0.245333,0.320261
2,-1.003754,0.448266,0.556455,1.603193,1.240668,-0.117050,0.737585,2.121046,0.842837,-1.604736,...,2.914949,0.526000,0.679367,0.156195,-0.213899,-0.129267,0.828525,0.629504,0.844069,0.706837
3,0.169605,-0.029156,0.404542,0.718691,0.170185,0.508162,-0.291122,-0.646650,1.232636,0.593761,...,0.611555,0.550909,0.679367,0.426187,-0.274424,-0.318069,0.057691,0.233676,0.161770,0.389243
4,-0.673256,0.448266,-0.309451,-1.027158,-0.835617,1.590383,0.326102,-0.224126,1.419893,0.307367,...,-0.913407,-0.271105,-0.869370,-0.229508,1.926811,1.780599,0.907248,0.598499,0.759962,0.126579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.250455
258,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.250455
259,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.250455
260,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.250455


In [None]:
# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config1_mod_onedim_ss_1214.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


In [None]:
mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.80419601, 0.75870985, 0.70746058, 1.00504558, 0.68500398,
        0.76005704, 0.79645461, 1.04300549, 1.34467144, 0.89908894,
        1.03654113, 0.49677047, 0.85969651, 0.76210342, 1.02600914,
        1.04302316, 0.85520862, 0.63570287, 0.75063914, 1.04950837,
        0.86517841, 0.54550157, 0.71169999, 0.71938289])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,-0.698641,0.607407,-0.385408,-0.466819,-0.188799,1.028644,-1.095383,-0.021052,1.110346,0.534798,...,-0.612939,0.339178,0.679367,0.541898,-0.183930,0.093623,1.730565,-0.678899,0.405460,0.129234
1,-0.838198,0.448266,-0.233494,-0.105609,-0.059436,0.330437,-0.646493,-0.269981,1.217350,0.515143,...,2.607521,-0.059374,0.126247,0.117625,0.133974,0.229106,1.002373,0.014575,-0.245333,0.319165
2,-1.003754,0.448266,0.556455,1.603193,1.240668,-0.117050,0.737585,2.121046,0.842837,-1.604736,...,2.914949,0.526000,0.679367,0.156195,-0.213899,-0.129267,0.828525,0.629504,0.844069,0.655169
3,0.169605,-0.029156,0.404542,0.718691,0.170185,0.508162,-0.291122,-0.646650,1.232636,0.593761,...,0.611555,0.550909,0.679367,0.426187,-0.274424,-0.318069,0.057691,0.233676,0.161770,0.372522
4,-0.673256,0.448266,-0.309451,-1.027158,-0.835617,1.590383,0.326102,-0.224126,1.419893,0.307367,...,-0.913407,-0.271105,-0.869370,-0.229508,1.926811,1.780599,0.907248,0.598499,0.759962,0.181098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.245670
258,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.245670
259,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.245670
260,-1.362580,-1.620562,-1.357653,-1.129038,-1.055535,-1.408733,-1.581681,-1.069174,-1.335454,-1.604736,...,-1.021877,-1.354669,-1.643739,-1.579469,-0.666369,-0.513863,-1.044439,-1.682421,-1.558624,-1.245670


In [None]:
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config2_mod_onedim_ss_1214.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


## mod / only one encoding dimension - minmax (method 2)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mutual_info_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import product
from sklearn.feature_selection import mutual_info_regression
from kerastuner import HyperModel, RandomSearch

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [6, 8, 10, 12, 14, 16]
encoding_dims_options = [1]
results = []


# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 1, Neurons: 14, Test loss: 0.024151155725121498


Hidden layers: 8, Encoding dimension: 1, Neurons: 10, Test loss: 0.02516746148467064


In [None]:
from kerastuner import Hyperband
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Choice('encoding_dim', [1]) # 1D encoding dimension
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)
    bottleneck = Dense(encoding_dim, activation=activation, name='bottleneck')(x)
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

In [None]:
from kerastuner.tuners import Hyperband

tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_1_only1__modm___m_mx', project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss', max_epochs=50, factor=3, directory='hyperparam_tuning_2_only1__modm___m_mx', project_name='model_config_2'
)

In [None]:
# Run the tuning for configuration 1
tuner_config_1.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 1
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]


Trial 90 Complete [00h 00m 20s]
val_loss: 0.11143441498279572

Best val_loss So Far: 0.019558779895305634
Total elapsed time: 00h 24m 13s


In [None]:
# Run the tuning for configuration 2
tuner_config_2.search(X_train, X_train, epochs=50, validation_data=(X_test, X_test))

# Get the best model for configuration 2
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

Trial 90 Complete [00h 00m 26s]
val_loss: 0.2485501766204834

Best val_loss So Far: 0.014420676045119762
Total elapsed time: 00h 27m 24s


#### check

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 6, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0074'}

Test Loss: 0.019558779895305634
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step

Latent Feature Correlations:
1.0


In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 1, 'neurons': 9, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}

Test Loss: 0.014420676045119762
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step

Latent Feature Correlations:
1.0


In [None]:
# Display the structure of the best model for configuration 1
print("Best Model Structure for Configuration 1:")
best_model_config_1.summary()

# Display the structure of the best model for configuration 2
print("\nBest Model Structure for Configuration 2:")
best_model_config_2.summary()

Best Model Structure for Configuration 1:



Best Model Structure for Configuration 2:


In [None]:
# Print details of each layer for the best model in configuration 1
print("Best Model for Configuration 1:")
for layer in best_model_config_1.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

# Print details of each layer for the best model in configuration 2
print("\nBest Model for Configuration 2:")
for layer in best_model_config_2.layers:
    if hasattr(layer, 'output_shape'):
        output_shape = layer.output_shape
    else:
        output_shape = layer.get_output_shape_at(0) if hasattr(layer, 'get_output_shape_at') else 'N/A'
    print(f"Layer: {layer.name}, Type: {layer.__class__.__name__}, Output Shape: {output_shape}, Parameters: {layer.count_params()}")

Best Model for Configuration 1:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 150
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 42
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 42
Layer: bottleneck, Type: Dense, Output Shape: N/A, Parameters: 7
Layer: dense_decoder_1, Type: Dense, Output Shape: N/A, Parameters: 12
Layer: dense_decoder_2, Type: Dense, Output Shape: N/A, Parameters: 42
Layer: dense_decoder_3, Type: Dense, Output Shape: N/A, Parameters: 42
Layer: output_layer, Type: Dense, Output Shape: N/A, Parameters: 168

Best Model for Configuration 2:
Layer: input_layer, Type: InputLayer, Output Shape: N/A, Parameters: 0
Layer: dense_encoder_1, Type: Dense, Output Shape: N/A, Parameters: 225
Layer: dense_encoder_2, Type: Dense, Output Shape: N/A, Parameters: 90
Layer: dense_encoder_3, Type: Dense, Output Shape: N/A, Parameters: 90
Layer: dense_encod

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


In [None]:
bottleneck_output_1.shape

(262, 1)

In [None]:
bottleneck_output_2.shape

(262, 1)

In [None]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np
import pandas as pd

In [None]:
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.62668436, 0.72630959, 0.6436562 , 0.78745763, 0.66476808,
        0.77796662, 0.85742204, 0.92875388, 1.41374823, 0.87191872,
        1.20623266, 0.61214267, 0.658503  , 0.77998096, 0.74869513,
        0.74466675, 0.76392034, 0.63487119, 0.72160316, 1.03331348,
        1.01346991, 0.53937368, 0.66744976, 0.67125691])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,0.248076,0.8750,0.179775,0.154095,0.197059,0.706532,0.192593,0.291705,0.877313,0.821121,...,0.090142,0.366577,1.000000,0.873016,0.097033,0.108458,0.846,0.409878,0.706241,0.458588
1,0.195931,0.8125,0.207865,0.238147,0.226471,0.504140,0.370370,0.222425,0.915696,0.813578,...,0.800026,0.280323,0.761905,0.698413,0.160974,0.132647,0.624,0.693119,0.472230,0.481695
2,0.134073,0.8125,0.353933,0.635776,0.522059,0.374425,0.918519,0.887876,0.781357,0.000000,...,0.867792,0.407008,1.000000,0.714286,0.091006,0.068664,0.571,0.944280,0.863955,0.553711
3,0.572489,0.6250,0.325843,0.429957,0.278676,0.555658,0.511111,0.117593,0.921179,0.843750,...,0.360056,0.412399,1.000000,0.825397,0.078832,0.034956,0.336,0.782609,0.618615,0.511829
4,0.257560,0.8125,0.193820,0.023707,0.050000,0.869365,0.755556,0.235187,0.988348,0.733836,...,0.023910,0.234501,0.333333,0.555556,0.521570,0.409644,0.595,0.931617,0.833712,0.481893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
258,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
259,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
260,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000


In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config1_mod_onedim_minmax_1215.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


In [None]:
mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)


In [None]:
mi_scores

[array([0.77212208, 0.69667059, 0.65517043, 0.87591643, 0.72094857,
        0.79021404, 0.83306905, 1.15837827, 1.69266304, 0.86424479,
        1.22445411, 0.6276208 , 0.72470217, 0.82754034, 0.95700406,
        0.94438236, 0.86803076, 0.63589015, 0.68953472, 1.07586588,
        0.91541699, 0.5601476 , 0.66470421, 0.6598256 ])]

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 1)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
df

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Final_Index
0,0.248076,0.8750,0.179775,0.154095,0.197059,0.706532,0.192593,0.291705,0.877313,0.821121,...,0.090142,0.366577,1.000000,0.873016,0.097033,0.108458,0.846,0.409878,0.706241,0.452091
1,0.195931,0.8125,0.207865,0.238147,0.226471,0.504140,0.370370,0.222425,0.915696,0.813578,...,0.800026,0.280323,0.761905,0.698413,0.160974,0.132647,0.624,0.693119,0.472230,0.481493
2,0.134073,0.8125,0.353933,0.635776,0.522059,0.374425,0.918519,0.887876,0.781357,0.000000,...,0.867792,0.407008,1.000000,0.714286,0.091006,0.068664,0.571,0.944280,0.863955,0.563761
3,0.572489,0.6250,0.325843,0.429957,0.278676,0.555658,0.511111,0.117593,0.921179,0.843750,...,0.360056,0.412399,1.000000,0.825397,0.078832,0.034956,0.336,0.782609,0.618615,0.510631
4,0.257560,0.8125,0.193820,0.023707,0.050000,0.869365,0.755556,0.235187,0.988348,0.733836,...,0.023910,0.234501,0.333333,0.555556,0.521570,0.409644,0.595,0.931617,0.833712,0.470268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
258,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
259,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
260,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000


In [None]:
# Export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)
df.to_excel("socioecon_AUTOENCODER_best_auto_config2_mod_onedim_minmax_1215.xlsx", index=False)

print("File 'socioecon_AUTOENCODER.xlsx' has been saved.")


File 'socioecon_AUTOENCODER.xlsx' has been saved.


## mod - different numbers of nerons with regularization techiniques (alpha tuned) - corr standard (method 3)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 4, Neurons: 16, Test loss: 0.17452606558799744


Hidden layers: 8, Encoding dimension: 2, Neurons: 16, Test loss: 0.17812329530715942


 add orthogonal regularization (w/ alpha tuned)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Custom layer for orthogonal regularization
class OrthogonalRegularization(Layer):
    def __init__(self, alpha=1e-2, **kwargs):
        super(OrthogonalRegularization, self).__init__(**kwargs)
        self.alpha = alpha

    def call(self, bottleneck_output):
        # Get the batch size
        batch_size = tf.cast(tf.shape(bottleneck_output)[0], tf.float32)

        # Normalize the bottleneck outputs
        normalized_output = bottleneck_output / tf.sqrt(batch_size)

        # Compute correlation matrix
        correlation = tf.matmul(
            tf.transpose(normalized_output),
            normalized_output
        )

        # Create identity matrix of the correct shape
        shape = tf.shape(correlation)[0]
        identity = tf.eye(shape)

        # Calculate loss (excluding diagonal elements)
        mask = tf.ones_like(correlation) - tf.eye(shape)
        loss = tf.reduce_sum(tf.square(correlation * mask))

        # Add loss to the layer
        self.add_loss(self.alpha * loss)

        return bottleneck_output


In [None]:
def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # alpha tuned
    alpha = hp.Choice('alpha', [1e-3, 1e-2, 1e-1, 1.0])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer

    # Encoder layers
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer with orthogonal regularization
    bottleneck = Dense(
        encoding_dim,
        activation=activation,
        kernel_constraint=tf.keras.constraints.UnitNorm(axis=0),
        name='bottleneck'
    )(x)

    # Apply orthogonal regularization
    bottleneck = OrthogonalRegularization(alpha=alpha)(bottleneck) #modified alpha for tuning

    # Decoder layers
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)

    output_layer = Dense(input_dim, name='output_layer')(x)

    # Create and compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse'  # Using standard MSE loss now
    )

    return model

In [None]:
from kerastuner.tuners import Hyperband
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=1e-4
    )
]


In [None]:
# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_1_ortho_al',
    project_name='model_config_1'
)

# Define search space for each configuration
tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_2_ortho_al',
    project_name='model_config_2',
    overwrite=True
)

In [None]:
# Run the tuning
tuner_config_1.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 21s]
val_loss: 0.6056467294692993

Best val_loss So Far: 0.12343984097242355
Total elapsed time: 00h 14m 50s


In [None]:
# Run the tuning
tuner_config_2.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 23s]
val_loss: 0.1205180212855339

Best val_loss So Far: 0.1205180212855339
Total elapsed time: 00h 21m 10s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 15, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'alpha': 0.001, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}

Test Loss: 0.12343984097242355
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

Latent Feature Correlations:
[[ 1.          0.01563735  0.08845119  0.47797774  0.23262205]
 [ 0.01563735  1.          0.00298947  0.36033671 -0.02047094]
 [ 0.08845119  0.00298947  1.          0.46411566  0.40291995]
 [ 0.47797774  0.36033671  0.46411566  1.          0.39900081]
 [ 0.23262205 -0.02047094  0.40291995  0.39900081  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7715
Latent Dimension 2: 0.8025
Latent Dimension 3: 0.8015
Latent Dimension 4: 0.8500
Latent Dimension 5: 0.7745


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1929
Latent Dimension 2: 0.2006
Latent Dimension 3: 0.2004
Latent Dimension 4: 0.2125
Latent Dimension 5: 0.1936
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.19287748087616946,
 0.2006155725229849,
 0.20037139989913172,
 0.21249984188748264,
 0.19363570481423134]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.19287748, 0.20061557, 0.2003714 , 0.21249984, 0.1936357 ])

In [None]:
normalized_mi_scores

array([[0.04117537, 0.03661376, 0.04762047, 0.05211858, 0.04430519],
       [0.0374096 , 0.03951889, 0.03817171, 0.04215191, 0.04252005],
       [0.0376609 , 0.03813338, 0.04456891, 0.04262956, 0.04319346],
       [0.0388838 , 0.04136329, 0.06207811, 0.04506302, 0.0465928 ],
       [0.03731962, 0.03425157, 0.04977827, 0.04231396, 0.07552758],
       [0.03806196, 0.0471533 , 0.03949431, 0.04218331, 0.04390402],
       [0.03983413, 0.03670959, 0.04209999, 0.04360557, 0.03874607],
       [0.07326825, 0.03882458, 0.03642964, 0.04425371, 0.03839183],
       [0.07412343, 0.04385386, 0.0541308 , 0.04619805, 0.03857748],
       [0.04199613, 0.05598852, 0.0491157 , 0.04860385, 0.04425008],
       [0.04486089, 0.05391211, 0.05265754, 0.0432962 , 0.04089556],
       [0.01497383, 0.01761521, 0.0171469 , 0.01323896, 0.01853321],
       [0.04410983, 0.03629658, 0.0332491 , 0.04987715, 0.03840498],
       [0.04115252, 0.04092669, 0.03956933, 0.0399911 , 0.0433269 ],
       [0.05142686, 0.04841149, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config1_1207_with_weights_mod_ss.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 15, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'alpha': 0.1, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

Test Loss: 0.1205180212855339
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step

Latent Feature Correlations:
[[ 1.         -0.47074008 -0.08920948 -0.08574593 -0.23235335]
 [-0.47074008  1.         -0.28753228 -0.02662083  0.47915895]
 [-0.08920948 -0.28753228  1.         -0.26569227 -0.06627331]
 [-0.08574593 -0.02662083 -0.26569227  1.          0.02860696]
 [-0.23235335  0.47915895 -0.06627331  0.02860696  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8620
Latent Dimension 2: 0.8399
Latent Dimension 3: 0.7687
Latent Dimension 4: 0.7589
Latent Dimension 5: 0.7705


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2155
Latent Dimension 2: 0.2100
Latent Dimension 3: 0.1922
Latent Dimension 4: 0.1897
Latent Dimension 5: 0.1926
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)  # Make sure this is normalized

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config2_1207_with_weights_mod_ss.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## mod - different numbers of nerons with regularization techiniques (alpha tuned) - corr MinMax (method 3)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_mod.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 6, Encoding dimension: 5, Neurons: 16, Test loss: 0.01743723452091217


Hidden layers: 8, Encoding dimension: 3, Neurons: 16, Test loss: 0.01776418834924698


 add orthogonal regularization (w/ alpha tuned)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Custom layer for orthogonal regularization
class OrthogonalRegularization(Layer):
    def __init__(self, alpha=1e-2, **kwargs):
        super(OrthogonalRegularization, self).__init__(**kwargs)
        self.alpha = alpha

    def call(self, bottleneck_output):
        # Get the batch size
        batch_size = tf.cast(tf.shape(bottleneck_output)[0], tf.float32)

        # Normalize the bottleneck outputs
        normalized_output = bottleneck_output / tf.sqrt(batch_size)

        # Compute correlation matrix
        correlation = tf.matmul(
            tf.transpose(normalized_output),
            normalized_output
        )

        # Create identity matrix of the correct shape
        shape = tf.shape(correlation)[0]
        identity = tf.eye(shape)

        # Calculate loss (excluding diagonal elements)
        mask = tf.ones_like(correlation) - tf.eye(shape)
        loss = tf.reduce_sum(tf.square(correlation * mask))

        # Add loss to the layer
        self.add_loss(self.alpha * loss)

        return bottleneck_output


In [None]:
def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # alpha tuned
    alpha = hp.Choice('alpha', [1e-3, 1e-2, 1e-1, 1.0])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer

    # Encoder layers
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer with orthogonal regularization
    bottleneck = Dense(
        encoding_dim,
        activation=activation,
        kernel_constraint=tf.keras.constraints.UnitNorm(axis=0),
        name='bottleneck'
    )(x)

    # Apply orthogonal regularization
    bottleneck = OrthogonalRegularization(alpha=alpha)(bottleneck) #modified alpha for tunin

    # Decoder layers
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)

    output_layer = Dense(input_dim, name='output_layer')(x)

    # Create and compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse'  # Using standard MSE loss now
    )

    return model

In [None]:
from kerastuner.tuners import Hyperband
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=1e-4
    )
]


In [None]:
# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_1_ortho_alp',
    project_name='model_config_1'
)

# Define search space for each configuration
tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=4, hidden_layers_after=4),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_2_ortho_alp',
    project_name='model_config_2',
    overwrite=True
)

In [None]:
# Run the tuning
tuner_config_1.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 27s]
val_loss: 0.028448501601815224

Best val_loss So Far: 0.009290081448853016
Total elapsed time: 00h 27m 32s


In [None]:
# Run the tuning
tuner_config_2.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 29s]
val_loss: 0.009475469589233398

Best val_loss So Far: 0.009475469589233398
Total elapsed time: 00h 32m 58s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 18, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'alpha': 0.01, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0076'}

Test Loss: 0.009290081448853016
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step

Latent Feature Correlations:
[[ 1.         -0.21559774 -0.37336696  0.36021264 -0.23835218]
 [-0.21559774  1.          0.17705809  0.10282089 -0.42454685]
 [-0.37336696  0.17705809  1.          0.01535451  0.15280912]
 [ 0.36021264  0.10282089  0.01535451  1.         -0.25548381]
 [-0.23835218 -0.42454685  0.15280912 -0.25548381  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7589
Latent Dimension 2: 0.7655
Latent Dimension 3: 0.9235
Latent Dimension 4: 0.7744
Latent Dimension 5: 0.7777


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1897
Latent Dimension 2: 0.1914
Latent Dimension 3: 0.2309
Latent Dimension 4: 0.1936
Latent Dimension 5: 0.1944
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.1897325750513584,
 0.19138099932297334,
 0.23086371752359017,
 0.19359327271286708,
 0.194429435389211]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.18973258, 0.191381  , 0.23086372, 0.19359327, 0.19442944])

In [None]:
normalized_mi_scores

array([[0.04403813, 0.05912023, 0.03839699, 0.04514464, 0.04380568],
       [0.04136017, 0.03875533, 0.03831339, 0.04681214, 0.03465181],
       [0.04543249, 0.04364298, 0.0355126 , 0.03563134, 0.04264342],
       [0.04026794, 0.05259841, 0.06006437, 0.03846253, 0.04155085],
       [0.04861087, 0.04362248, 0.04052443, 0.03941416, 0.06098905],
       [0.04024098, 0.04166316, 0.03992578, 0.04416311, 0.04214621],
       [0.04879535, 0.04532305, 0.0370296 , 0.04556244, 0.04750156],
       [0.04389576, 0.04184179, 0.04172719, 0.06047009, 0.03881507],
       [0.05056276, 0.05635183, 0.0457511 , 0.05823634, 0.05125694],
       [0.04952685, 0.04300422, 0.03962841, 0.04086586, 0.04700478],
       [0.04580567, 0.05581204, 0.04118017, 0.05232595, 0.04757477],
       [0.02835891, 0.01857804, 0.01302364, 0.0182495 , 0.01097345],
       [0.03890976, 0.03966699, 0.04982663, 0.03954378, 0.03289627],
       [0.04352458, 0.03814445, 0.03879363, 0.04595992, 0.04226888],
       [0.03653422, 0.03912507, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config1_1208_with_weights_mod_mm.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 4, 'neurons': 21, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'alpha': 0.001, 'tuner/epochs': 50, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

Test Loss: 0.009475469589233398
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 229ms/step

Latent Feature Correlations:
[[ 1.         -0.29614563  0.59267155 -0.29757324]
 [-0.29614563  1.         -0.02364822 -0.20476406]
 [ 0.59267155 -0.02364822  1.         -0.42211233]
 [-0.29757324 -0.20476406 -0.42211233  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step  
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7162
Latent Dimension 2: 0.6938
Latent Dimension 3: 0.8896
Latent Dimension 4: 0.7003


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2387
Latent Dimension 2: 0.2313
Latent Dimension 3: 0.2965
Latent Dimension 4: 0.2334
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)  # Make sure this is normalized

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 4)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config2_1208_with_weights_mod_mm.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## res - different numbers of nerons with regularization techiniques (alpha tuned) - corr standard (method 3)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = StandardScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations for reference and show model summary
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 4, Encoding dimension: 3, Neurons: 16, Test loss: 0.4787726402282715


Hidden layers: 6, Encoding dimension: 5, Neurons: 16, Test loss: 0.497527539730072


add orthogonal regularization (w/ alpha tuned)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Custom layer for orthogonal regularization
class OrthogonalRegularization(Layer):
    def __init__(self, alpha=1e-2, **kwargs):
        super(OrthogonalRegularization, self).__init__(**kwargs)
        self.alpha = alpha

    def call(self, bottleneck_output):
        # Get the batch size
        batch_size = tf.cast(tf.shape(bottleneck_output)[0], tf.float32)

        # Normalize the bottleneck outputs
        normalized_output = bottleneck_output / tf.sqrt(batch_size)

        # Compute correlation matrix
        correlation = tf.matmul(
            tf.transpose(normalized_output),
            normalized_output
        )

        # Create identity matrix of the correct shape
        shape = tf.shape(correlation)[0]
        identity = tf.eye(shape)

        # Calculate loss (excluding diagonal elements)
        mask = tf.ones_like(correlation) - tf.eye(shape)
        loss = tf.reduce_sum(tf.square(correlation * mask))

        # Add loss to the layer
        self.add_loss(self.alpha * loss)

        return bottleneck_output


In [None]:
def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # alpha tuned
    alpha = hp.Choice('alpha', [1e-3, 1e-2, 1e-1, 1.0])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer

    # Encoder layers
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer with orthogonal regularization
    bottleneck = Dense(
        encoding_dim,
        activation=activation,
        kernel_constraint=tf.keras.constraints.UnitNorm(axis=0),
        name='bottleneck'
    )(x)

    # Apply orthogonal regularization
    bottleneck = OrthogonalRegularization(alpha=alpha)(bottleneck) #modified alpha for tunin

    # Decoder layers
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)

    output_layer = Dense(input_dim, name='output_layer')(x)

    # Create and compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse'  # Using standard MSE loss
    )

    return model

In [None]:
from kerastuner.tuners import Hyperband
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=1e-4
    )
]


In [None]:
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=2, hidden_layers_after=2),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_1_ort_ho_aaal',
    project_name='model_config_1'
)

tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=3, hidden_layers_after=3),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_2_ort_ho_aaal',
    project_name='model_config_2',
    overwrite=True
)

In [None]:
# Run the tuning
tuner_config_1.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

In [None]:
# Run the tuning
tuner_config_2.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 22s]
val_loss: 0.48952168226242065

Best val_loss So Far: 0.36945563554763794
Total elapsed time: 00h 22m 24s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 4, 'neurons': 9, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'alpha': 0.001, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0049'}

Test Loss: 0.42116695642471313
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Latent Feature Correlations:
[[ 1.          0.19574385  0.34091679 -0.25850344]
 [ 0.19574385  1.         -0.01460667 -0.13430074]
 [ 0.34091679 -0.01460667  1.          0.08144593]
 [-0.25850344 -0.13430074  0.08144593  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 54ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7957
Latent Dimension 2: 0.7104
Latent Dimension 3: 0.7155
Latent Dimension 4: 0.7784


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2652
Latent Dimension 2: 0.2368
Latent Dimension 3: 0.2385
Latent Dimension 4: 0.2595
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.26524618572247743,
 0.2367890555818817,
 0.23848427740582467,
 0.25948048128981616]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 4)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.26524619, 0.23678906, 0.23848428, 0.25948048])

In [None]:
normalized_mi_scores

array([[0.01515752, 0.04344681, 0.10705445, 0.04080098],
       [0.03881546, 0.00881835, 0.01268705, 0.02942444],
       [0.02404602, 0.00827871, 0.01085959, 0.02392484],
       [0.05136616, 0.01921884, 0.02615977, 0.04440986],
       [0.01941352, 0.00858951, 0.01233614, 0.0486103 ],
       [0.05409721, 0.01793388, 0.        , 0.04150236],
       [0.00912314, 0.0764245 , 0.01076848, 0.06065779],
       [0.02273975, 0.10227275, 0.09987156, 0.03804776],
       [0.07799483, 0.11171129, 0.03973089, 0.08412775],
       [0.07283676, 0.07978642, 0.0315235 , 0.03583767],
       [0.06033529, 0.04812358, 0.03085356, 0.08893628],
       [0.05010548, 0.05428267, 0.        , 0.05592727],
       [0.03940379, 0.02494944, 0.08412399, 0.01787343],
       [0.03112605, 0.02603981, 0.04735304, 0.05508552],
       [0.05605788, 0.03219124, 0.05254708, 0.01771771],
       [0.04733839, 0.05114924, 0.05850563, 0.03035254],
       [0.03114441, 0.01501434, 0.09180902, 0.04365745],
       [0.02593845, 0.02865191,

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config1_1208_with_weights_res_ss.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 18, 'learning_rate': 0.01, 'batch_size': 16, 'activation': 'tanh', 'alpha': 0.01, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0067'}

Test Loss: 0.36945563554763794
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step

Latent Feature Correlations:
[[ 1.         -0.05741718 -0.23807442  0.08793147 -0.03243081]
 [-0.05741718  1.          0.2016903  -0.14877315  0.16399051]
 [-0.23807442  0.2016903   1.         -0.52844769  0.21047439]
 [ 0.08793147 -0.14877315 -0.52844769  1.         -0.11207877]
 [-0.03243081  0.16399051  0.21047439 -0.11207877  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.7905
Latent Dimension 2: 0.8653
Latent Dimension 3: 0.7955
Latent Dimension 4: 0.7739
Latent Dimension 5: 0.7747


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.1976
Latent Dimension 2: 0.2163
Latent Dimension 3: 0.1989
Latent Dimension 4: 0.1935
Latent Dimension 5: 0.1937
Sum of Normalized Contributions: 1.0


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


In [None]:
latent_contributions = np.array(normalized_contributions)

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Export to Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config2_1215_with_weights_res_ss.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()

## res - different numbers of nerons with regularization techiniques (alpha tuned) - corr minmax (method 3)

In [None]:
data = pd.read_excel('SE_Original_(NOT SHARED) FOR USE_2020_residential.xlsx')
X_ori = data.iloc[:, 3:27].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X_ori)

# Split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def create_autoencoder(input_dim, encoding_dim, hidden_layers_before, hidden_layers_after, neurons_before, neurons_after):
    # Define the input layer
    input_layer = Input(shape=(input_dim,), name='input_layer')

    # Build the encoder part
    x = input_layer
    for i in range(hidden_layers_before):
        x = Dense(neurons_before, activation='relu', name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer (encoding layer)
    bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(x)

    # Build the decoder part
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons_after, activation='relu', name=f'dense_decoder_{i+1}')(x)

    # Output layer, with the same dimension as the input
    output_layer = Dense(input_dim, name='output_layer')(x)

    # Define the complete autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')

    return autoencoder

In [None]:
# Initial tuning
hidden_layers_options = [2, 4, 6, 8]  # Even numbers to ensure symmetry
neurons_options = [8, 12, 16]
encoding_dims_options = [1, 2, 3, 4, 5]
results = []

# Iterate through each combination of hyperparameters
for hidden_layers in hidden_layers_options:
    hidden_layers_before = hidden_layers // 2
    hidden_layers_after = hidden_layers // 2

    for neurons in neurons_options:
        for encoding_dim in encoding_dims_options:
            autoencoder = create_autoencoder(
                input_dim=X_train.shape[1],
                encoding_dim=encoding_dim,
                hidden_layers_before=hidden_layers_before,
                hidden_layers_after=hidden_layers_after,
                neurons_before=neurons,
                neurons_after=neurons
            )
            autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')
            history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0, validation_data=(X_test, X_test))

            # Calculate average loss on the test set
            test_loss = autoencoder.evaluate(X_test, X_test, verbose=0)
            results.append((hidden_layers, encoding_dim, neurons, test_loss))

# Sort results by the lowest test loss and select the best two configurations
sorted_results = sorted(results, key=lambda x: x[3])
best_configs = sorted_results[:2]

# Print best configurations
print("Top two configurations:")
for config in best_configs:
    hidden_layers, encoding_dim, neurons, test_loss = config
    print(f"Hidden layers: {hidden_layers}, Encoding dimension: {encoding_dim}, Neurons: {neurons}, Test loss: {test_loss}")

    # Recreate and display the model summary for the best configurations
    autoencoder = create_autoencoder(
        input_dim=X_train.shape[1],
        encoding_dim=encoding_dim,
        hidden_layers_before=hidden_layers // 2,
        hidden_layers_after=hidden_layers // 2,
        neurons_before=neurons,
        neurons_after=neurons
    )
    autoencoder.summary()

Top two configurations:
Hidden layers: 4, Encoding dimension: 5, Neurons: 16, Test loss: 0.02714536152780056


Hidden layers: 2, Encoding dimension: 3, Neurons: 16, Test loss: 0.03079284355044365


add orthogonal regularization (w/ alpha tuned)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Custom layer for orthogonal regularization
class OrthogonalRegularization(Layer):
    def __init__(self, alpha=1e-2, **kwargs):
        super(OrthogonalRegularization, self).__init__(**kwargs)
        self.alpha = alpha

    def call(self, bottleneck_output):
        # Get the batch size
        batch_size = tf.cast(tf.shape(bottleneck_output)[0], tf.float32)

        # Normalize the bottleneck outputs
        normalized_output = bottleneck_output / tf.sqrt(batch_size)

        # Compute correlation matrix
        correlation = tf.matmul(
            tf.transpose(normalized_output),
            normalized_output
        )

        # Create identity matrix of the correct shape
        shape = tf.shape(correlation)[0]
        identity = tf.eye(shape)

        # Calculate loss (excluding diagonal elements)
        mask = tf.ones_like(correlation) - tf.eye(shape)
        loss = tf.reduce_sum(tf.square(correlation * mask))

        # Add loss to the layer
        self.add_loss(self.alpha * loss)

        return bottleneck_output


In [None]:
def build_model(hp, input_dim, hidden_layers_before, hidden_layers_after):
    # Hyperparameters to tune
    encoding_dim = hp.Int('encoding_dim', min_value=2, max_value=5, step=1)
    neurons = hp.Int('neurons', min_value=6, max_value=24, step=3)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    batch_size = hp.Choice('batch_size', [16, 32, 64])
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # alpha tuned
    alpha = hp.Choice('alpha', [1e-3, 1e-2, 1e-1, 1.0])

    # Build the model structure
    input_layer = Input(shape=(input_dim,), name='input_layer')
    x = input_layer

    # Encoder layers
    for i in range(hidden_layers_before):
        x = Dense(neurons, activation=activation, name=f'dense_encoder_{i+1}')(x)

    # Bottleneck layer with orthogonal regularization
    bottleneck = Dense(
        encoding_dim,
        activation=activation,
        kernel_constraint=tf.keras.constraints.UnitNorm(axis=0),
        name='bottleneck'
    )(x)

    # Apply orthogonal regularization
    bottleneck = OrthogonalRegularization(alpha=alpha)(bottleneck) #modified alpha for tunin

    # Decoder layers
    x = bottleneck
    for i in range(hidden_layers_after):
        x = Dense(neurons, activation=activation, name=f'dense_decoder_{i+1}')(x)

    output_layer = Dense(input_dim, name='output_layer')(x)

    # Create and compile model
    model = Model(inputs=input_layer, outputs=output_layer, name='autoencoder_model')
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse'  # Using standard MSE loss now
    )

    return model

In [None]:
from kerastuner.tuners import Hyperband
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        min_delta=1e-4
    )
]


In [None]:
# Define search space for each configuration
tuner_config_1 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=2, hidden_layers_after=2),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_1_ortho_aaaal',
    project_name='model_config_1'
)

# Define search space for each configuration
tuner_config_2 = Hyperband(
    lambda hp: build_model(hp, input_dim=24, hidden_layers_before=1, hidden_layers_after=1),
    objective='val_loss',
    max_epochs=50,
    factor=3,
    directory='hyperpara_tuning_2_ortho_aaaal',
    project_name='model_config_2',
    overwrite=True
)

In [None]:
# Run the tuning
tuner_config_1.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 27s]
val_loss: 0.014893291518092155

Best val_loss So Far: 0.013995358720421791
Total elapsed time: 00h 35m 36s


In [None]:
# Run the tuning
tuner_config_2.search(
    X_train,
    X_train,
    epochs=50,
    validation_data=(X_test, X_test),
    callbacks=callbacks,
    batch_size=32,
    verbose=1
)

Trial 90 Complete [00h 00m 27s]
val_loss: 0.28906935453414917

Best val_loss So Far: 0.01418523769825697
Total elapsed time: 00h 30m 09s


#### regularization

In [None]:
# Get the best model
best_model_config_1 = tuner_config_1.get_best_models(num_models=1)[0]
best_hp_config_1 = tuner_config_1.get_best_hyperparameters(num_trials=1)[0]

In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_1.values)

# Evaluate final model
evaluation = best_model_config_1.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_1.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 15, 'learning_rate': 0.01, 'batch_size': 32, 'activation': 'tanh', 'alpha': 0.01, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0074'}

Test Loss: 0.013995358720421791
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step

Latent Feature Correlations:
[[ 1.         -0.46536019 -0.17613831  0.386343    0.1593035 ]
 [-0.46536019  1.          0.12871877  0.16985815 -0.01367104]
 [-0.17613831  0.12871877  1.         -0.13108603 -0.71509113]
 [ 0.386343    0.16985815 -0.13108603  1.          0.02495951]
 [ 0.1593035  -0.01367104 -0.71509113  0.02495951  1.        ]]


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_1.input, outputs=best_model_config_1.get_layer('bottleneck').output)

# Create a decoder model
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_1.layers if 'dense_decoder' in l.name]:
    x = best_model_config_1.get_layer(layer_name)(x)
decoder_output = best_model_config_1.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)


In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
#calculate bottleneck contributions
def calculate_bottleneck_contributions(autoencoder, encoder, decoder, X):
    bottleneck_output = encoder.predict(X)
    contributions = []

    for dim in range(bottleneck_output.shape[1]):
        # Isolate one dimension at a time
        isolated_latent = np.zeros_like(bottleneck_output)
        isolated_latent[:, dim] = bottleneck_output[:, dim]

        # Reconstruct input using only the isolated latent dimension
        reconstructed = decoder.predict(isolated_latent)

        # Compute reconstruction loss (e.g., MSE)
        loss = mean_squared_error(X, reconstructed)
        contributions.append(loss)

    # Normalize contributions
    contributions = np.array(contributions)
    normalized_contributions = 1 - (contributions / np.sum(contributions))

    return normalized_contributions

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8069
Latent Dimension 2: 0.7815
Latent Dimension 3: 0.7851
Latent Dimension 4: 0.7891
Latent Dimension 5: 0.8374


In [None]:
# Normalize the contributions so they sum to 1
def normalize_contributions(contributions):
    total = sum(contributions)
    return [c / total for c in contributions]

# Calculate contributions
#contributions = calculate_bottleneck_contributions(best_model_config_1, encoder, decoder, X_train)

# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify they sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2017
Latent Dimension 2: 0.1954
Latent Dimension 3: 0.1963
Latent Dimension 4: 0.1973
Latent Dimension 5: 0.2093
Sum of Normalized Contributions: 1.0


In [None]:
normalized_contributions

[0.2017209275480649,
 0.1953778954574629,
 0.19627121162009706,
 0.19728387606967646,
 0.20934608930469867]

In [None]:
bottleneck_layer_config_1 = best_model_config_1.get_layer('bottleneck').output
encoder_model_config_1 = Model(inputs=best_model_config_1.input, outputs=bottleneck_layer_config_1)
bottleneck_output_1 = encoder_model_config_1.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [None]:
latent_contributions = np.array(normalized_contributions)

# Calculate MI scores (as you have done)
mi_scores = []
for i in range(bottleneck_output_1.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_1[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_1.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_1.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
latent_contributions

array([0.20172093, 0.1953779 , 0.19627121, 0.19728388, 0.20934609])

In [None]:
normalized_mi_scores

array([[0.01061809, 0.04500234, 0.1841634 , 0.02868507, 0.00264859],
       [0.034778  , 0.02973155, 0.0175726 , 0.0091104 , 0.01922639],
       [0.02159798, 0.        , 0.02811489, 0.0065976 , 0.04262017],
       [0.05439743, 0.04675682, 0.08121578, 0.03555676, 0.04973733],
       [0.01632044, 0.05920884, 0.03681562, 0.02004051, 0.0769772 ],
       [0.03846599, 0.05177864, 0.01978123, 0.02037407, 0.01624436],
       [0.02700845, 0.0138545 , 0.08366913, 0.01874064, 0.10841944],
       [0.04339917, 0.02219113, 0.02837722, 0.11118942, 0.04457535],
       [0.10355425, 0.06314486, 0.07429056, 0.12223077, 0.09346068],
       [0.07685915, 0.        , 0.03212188, 0.05236498, 0.01881084],
       [0.06984075, 0.05250568, 0.0676716 , 0.05597111, 0.08803421],
       [0.06071397, 0.04516453, 0.02626451, 0.01693235, 0.04702483],
       [0.02449276, 0.04617539, 0.0340543 , 0.03401939, 0.        ],
       [0.03118851, 0.02612447, 0.01898988, 0.02386067, 0.03067785],
       [0.0600746 , 0.03176965, 0.

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config1_1208_with_weights_res_mm.xlsx", index=False)

config 2

In [None]:
# Get the best model
best_model_config_2 = tuner_config_2.get_best_models(num_models=1)[0]
best_hp_config_2 = tuner_config_2.get_best_hyperparameters(num_trials=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Print best hyperparameters
print("\nBest Hyperparameters:")
print(best_hp_config_2.values)

# Evaluate final model
evaluation = best_model_config_2.evaluate(X_test, X_test, verbose=0)
print(f"\nTest Loss: {evaluation}")

# Extract and analyze bottleneck features
bottleneck_layer = best_model_config_2.get_layer('bottleneck').output
encoder = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer)
bottleneck_features = encoder.predict(X_test)

# Check correlation between latent features
correlation_matrix = np.corrcoef(bottleneck_features.T)
print("\nLatent Feature Correlations:")
print(correlation_matrix)


Best Hyperparameters:
{'encoding_dim': 5, 'neurons': 12, 'learning_rate': 0.01, 'batch_size': 64, 'activation': 'tanh', 'alpha': 0.1, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}

Test Loss: 0.01418523769825697
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Latent Feature Correlations:
[[ 1.         -0.03277693  0.38937205  0.21014547 -0.35259869]
 [-0.03277693  1.          0.66381462  0.10107188  0.18910883]
 [ 0.38937205  0.66381462  1.         -0.12258518 -0.16552352]
 [ 0.21014547  0.10107188 -0.12258518  1.          0.02782786]
 [-0.35259869  0.18910883 -0.16552352  0.02782786  1.        ]]


In [None]:
# Extract encoder and decoder from the trained autoencoder
encoder = Model(inputs=best_model_config_2.input, outputs=best_model_config_2.get_layer('bottleneck').output)

# Create a decoder model (assuming symmetric architecture)
bottleneck_input = tf.keras.Input(shape=(encoder.output.shape[1],))
x = bottleneck_input
for layer_name in [l.name for l in best_model_config_2.layers if 'dense_decoder' in l.name]:
    x = best_model_config_2.get_layer(layer_name)(x)
decoder_output = best_model_config_2.get_layer('output_layer')(x)
decoder = Model(inputs=bottleneck_input, outputs=decoder_output)

In [None]:
# Calculate contributions
contributions = calculate_bottleneck_contributions(best_model_config_2, encoder, decoder, X_train)

# Print contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.8806
Latent Dimension 2: 0.7722
Latent Dimension 3: 0.7909
Latent Dimension 4: 0.7751
Latent Dimension 5: 0.7813


In [None]:
# Normalize contributions
normalized_contributions = normalize_contributions(contributions)

# Print normalized contributions
print("Normalized Contributions of Each Bottleneck Dimension:")
for i, c in enumerate(normalized_contributions):
    print(f"Latent Dimension {i+1}: {c:.4f}")

# Verify sum to 1
print("Sum of Normalized Contributions:", sum(normalized_contributions))

Normalized Contributions of Each Bottleneck Dimension:
Latent Dimension 1: 0.2202
Latent Dimension 2: 0.1931
Latent Dimension 3: 0.1977
Latent Dimension 4: 0.1938
Latent Dimension 5: 0.1953
Sum of Normalized Contributions: 0.9999999999999998


In [None]:
bottleneck_layer_config_2 = best_model_config_2.get_layer('bottleneck').output
encoder_model_config_2 = Model(inputs=best_model_config_2.input, outputs=bottleneck_layer_config_2)
bottleneck_output_2 = encoder_model_config_2.predict(X)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


In [None]:
latent_contributions = np.array(normalized_contributions)  # Make sure this is normalized

mi_scores = []
for i in range(bottleneck_output_2.shape[1]):
    mi = mutual_info_regression(X, bottleneck_output_2[:, i], random_state=42)
    mi_scores.append(mi)

In [None]:
mi_scores = np.array(mi_scores).T  # Transpose to (features, bottleneck_dim)
mi_scores.shape

(24, 5)

In [None]:
# Normalize MI scores per bottleneck dimension
normalized_mi_scores = mi_scores / np.sum(mi_scores, axis=0)

# Initialize an array to store weighted values
weighted_values = np.zeros((X.shape[0], X.shape[1], bottleneck_output_2.shape[1]))

# Multiply MI scores by latent space contributions for each bottleneck dimension
for dim in range(bottleneck_output_2.shape[1]):
    weighted_values[:, :, dim] = X * normalized_mi_scores[:, dim] * latent_contributions[dim]

In [None]:
# Sum across bottleneck dimensions for a single weighted value per feature
summed_features = np.sum(weighted_values, axis=2)

# Sum across features to get the final index
final_index = np.sum(summed_features, axis=1)

# Reshape and append final index as a new column in X
final_index_column = final_index.reshape(-1, 1)
X_with_index = np.hstack((X, final_index_column))

# Create DataFrame and export to Excel
column_names = [f"Feature_{i+1}" for i in range(X.shape[1])] + ["Final_Index"]
df = pd.DataFrame(X_with_index, columns=column_names)

In [None]:
# Excel
df.to_excel("real_se_AUTOENCODER_best_auto_config2_1208_with_weights_res_mm.xlsx", index=False)

In [None]:
best_model_config_1.summary()

In [None]:
best_model_config_2.summary()