In [None]:
import os

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import Input, Conv1D, Conv1DTranspose, Flatten, Dense, Reshape, MaxPooling1D, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [None]:
# Load the raw assignment data set; the path is relative to the notebook's working directory.
csv_path = 'Data Fusion Assignment 2025 Data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,class,id,color,transparency,device_id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,...,sample_raw_321 + 740,sample_raw_322 + 740,sample_raw_323 + 740,sample_raw_324 + 740,sample_raw_325 + 740,sample_raw_326 + 740,sample_raw_327 + 740,sample_raw_328 + 740,sample_raw_329 + 740,sample_raw_330 + 740
0,1,1,white,o,B0236F1F2D02C632,0.782157,0.754932,0.730555,0.709195,0.690707,...,5.54,5.32,5.08,4.8,4.5,4.16,3.8,3.42,3.0,2.55
1,1,2,white,o,B0236F1F2D02C632,0.948156,0.947904,0.947607,0.947237,0.946815,...,6.08,5.84,5.57,5.27,4.94,4.57,4.18,3.75,3.29,2.8
2,1,4,none,s,B0236F1F2D02C632,0.285657,0.285247,0.284841,0.284432,0.284022,...,1.47,1.42,1.35,1.28,1.2,1.11,1.01,0.91,0.8,0.68
3,1,5,none,s,B0236F1F2D02C632,0.29628,0.295966,0.295631,0.295269,0.294887,...,1.65,1.58,1.51,1.43,1.34,1.24,1.13,1.02,0.89,0.76
4,1,6,none,t,B0236F1F2D02C632,0.171568,0.171535,0.1715,0.171459,0.171414,...,1.31,1.26,1.2,1.14,1.07,0.99,0.9,0.81,0.71,0.61


In [None]:
# Keep every row whose class label is not 8.
keep_row = df['class'].ne(8)
df_without_class_8 = df.loc[keep_row]

In [None]:
# Isolate the two categorical descriptors that get their own embeddings.
categorical_columns = ['transparency', 'color']
cat_df = df_without_class_8[categorical_columns]
cat_df.head()

Unnamed: 0,transparency,color
0,o,white
1,o,white
2,s,none
3,s,none
4,t,none


In [None]:
# Dummy-encode both categorical columns as 0/1 integers.
# drop_first=True removes the first level of each variable, so that level is
# represented by a row of all zeros in its group of dummy columns.
dummy_frame = pd.get_dummies(cat_df, drop_first=True)
cat_df = dummy_frame.astype(int)
print(cat_df.shape)
cat_df.head()

(363, 11)


Unnamed: 0,transparency_s,transparency_t,color_blue,color_gray,color_green,color_none,color_orange,color_purple,color_red,color_white,color_yellow
0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,0


In [None]:
# Split the dummy column names into their two groups by prefix.
dummy_columns = cat_df.columns
is_transparency_column = dummy_columns.str.startswith('transparency_')
is_color_column = dummy_columns.str.startswith('color_')

transparency_labels = dummy_columns[is_transparency_column]
color_labels = dummy_columns[is_color_column]

In [None]:
# One frame per categorical variable, each holding only that variable's dummy columns.
transparency_df = cat_df.loc[:, transparency_labels]
color_df = cat_df.loc[:, color_labels]

## Categorical embeddings

#### Transparency embeddings
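The cells in "Using the embeddings" load the weight files from the `embedding_weights/` folder, so make sure that folder exists and contains the saved `.weights.h5` files before running them. Training is considerably faster on a GPU.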

In [None]:
# Hold out half of the rows; the fixed seed keeps the split reproducible.
transparency_split = train_test_split(transparency_df, test_size=0.5, random_state=42)
transparency_train, transparency_test = transparency_split
print("Transparency Training Shape:", transparency_train.shape)

Transparency Training Shape: (181, 2)


In [None]:
    # Learning-rate schedule handed to the Adam optimizers of the categorical autoencoders.
    # NOTE(review): with staircase=True the rate only drops once per 100000 optimizer
    # steps. The searches below run at most 200 epochs on ~181 rows with batch_size=32,
    # i.e. far fewer steps, so the rate appears to stay at initial_learning_rate for the
    # whole run. Lower decay_steps if an actual decay is intended.
    initial_learning_rate = 0.001
    lr_schedule = ExponentialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True,
    )

In [None]:
# Search over bottleneck sizes for the transparency autoencoder and keep the model
# with the lowest validation loss.
#
# Produces: best_embedding_dim (int), best_loss (float), best_autoencoder (Model).
best_embedding_dim = 0
best_loss = float('inf')
best_autoencoder = None

for transparency_embedding_dim in range(1, 7):
    print(f"\nTraining with embedding dimension: {transparency_embedding_dim}")

    transparency_input = Input(shape=(transparency_train.shape[1],))
    transparency_encoded = Dense(transparency_embedding_dim, activation='relu',  kernel_initializer=HeNormal())(transparency_input)
    # The targets are drop_first dummies, so the dropped (baseline) category is an
    # all-zero row. Softmax outputs always sum to 1 and categorical cross-entropy is
    # exactly 0 for an all-zero target, so those rows could neither be reconstructed
    # nor contribute any gradient. Independent sigmoid units with binary
    # cross-entropy handle every row, including the all-zero one.
    transparency_decoded = Dense(transparency_train.shape[1], activation='sigmoid')(transparency_encoded)

    transparency_autoencoder = Model(transparency_input, transparency_decoded)
    transparency_autoencoder.compile(optimizer=Adam(learning_rate=lr_schedule), loss='binary_crossentropy')

    # A fresh callback per model: stop after 5 epochs without validation
    # improvement and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = transparency_autoencoder.fit(
        transparency_train, transparency_train,
        validation_split=0.1,
        epochs=200, batch_size=32, verbose=0,
        callbacks=[early_stopping]
    )

    # Best value reached by each curve (not necessarily at the same epoch).
    loss = min(history.history['loss'])
    val_loss = min(history.history['val_loss'])
    print(f"Training Loss achieved: {loss:.4f}, Validation Loss achieved: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        best_embedding_dim = transparency_embedding_dim
        best_autoencoder = transparency_autoencoder

print(f"\nBest embedding dimension: {best_embedding_dim} with lowest validation loss: {best_loss:.4f}")


Training with embedding dimension: 1
Training Loss achieved: 0.1623, Validation Loss achieved: 0.1608

Training with embedding dimension: 2
Training Loss achieved: 0.0823, Validation Loss achieved: 0.0817

Training with embedding dimension: 3
Training Loss achieved: 0.1788, Validation Loss achieved: 0.1352

Training with embedding dimension: 4
Training Loss achieved: 0.1678, Validation Loss achieved: 0.1254

Training with embedding dimension: 5
Training Loss achieved: 0.0905, Validation Loss achieved: 0.0937

Training with embedding dimension: 6
Training Loss achieved: 0.0993, Validation Loss achieved: 0.0763

Best embedding dimension: 6 with lowest validation loss: 0.0763


In [None]:
# Save into the folder that the "Using the embeddings" cell loads from
# ("embedding_weights/"), creating it if necessary, so the notebook runs top to
# bottom without moving files by hand.
weights_dir = "embedding_weights"
os.makedirs(weights_dir, exist_ok=True)
weights_file_path = os.path.join(weights_dir, "best_transparency_autoencoder_weights.weights.h5")
best_autoencoder.save_weights(weights_file_path)

#### Color embedding

In [None]:
# Hold out half of the rows; the fixed seed keeps the split reproducible.
color_split = train_test_split(color_df, test_size=0.5, random_state=42)
color_train, color_test = color_split
print("Color Training Shape:", color_train.shape)

Color Training Shape: (181, 9)


In [None]:
# Search over bottleneck sizes for the color autoencoder and keep the model with
# the lowest validation loss.
#
# Produces: best_embedding_dim_color (int), best_loss_color (float),
# best_autoencoder_color (Model).
best_embedding_dim_color = 0
best_loss_color = float('inf')
best_autoencoder_color = None

for color_embedding_dim in range(1, 9):
    print(f"\nTraining with embedding dimension: {color_embedding_dim}")

    color_input = Input(shape=(color_train.shape[1],))
    color_encoded = Dense(color_embedding_dim, activation='relu',  kernel_initializer=HeNormal())(color_input)
    # The targets are drop_first dummies, so the dropped (baseline) color is an
    # all-zero row. Softmax outputs always sum to 1 and categorical cross-entropy is
    # exactly 0 for an all-zero target, so those rows could neither be reconstructed
    # nor contribute any gradient. Independent sigmoid units with binary
    # cross-entropy handle every row, including the all-zero one.
    color_decoded = Dense(color_train.shape[1], activation='sigmoid')(color_encoded)

    color_autoencoder = Model(color_input, color_decoded)
    color_autoencoder.compile(optimizer=Adam(learning_rate=lr_schedule), loss='binary_crossentropy')

    # A fresh callback per model: stop after 5 epochs without validation
    # improvement and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = color_autoencoder.fit(
        color_train, color_train,
        validation_split=0.1,
        epochs=200, batch_size=32, verbose=0,
        callbacks=[early_stopping]
    )

    # Best value reached by each curve (not necessarily at the same epoch).
    loss = min(history.history['loss'])
    val_loss = min(history.history['val_loss'])
    print(f"Training Loss achieved: {loss:.4f}, Validation Loss achieved: {val_loss:.4f}")

    if val_loss < best_loss_color:
        best_loss_color = val_loss
        best_embedding_dim_color = color_embedding_dim
        best_autoencoder_color = color_autoencoder

print(f"\nBest embedding dimension for color: {best_embedding_dim_color} with lowest validation loss: {best_loss_color:.4f}")


Training with embedding dimension: 1
Training Loss achieved: 1.4949, Validation Loss achieved: 1.2679

Training with embedding dimension: 2
Training Loss achieved: 0.8017, Validation Loss achieved: 0.6932

Training with embedding dimension: 3
Training Loss achieved: 0.4610, Validation Loss achieved: 0.3043

Training with embedding dimension: 4
Training Loss achieved: 0.3459, Validation Loss achieved: 0.2662

Training with embedding dimension: 5
Training Loss achieved: 0.3063, Validation Loss achieved: 0.2801

Training with embedding dimension: 6
Training Loss achieved: 0.3245, Validation Loss achieved: 0.2557

Training with embedding dimension: 7
Training Loss achieved: 0.1959, Validation Loss achieved: 0.0858

Training with embedding dimension: 8
Training Loss achieved: 0.1157, Validation Loss achieved: 0.0770

Best embedding dimension for color: 8 with lowest validation loss: 0.0770


In [None]:
# Save into the folder that the "Using the embeddings" cell loads from
# ("embedding_weights/"), creating it if necessary, so the notebook runs top to
# bottom without moving files by hand.
weights_dir = "embedding_weights"
os.makedirs(weights_dir, exist_ok=True)
weights_file_path = os.path.join(weights_dir, "best_color_autoencoder_weights.weights.h5")
best_autoencoder_color.save_weights(weights_file_path)

## Spectrum embedding

In [None]:
# Collect every column whose name begins with 'spectrum' (kept as a plain list)
# and slice those columns out of the filtered frame.
filtered_columns = df_without_class_8.columns
spectrum_columns = filtered_columns[filtered_columns.str.startswith('spectrum')].tolist()
spectrum_df = df_without_class_8.loc[:, spectrum_columns]
spectrum_df.head()

Unnamed: 0,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,spectrum_8 + 740,spectrum_9 + 740,...,spectrum_321 + 740,spectrum_322 + 740,spectrum_323 + 740,spectrum_324 + 740,spectrum_325 + 740,spectrum_326 + 740,spectrum_327 + 740,spectrum_328 + 740,spectrum_329 + 740,spectrum_330 + 740
0,0.782157,0.754932,0.730555,0.709195,0.690707,0.67489,0.661358,0.649536,0.638722,0.628352,...,0.763257,0.763487,0.763256,0.762691,0.762037,0.761381,0.761238,0.761424,0.761624,0.76141
1,0.948156,0.947904,0.947607,0.947237,0.946815,0.946384,0.945992,0.945662,0.945352,0.945005,...,0.837685,0.838095,0.837967,0.837447,0.836808,0.836148,0.836038,0.83628,0.836527,0.836312
2,0.285657,0.285247,0.284841,0.284432,0.284022,0.283619,0.283231,0.282856,0.282474,0.282062,...,0.202998,0.203067,0.203011,0.202864,0.202691,0.202516,0.202477,0.202525,0.202576,0.202517
3,0.29628,0.295966,0.295631,0.295269,0.294887,0.294499,0.294121,0.293758,0.293394,0.293011,...,0.226905,0.227035,0.227019,0.226896,0.226739,0.226576,0.22656,0.226638,0.226717,0.226668
4,0.171568,0.171535,0.1715,0.171459,0.171414,0.171371,0.171338,0.171317,0.171298,0.171272,...,0.180999,0.181116,0.181115,0.181026,0.180908,0.180784,0.180777,0.180844,0.180909,0.180873


In [None]:
# Hold out half of the rows; the fixed seed keeps the split reproducible.
spectrum_split = train_test_split(spectrum_df, test_size=0.5, random_state=42)
spectrum_train, spectrum_test = spectrum_split
print("Spectrum Training Shape:", spectrum_train.shape)

Spectrum Training Shape: (181, 331)


In [None]:
# 1-D convolutional autoencoder over the spectrum columns.
# Each spectrum is treated as a sequence with a single channel.
n_spectrum_points = spectrum_train.shape[1]
input_layer = Input(shape=(n_spectrum_points, 1))

# Encoder: three Conv1D -> BatchNorm -> MaxPool(2) -> Dropout stages with a
# growing number of filters; every pooling step roughly halves the length.
x = input_layer
for n_filters in (32, 64, 128):
    x = Conv1D(n_filters, 3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(2, padding='same')(x)
    x = Dropout(0.2)(x)

# Flattened encoder output.
encoded = Flatten()(x)

# Project back to the full sequence length with 128 channels so the decoder
# can work at the original resolution.
x = Dense(n_spectrum_points * 128, activation='relu')(encoded)
x = Reshape((n_spectrum_points, 128))(x)

# Decoder: three Conv1DTranspose -> BatchNorm -> Dropout stages that mirror the
# encoder's filter counts in reverse order.
for n_filters in (128, 64, 32):
    x = Conv1DTranspose(n_filters, 3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

# Linear single-channel output: the reconstructed spectrum.
decoded = Conv1DTranspose(1, 3, activation='linear', padding='same')(x)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
# Print the layer-by-layer summary of the CNN autoencoder as a sanity check of the architecture.
autoencoder.summary()

In [None]:
# Train the CNN autoencoder to reconstruct its own input.
# Stop after 10 epochs without validation improvement (restoring the best weights)
# and halve the learning rate after 5 stalled epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

autoencoder.fit(
    spectrum_train.values, spectrum_train.values,
    validation_split=0.1,
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr]
)

# Save into the folder that the "Using the embeddings" cell loads from
# ("embedding_weights/"), creating it if necessary, so the notebook runs top to
# bottom without moving files by hand.
weights_dir = "embedding_weights"
os.makedirs(weights_dir, exist_ok=True)
weights_file_path = os.path.join(weights_dir, "cnn_autoencoder_weights.weights.h5")
autoencoder.save_weights(weights_file_path)
print(f"CNN Autoencoder weights saved to {weights_file_path}")

Epoch 1/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 2.5966 - val_loss: 0.4532 - learning_rate: 0.0010
Epoch 2/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 322ms/step - loss: 1.5148 - val_loss: 0.3914 - learning_rate: 0.0010
Epoch 3/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 273ms/step - loss: 1.2624 - val_loss: 0.3434 - learning_rate: 0.0010
Epoch 4/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 272ms/step - loss: 1.0626 - val_loss: 0.3077 - learning_rate: 0.0010
Epoch 5/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 272ms/step - loss: 0.9183 - val_loss: 0.2964 - learning_rate: 0.0010
Epoch 6/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 274ms/step - loss: 0.8608 - val_loss: 0.2375 - learning_rate: 0.0010
Epoch 7/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 275ms/step - loss: 0.7344 - val_loss: 0.2151 - learning_rate: 0.0010
E

## Using the embeddings

In [None]:
# Uncomment and run this cell only if the next cell fails: it rebuilds the transparency autoencoder.
# The bottleneck size must match the saved weights, so use the best dimension found by the
# search (best_embedding_dim), not the last value the loop tried.
# transparency_input = Input(shape=(transparency_train.shape[1],))
# transparency_encoded = Dense(best_embedding_dim, activation='relu',  kernel_initializer=HeNormal())(transparency_input)
# transparency_decoded = Dense(transparency_train.shape[1], activation='softmax')(transparency_encoded)

# best_autoencoder = Model(transparency_input, transparency_decoded)
# best_autoencoder.compile(optimizer=Adam(learning_rate=lr_schedule), loss='categorical_crossentropy')

In [None]:
# Restore the trained transparency autoencoder weights from disk (adjust the path if needed).
transparency_weights_path = "embedding_weights/best_transparency_autoencoder_weights.weights.h5"
best_autoencoder.load_weights(transparency_weights_path)

# The autoencoder was built as Input -> Dense -> Dense, so layers[1] is the
# bottleneck Dense layer; its output is the embedding.
transparency_bottleneck = best_autoencoder.layers[1]
transparency_encoder = Model(inputs=best_autoencoder.input, outputs=transparency_bottleneck.output)

# Embed the training split (pass the test split instead to embed that one).
embedded_data = transparency_encoder.predict(transparency_train)

In [None]:
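# Uncomment and run this cell only if the next cell fails: it rebuilds the color autoencoder.
# The bottleneck size must match the saved weights, so use the best dimension found by the
# search (best_embedding_dim_color), not the last value the loop tried.
# color_input = Input(shape=(color_train.shape[1],))
# color_encoded = Dense(best_embedding_dim_color, activation='relu',  kernel_initializer=HeNormal())(color_input)
# color_decoded = Dense(color_train.shape[1], activation='softmax')(color_encoded)

# best_autoencoder_color = Model(color_input, color_decoded)
# best_autoencoder_color.compile(optimizer=Adam(learning_rate=lr_schedule), loss='categorical_crossentropy')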

In [None]:
# Restore the trained color autoencoder weights from disk (adjust the path if needed).
color_weights_path = "embedding_weights/best_color_autoencoder_weights.weights.h5"
best_autoencoder_color.load_weights(color_weights_path)

# The autoencoder was built as Input -> Dense -> Dense, so layers[1] is the
# bottleneck Dense layer; its output is the embedding.
color_bottleneck = best_autoencoder_color.layers[1]
color_encoder = Model(inputs=best_autoencoder_color.input, outputs=color_bottleneck.output)

# Embed the training split (pass the test split instead to embed that one).
color_embeddings = color_encoder.predict(color_train)

In [None]:
# The CNN autoencoder must already be built and compiled (run the model-definition
# cell first); after that, loading the weights should work.
autoencoder.load_weights("embedding_weights/cnn_autoencoder_weights.weights.h5")  # adjust path if needed

# Use the Flatten layer (the tensor called `encoded` when the model was built) as the
# embedding. Counting back six layers from the output, as get_layer(index=-6) did,
# lands inside the decoder on the BatchNormalization that follows Conv1DTranspose(64),
# not on the encoder output.
flatten_layer = next(layer for layer in autoencoder.layers if isinstance(layer, Flatten))

encoder = Model(
    inputs=autoencoder.input,
    outputs=flatten_layer.output
)

# Embed the training split (pass the test split instead to embed that one).
spectrum_embeddings = encoder.predict(spectrum_train)