In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.layers import Input, Conv1D, Conv1DTranspose, Flatten, Dense, Reshape, MaxPooling1D, BatchNormalization, Dropout

In [None]:
# Load the assignment dataset from a CSV file in the working directory.
df = pd.read_csv('Data Fusion Assignment 2025 Data.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,class,id,color,transparency,device_id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,...,sample_raw_321 + 740,sample_raw_322 + 740,sample_raw_323 + 740,sample_raw_324 + 740,sample_raw_325 + 740,sample_raw_326 + 740,sample_raw_327 + 740,sample_raw_328 + 740,sample_raw_329 + 740,sample_raw_330 + 740
0,1,1,white,o,B0236F1F2D02C632,0.782157,0.754932,0.730555,0.709195,0.690707,...,5.54,5.32,5.08,4.8,4.5,4.16,3.8,3.42,3.0,2.55
1,1,2,white,o,B0236F1F2D02C632,0.948156,0.947904,0.947607,0.947237,0.946815,...,6.08,5.84,5.57,5.27,4.94,4.57,4.18,3.75,3.29,2.8
2,1,4,none,s,B0236F1F2D02C632,0.285657,0.285247,0.284841,0.284432,0.284022,...,1.47,1.42,1.35,1.28,1.2,1.11,1.01,0.91,0.8,0.68
3,1,5,none,s,B0236F1F2D02C632,0.29628,0.295966,0.295631,0.295269,0.294887,...,1.65,1.58,1.51,1.43,1.34,1.24,1.13,1.02,0.89,0.76
4,1,6,none,t,B0236F1F2D02C632,0.171568,0.171535,0.1715,0.171459,0.171414,...,1.31,1.26,1.2,1.14,1.07,0.99,0.9,0.81,0.71,0.61


In [None]:
# (rows, columns) of the loaded table.
df.shape

(373, 998)

In [None]:
# Keep every row whose class label is not 8.
df_without_class_8 = df.loc[df['class'].ne(8)]

In [None]:
# The two categorical descriptors that will be embedded separately.
cat_df = df_without_class_8[['transparency', 'color']]
cat_df.head()

Unnamed: 0,transparency,color
0,o,white
1,o,white
2,s,none
3,s,none
4,t,none


In [None]:
# One-hot encode the categorical columns.
#
# Every level is kept (drop_first=False): the autoencoders trained on these
# columns use a softmax output with categorical cross-entropy, which requires
# each target row to contain exactly one 1 per feature. With drop_first=True the
# reference level becomes an all-zero row, which contributes zero loss and can
# never be reproduced by a softmax.
#
# The encoding is built from the source frame rather than from `cat_df` itself,
# so re-running this cell always yields the same result.
cat_df = pd.get_dummies(
    df_without_class_8[['transparency', 'color']],
    drop_first=False,
).astype(int)
print(cat_df.shape)
cat_df.head()

(363, 11)


Unnamed: 0,transparency_s,transparency_t,color_blue,color_gray,color_green,color_none,color_orange,color_purple,color_red,color_white,color_yellow
0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,0


In [None]:
# Split the dummy columns by their originating feature, using the name prefix.
transparency_labels = cat_df.columns[
    [name.startswith('transparency_') for name in cat_df.columns]
]
color_labels = cat_df.columns[
    [name.startswith('color_') for name in cat_df.columns]
]

In [None]:
# One frame per categorical feature, holding only that feature's dummy columns.
transparency_df = cat_df.loc[:, transparency_labels]
color_df = cat_df.loc[:, color_labels]

## Categorical embeddings

#### Transparency embeddings
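Remember to create the folder that holds the saved weights: the cells in "Using the embeddings" load the `.weights.h5` files from a folder named `embedding_weights`. Running on a GPU is also recommended.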

In [None]:
# 50/50 split with a fixed seed; the same seed is reused for the other
# feature groups so the row partitions can be compared later.
transparency_train, transparency_test = train_test_split(
    transparency_df,
    random_state=42,
    test_size=0.5,
)
print("Transparency Training Shape:", transparency_train.shape)

Transparency Training Shape: (181, 2)


In [None]:
# Learning-rate schedule shared by the categorical autoencoders below.
# With staircase=True the rate is multiplied by 0.96 once every 100000
# optimizer steps.
initial_learning_rate = 0.001
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True,
)

In [None]:
# Sweep the bottleneck width of a single-hidden-layer autoencoder over the
# transparency dummies and keep the model with the lowest validation loss.
best_embedding_dim, best_loss, best_autoencoder = 0, float('inf'), None

for transparency_embedding_dim in range(1, 7):
    print(f"\nTraining with embedding dimension: {transparency_embedding_dim}")

    # Encoder: input -> ReLU bottleneck. Decoder: bottleneck -> softmax over the inputs.
    transparency_input = Input(shape=(transparency_train.shape[1],))
    transparency_encoded = Dense(
        transparency_embedding_dim,
        activation='relu',
        kernel_initializer=HeNormal(),
    )(transparency_input)
    transparency_decoded = Dense(
        transparency_train.shape[1],
        activation='softmax',
    )(transparency_encoded)

    transparency_autoencoder = Model(transparency_input, transparency_decoded)
    transparency_autoencoder.compile(
        optimizer=Adam(learning_rate=lr_schedule),
        loss='categorical_crossentropy',
    )

    # Stop after 5 epochs without validation improvement and roll back to the best weights.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    # The input is also the target; 10% of the training rows are held out for validation.
    history = transparency_autoencoder.fit(
        transparency_train,
        transparency_train,
        validation_split=0.1,
        epochs=200,
        batch_size=32,
        verbose=0,
        callbacks=[early_stopping],
    )

    loss = min(history.history['loss'])
    val_loss = min(history.history['val_loss'])
    print(f"Training Loss achieved: {loss:.4f}, Validation Loss achieved: {val_loss:.4f}")

    # Track the best candidate so far (strict improvement only).
    if val_loss < best_loss:
        best_loss = val_loss
        best_embedding_dim = transparency_embedding_dim
        best_autoencoder = transparency_autoencoder

print(f"\nBest embedding dimension: {best_embedding_dim} with lowest validation loss: {best_loss:.4f}")


Training with embedding dimension: 1
Training Loss achieved: 0.2599, Validation Loss achieved: 0.2159

Training with embedding dimension: 2
Training Loss achieved: 0.0664, Validation Loss achieved: 0.0651

Training with embedding dimension: 3
Training Loss achieved: 0.1024, Validation Loss achieved: 0.0762

Training with embedding dimension: 4
Training Loss achieved: 0.0290, Validation Loss achieved: 0.0213

Training with embedding dimension: 5
Training Loss achieved: 0.0262, Validation Loss achieved: 0.0244

Training with embedding dimension: 6
Training Loss achieved: 0.0226, Validation Loss achieved: 0.0173

Best embedding dimension: 6 with lowest validation loss: 0.0173


In [None]:
# Persist the best transparency autoencoder.
# The weights are written into `embedding_weights/`, the folder the
# "Using the embeddings" section loads them from, so the notebook runs
# top to bottom on a fresh kernel. makedirs succeeds if the folder exists.
weights_dir = "embedding_weights"
tf.io.gfile.makedirs(weights_dir)
weights_file_path = f"{weights_dir}/best_transparency_autoencoder_weights.weights.h5"
best_autoencoder.save_weights(weights_file_path)

#### Color embedding

In [None]:
# Same 50/50 seeded split as for transparency, applied to the color dummies.
color_train, color_test = train_test_split(
    color_df,
    random_state=42,
    test_size=0.5,
)
print("Color Training Shape:", color_train.shape)

Color Training Shape: (181, 9)


In [None]:
# Sweep the bottleneck width of a single-hidden-layer autoencoder over the
# color dummies and keep the model with the lowest validation loss.
best_embedding_dim_color, best_loss_color, best_autoencoder_color = 0, float('inf'), None

for color_embedding_dim in range(1, 9):
    print(f"\nTraining with embedding dimension: {color_embedding_dim}")

    # Encoder: input -> ReLU bottleneck. Decoder: bottleneck -> softmax over the inputs.
    color_input = Input(shape=(color_train.shape[1],))
    color_encoded = Dense(
        color_embedding_dim,
        activation='relu',
        kernel_initializer=HeNormal(),
    )(color_input)
    color_decoded = Dense(
        color_train.shape[1],
        activation='softmax',
    )(color_encoded)

    color_autoencoder = Model(color_input, color_decoded)
    color_autoencoder.compile(
        optimizer=Adam(learning_rate=lr_schedule),
        loss='categorical_crossentropy',
    )

    # Stop after 5 epochs without validation improvement and roll back to the best weights.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    # The input is also the target; 10% of the training rows are held out for validation.
    history = color_autoencoder.fit(
        color_train,
        color_train,
        validation_split=0.1,
        epochs=200,
        batch_size=32,
        verbose=0,
        callbacks=[early_stopping],
    )

    loss = min(history.history['loss'])
    val_loss = min(history.history['val_loss'])
    print(f"Training Loss achieved: {loss:.4f}, Validation Loss achieved: {val_loss:.4f}")

    # Track the best candidate so far (strict improvement only).
    if val_loss < best_loss_color:
        best_loss_color = val_loss
        best_embedding_dim_color = color_embedding_dim
        best_autoencoder_color = color_autoencoder

print(f"\nBest embedding dimension for color: {best_embedding_dim_color} with lowest validation loss: {best_loss_color:.4f}")


Training with embedding dimension: 1
Training Loss achieved: 1.1542, Validation Loss achieved: 1.0158

Training with embedding dimension: 2
Training Loss achieved: 0.8757, Validation Loss achieved: 0.7959

Training with embedding dimension: 3
Training Loss achieved: 0.6223, Validation Loss achieved: 0.4640

Training with embedding dimension: 4
Training Loss achieved: 0.6552, Validation Loss achieved: 0.3506

Training with embedding dimension: 5
Training Loss achieved: 0.3110, Validation Loss achieved: 0.3101

Training with embedding dimension: 6
Training Loss achieved: 0.2736, Validation Loss achieved: 0.3609

Training with embedding dimension: 7
Training Loss achieved: 0.1786, Validation Loss achieved: 0.1316

Training with embedding dimension: 8
Training Loss achieved: 0.1438, Validation Loss achieved: 0.0648

Best embedding dimension for color: 8 with lowest validation loss: 0.0648


In [None]:
# Persist the best color autoencoder.
# The weights are written into `embedding_weights/`, the folder the
# "Using the embeddings" section loads them from, so the notebook runs
# top to bottom on a fresh kernel. makedirs succeeds if the folder exists.
weights_dir = "embedding_weights"
tf.io.gfile.makedirs(weights_dir)
weights_file_path = f"{weights_dir}/best_color_autoencoder_weights.weights.h5"
best_autoencoder_color.save_weights(weights_file_path)

## Spectrum embedding

In [None]:
# Select every column whose name begins with 'spectrum'.
spectrum_columns = df_without_class_8.columns[
    df_without_class_8.columns.str.startswith('spectrum')
].tolist()
spectrum_df = df_without_class_8.loc[:, spectrum_columns]
spectrum_df.head()

Unnamed: 0,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,spectrum_8 + 740,spectrum_9 + 740,...,spectrum_321 + 740,spectrum_322 + 740,spectrum_323 + 740,spectrum_324 + 740,spectrum_325 + 740,spectrum_326 + 740,spectrum_327 + 740,spectrum_328 + 740,spectrum_329 + 740,spectrum_330 + 740
0,0.782157,0.754932,0.730555,0.709195,0.690707,0.67489,0.661358,0.649536,0.638722,0.628352,...,0.763257,0.763487,0.763256,0.762691,0.762037,0.761381,0.761238,0.761424,0.761624,0.76141
1,0.948156,0.947904,0.947607,0.947237,0.946815,0.946384,0.945992,0.945662,0.945352,0.945005,...,0.837685,0.838095,0.837967,0.837447,0.836808,0.836148,0.836038,0.83628,0.836527,0.836312
2,0.285657,0.285247,0.284841,0.284432,0.284022,0.283619,0.283231,0.282856,0.282474,0.282062,...,0.202998,0.203067,0.203011,0.202864,0.202691,0.202516,0.202477,0.202525,0.202576,0.202517
3,0.29628,0.295966,0.295631,0.295269,0.294887,0.294499,0.294121,0.293758,0.293394,0.293011,...,0.226905,0.227035,0.227019,0.226896,0.226739,0.226576,0.22656,0.226638,0.226717,0.226668
4,0.171568,0.171535,0.1715,0.171459,0.171414,0.171371,0.171338,0.171317,0.171298,0.171272,...,0.180999,0.181116,0.181115,0.181026,0.180908,0.180784,0.180777,0.180844,0.180909,0.180873


In [None]:
# Same 50/50 seeded split as for the categorical features.
spectrum_train, spectrum_test = train_test_split(
    spectrum_df,
    random_state=42,
    test_size=0.5,
)
print("Spectrum Training Shape:", spectrum_train.shape)

Spectrum Training Shape: (181, 331)


In [None]:
# 1D convolutional autoencoder over the spectrum: one channel per wavelength step.
input_layer = Input(shape=(spectrum_train.shape[1], 1))

# encoder: three Conv -> BatchNorm -> MaxPool(2) -> Dropout stages with widening filters
x = input_layer
for n_filters in (32, 64, 128):
    x = Conv1D(n_filters, 3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(2, padding='same')(x)
    x = Dropout(0.2)(x)

x = Flatten()(x)

# Bottleneck; the layer name is used later to extract the encoder.
embedding_dim = 64
embedding = Dense(embedding_dim, activation='relu', name='embedding')(x)

# decoder: project straight back to full length with 128 channels, then narrow the channels
x = Dense(spectrum_train.shape[1] * 128, activation='relu')(embedding)
x = Reshape((spectrum_train.shape[1], 128))(x)

for n_filters in (128, 64, 32):
    x = Conv1DTranspose(n_filters, 3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

# Linear single-channel output, trained with mean squared error.
decoded = Conv1DTranspose(1, 3, activation='linear', padding='same')(x)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Print the layer-by-layer summary of the CNN autoencoder.
autoencoder.summary()

In [None]:
# Stop after 10 epochs without validation improvement (restoring the best
# weights) and halve the learning rate after 5 stagnant epochs, down to 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# The spectra serve as both input and target; 10% of the training rows are
# held out for validation.
autoencoder.fit(
    spectrum_train.values, spectrum_train.values,
    validation_split=0.1,
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr]
)

# Save into `embedding_weights/`, the folder the "Using the embeddings"
# section loads from, so the notebook runs top to bottom on a fresh kernel.
# makedirs succeeds if the folder already exists.
weights_dir = "embedding_weights"
tf.io.gfile.makedirs(weights_dir)
weights_file_path = f"{weights_dir}/cnn_autoencoder_weights.weights.h5"
autoencoder.save_weights(weights_file_path)
print(f"CNN Autoencoder weights saved to {weights_file_path}")

Epoch 1/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 3.1422 - val_loss: 0.4027 - learning_rate: 0.0010
Epoch 2/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 1.6569 - val_loss: 0.3642 - learning_rate: 0.0010
Epoch 3/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 1.3005 - val_loss: 0.3240 - learning_rate: 0.0010
Epoch 4/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.1563 - val_loss: 0.2829 - learning_rate: 0.0010
Epoch 5/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.9995 - val_loss: 0.2450 - learning_rate: 0.0010
Epoch 6/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.8886 - val_loss: 0.2200 - learning_rate: 0.0010
Epoch 7/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.8420 - val_loss: 0.2034 - learning_rate: 0.0010
Epoch 8

## Using the embeddings

In [None]:
# Reload the saved transparency weights (adjust the path if needed).
best_autoencoder.load_weights("embedding_weights/best_transparency_autoencoder_weights.weights.h5")

# The encoder runs from the model input to layers[1], the bottleneck Dense layer.
transparency_encoder = Model(best_autoencoder.input, best_autoencoder.layers[1].output)

# Embeddings for the training half.
embedded_data = transparency_encoder.predict(transparency_train)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


In [None]:
# Reload the saved color weights (adjust the path if needed).
best_autoencoder_color.load_weights("embedding_weights/best_color_autoencoder_weights.weights.h5")

# The encoder runs from the model input to layers[1], the bottleneck Dense layer.
color_encoder = Model(best_autoencoder_color.input, best_autoencoder_color.layers[1].output)

# Embeddings for the training half.
color_embeddings = color_encoder.predict(color_train)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


In [None]:
#for this one you can run the code with model compilation - evertyhing should work after that
autoencoder.load_weights("embedding_weights/cnn_autoencoder_weights.weights.h5")# adjust path if needed

encoder = Model(
    inputs=autoencoder.input,
    outputs=autoencoder.get_layer("embedding").output
)

# get the embedding for the trianing/test data
spectrum_embeddings = encoder.predict(spectrum_train)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step


In [None]:
# Embed the held-out half with each of the three encoders (evaluated in this order).
transparency_final_embeddings, color_final_embeddings, spectrum_final_embeddings = (
    transparency_encoder.predict(transparency_test),
    color_encoder.predict(color_test),
    encoder.predict(spectrum_test),
)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


In [None]:
# Width (number of columns) of each embedding block.
transparency_dim, color_dim, spectrum_dim = (
    block.shape[1]
    for block in (
        transparency_final_embeddings,
        color_final_embeddings,
        spectrum_final_embeddings,
    )
)

In [None]:
# Split the full filtered frame with the same seed and ratio as the feature splits.
df_without_class_8_one, df_without_class_8_two = train_test_split(
    df_without_class_8,
    random_state=42,
    test_size=0.5,
)

In [None]:
# (rows, columns) of the second half of the split.
df_without_class_8_two.shape

(182, 998)

In [None]:
# Place the three embedding blocks side by side: transparency, then color, then spectrum.
final_embeddings = np.concatenate(
    (
        transparency_final_embeddings,
        color_final_embeddings,
        spectrum_final_embeddings,
    ),
    axis=1,
)

In [None]:
# One column name per embedding component, prefixed by the feature it came from.
transparency_cols = [
    f"tra_{i}"
    for i, _unused in enumerate(range(transparency_final_embeddings.shape[1]))
]
color_cols = [
    f"col_{i}"
    for i, _unused in enumerate(range(color_final_embeddings.shape[1]))
]
spectrum_cols = [
    f"spec_{i}"
    for i, _unused in enumerate(range(spectrum_final_embeddings.shape[1]))
]

In [None]:
# Column order mirrors the concatenation order of the embedding blocks.
final_column_names = [*transparency_cols, *color_cols, *spectrum_cols]
final_df = pd.DataFrame(data=final_embeddings, columns=final_column_names)
final_df.shape

(182, 78)

In [None]:
# Original row labels of the held-out half, in the order the split produced them.
test_indices = transparency_test.index
test_indices

Index([163,  33,  15, 307,  57, 182,  76, 119, 268, 126,
       ...
       329, 275, 176, 345, 183, 256, 129,  38,  11, 237],
      dtype='int64', length=182)

In [None]:
# Confirm the three seeded splits selected the same rows in the same order,
# which the side-by-side concatenation of the embeddings relies on.
indices_are_same = (
    transparency_test.index.equals(color_test.index)
    and color_test.index.equals(spectrum_test.index)
)
print("Are the indices the same for all three splits (transparency, color, and spectrum)?", indices_are_same)

Are the indices the same for all three splits (transparency, color, and spectrum)? True


In [None]:
# Attach the class label of each held-out row; matching is by position, so the
# labels are looked up in test_indices order and stripped of their index.
final_df['class'] = df_without_class_8['class'].loc[test_indices].to_numpy()

In [None]:
# (rows, columns) after adding the class column.
final_df.shape

(182, 79)

In [None]:
# Preview the assembled embedding table.
final_df.head()

Unnamed: 0,tra_0,tra_1,tra_2,tra_3,tra_4,tra_5,col_0,col_1,col_2,col_3,...,spec_55,spec_56,spec_57,spec_58,spec_59,spec_60,spec_61,spec_62,spec_63,class
0,1.647261,0.0,0.49977,0.0,0.0,1.429867,1.413038,0.0,0.624375,0.000215,...,2.915505,0.189236,0.0,5.650387,0.0,7.001348,0.0,0.0,6.914174,3
1,2.659821,0.107053,0.117101,0.0,0.0,3.175856,1.285649,0.0,1.972758,0.076204,...,4.377558,0.928185,0.0,4.521344,0.0,7.8737,0.0,0.0,5.727788,1
2,1.647261,0.0,0.49977,0.0,0.0,1.429867,0.000706,0.280037,1.100661,0.000864,...,3.352116,0.937106,7.630889,0.0,5.519491,0.0,7.920612,15.73612,0.0,1
3,2.659821,0.107053,0.117101,0.0,0.0,3.175856,1.285649,0.0,1.972758,0.076204,...,4.317843,0.934757,0.0,3.776154,0.0,7.305614,0.0,0.0,4.785975,6
4,1.647261,0.0,0.49977,0.0,0.0,1.429867,0.007454,0.0,0.48693,1.420493,...,4.405597,0.948783,0.0,4.495588,0.0,7.887575,0.0,0.0,5.69496,1


In [None]:
# Write the embedding table (with class labels) to CSV, without the row index.
final_df.to_csv('final_embeddings_with_class.csv', index=False)