In [1]:
import numpy as np
import pandas as pd
import os
import json

In [2]:
labels = np.load("../Data/preprocessed/labels.npy", allow_pickle=True)
embeddings = np.load("../Data/preprocessed/embeddings.npy", allow_pickle=True)

In [3]:
labels.shape, embeddings.shape

((4572, 4), (4572, 1280))

In [4]:
embeddings.dtype, labels.dtype

(dtype('float32'), dtype('int64'))

In [5]:
# Separate "all 0" samples
all_zero_indices = np.where(np.sum(labels, axis=1) == 0)[0]
embeddings_all_zero = embeddings[all_zero_indices]
labels_all_zero = labels[all_zero_indices]

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
embeddings_resampled, labels_resampled = smote.fit_resample(embeddings, labels)

embeddings_resampled_final = np.concatenate([embeddings_resampled, embeddings_all_zero], axis=0)
labels_resampled_final = np.concatenate([labels_resampled, labels_all_zero], axis=0)

In [6]:
pd.DataFrame(labels).value_counts()

0  1  2  3
0  0  0  0    2439
      1  0     910
1  0  0  0     500
0  0  0  1     420
   1  0  0     303
Name: count, dtype: int64

In [7]:
pd.DataFrame(labels_resampled).value_counts()

0  1  2  3
0  0  0  1    2939
      1  0    2939
   1  0  0    2939
1  0  0  0    2939
Name: count, dtype: int64

In [8]:
pd.DataFrame(labels_resampled_final).value_counts()

0  1  2  3
0  0  0  1    2939
      1  0    2939
   1  0  0    2939
1  0  0  0    2939
0  0  0  0    2439
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(embeddings_resampled, labels_resampled, test_size=0.2, random_state=42)

In [10]:
from tensorflow.keras.layers import Input, Dense, concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.metrics import accuracy_score, f1_score

# --- Create the multi-label classification model ---
input_layer = Input(shape=(X_train.shape[1],))  # Input shape based on embeddings
hidden_layer1 = Dense(512, activation='relu')(input_layer)
dropout_layer = Dropout(0.5)(hidden_layer1)  # Add dropout for regularization
hidden_layer2 = Dense(256, activation='relu')(dropout_layer)
dropout_layer2 = Dropout(0.5)(hidden_layer2)  # Add dropout for regularization
hidden_layer3 = Dense(128, activation='relu')(dropout_layer2)
dropout_layer3 = Dropout(0.5)(hidden_layer3)  # Add dropout for regularization
hidden_layer4 = Dense(64, activation='relu')(dropout_layer3)
output_layer = Dense(y_train.shape[1], activation='softmax')(hidden_layer4)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
model.summary()

In [12]:
# callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=20)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
checkpoint = ModelCheckpoint(monitor='val_loss', filepath='model.keras', save_best_only=True)
callbacks=[early_stopping, learning_rate_reduction]

In [13]:
# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=16, validation_split=0.2, callbacks=callbacks)

# Evaluate the model
predictions = model.predict(X_test)
predicted_labels = (predictions > 0.5).astype(int)  # Threshold predictions

accuracy = accuracy_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels, average='micro')  # Use appropriate averaging

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Epoch 1/200
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2637 - loss: 3.3326 - val_accuracy: 0.3264 - val_loss: 1.3630 - learning_rate: 1.0000e-04
Epoch 2/200
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.3284 - loss: 1.5633 - val_accuracy: 0.3987 - val_loss: 1.3165 - learning_rate: 1.0000e-04
Epoch 3/200
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.3349 - loss: 1.4156 - val_accuracy: 0.4450 - val_loss: 1.2644 - learning_rate: 1.0000e-04
Epoch 4/200
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.3572 - loss: 1.3493 - val_accuracy: 0.4790 - val_loss: 1.2113 - learning_rate: 1.0000e-04
Epoch 5/200
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.4101 - loss: 1.2753 - val_accuracy: 0.5019 - val_loss: 1.1581 - learning_rate: 1.0000e-04
Epoch 6/200
[1m471/471[0m [32m━━━━━━━━━━━━

In [14]:
predictions

array([[8.3957217e-04, 9.9915457e-01, 3.5989186e-07, 5.4803659e-06],
       [1.4907392e-03, 1.2964567e-09, 9.9850923e-01, 1.0635946e-12],
       [1.6386169e-03, 9.9829960e-01, 2.0508267e-06, 5.9671020e-05],
       ...,
       [2.3300707e-04, 4.1905928e-06, 1.0196604e-07, 9.9976265e-01],
       [2.4783953e-05, 9.9997526e-01, 4.7292609e-10, 4.9134012e-08],
       [5.3734247e-02, 6.1301886e-05, 9.4620001e-01, 4.3867944e-06]],
      dtype=float32)

In [15]:
predicted_labels

array([[0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])

In [16]:
y_test

array([[0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])