In [2]:
import pandas as pd
import torch
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, hamming_loss

from xgboost import XGBClassifier
from skmultilearn.problem_transform import ClassifierChain

In [3]:
# Directory holding the preprocessed artefacts, relative to the notebook.
# Forward slashes are used because Python accepts them on Windows as well as
# on POSIX systems, whereas the backslash form only resolves on Windows.
PROCESSED_DIR = "../data/processed"


def load_embeddings(filename):
    """Load a saved embedding tensor and return it as a float32 NumPy array.

    Parameters
    ----------
    filename : str
        Name of a ``.pt`` file inside ``PROCESSED_DIR``.

    Returns
    -------
    numpy.ndarray
        The stored tensor converted to ``float32``.
    """
    # map_location="cpu" lets tensors that were saved from another device
    # load on a machine that does not have that device.
    tensor = torch.load(f"{PROCESSED_DIR}/{filename}", map_location="cpu")
    # float32 keeps the memory footprint down for the later PCA / XGBoost steps.
    return tensor.detach().cpu().numpy().astype("float32")


# NOTE(review): pickle-based loaders (read_pickle, torch.load) can execute
# arbitrary code; only load files produced by this project's own preprocessing.
df_train = pd.read_pickle(f"{PROCESSED_DIR}/df_train.pkl")
df_test = pd.read_pickle(f"{PROCESSED_DIR}/df_test.pkl")

text_embeddings_train = load_embeddings("rocov2_captions_embeddings_train.pt")
text_embeddings_test = load_embeddings("rocov2_captions_embeddings_test.pt")

image_embeddings_train = load_embeddings("rocov2_image_embeddings_train.pt")
image_embeddings_test = load_embeddings("rocov2_image_embeddings_test.pt")

# The cells below pair embedding rows with dataframe rows positionally, so
# every split needs exactly one text and one image embedding per row.
assert len(df_train) == len(text_embeddings_train) == len(image_embeddings_train), \
    "train dataframe and embeddings have different numbers of rows"
assert len(df_test) == len(text_embeddings_test) == len(image_embeddings_test), \
    "test dataframe and embeddings have different numbers of rows"

# Merged (early-fusion) features: text and image embeddings side by side.
# Concatenating two float32 arrays already yields float32, so no re-cast is needed.
combined_embeddings_train = np.concatenate((text_embeddings_train, image_embeddings_train), axis=1)
combined_embeddings_test = np.concatenate((text_embeddings_test, image_embeddings_test), axis=1)

In [4]:
# Preprocessing objects shared by the cells below.
# The binarizer encodes each sample's label collection as a binary indicator row.
mlb = MultiLabelBinarizer()
# PCA keeps 300 components; the fixed seed makes the projection reproducible.
pca = PCA(random_state=42, n_components=300)

In [5]:
# Encode the label sets as binary indicator matrices.
# The label vocabulary is learned from the training split only; the test split
# is then encoded with that same vocabulary so both matrices share columns.
train_label_sets = df_train['Semantic_vec']
y_train = mlb.fit_transform(train_label_sets)

test_label_sets = df_test['Semantic_vec']
y_test = mlb.transform(test_label_sets)

# Early-fusion

In [6]:
# Early fusion: one classifier chain trained on the PCA-reduced concatenation
# of text and image embeddings.

# PCA for dimensionality reduction (fit on the training split only).
X_train = pca.fit_transform(combined_embeddings_train)
X_test = pca.transform(combined_embeddings_test)

# Base estimator used for every link of the chain.
base_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=3,  # up-weights positive examples of each label
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cc = ClassifierChain(classifier=base_xgb, require_dense=[True, True])

print("Training Classifier Chain with XGBoost...")
cc.fit(X_train, y_train)
print("Predicting...")
y_pred = cc.predict(X_test)

# (printed label, metric function, keyword arguments) in display order.
# zero_division=0 yields the same scores as the default while avoiding the
# undefined-metric warning for labels that are never predicted.
early_fusion_metrics = [
    ("Accuracy:", accuracy_score, {}),
    ("F1 Score (micro):", f1_score, {"average": "micro", "zero_division": 0}),
    ("F1 Score (macro):", f1_score, {"average": "macro", "zero_division": 0}),
    ("F1 Score (weighted):", f1_score, {"average": "weighted", "zero_division": 0}),
    ("F1 Score (samples):", f1_score, {"average": "samples", "zero_division": 0}),
    ("Precision (micro):", precision_score, {"average": "micro", "zero_division": 0}),
    ("Precision (macro):", precision_score, {"average": "macro", "zero_division": 0}),
    ("Recall (micro):", recall_score, {"average": "micro", "zero_division": 0}),
    ("Recall (macro):", recall_score, {"average": "macro", "zero_division": 0}),
    ("Hamming Loss:", hamming_loss, {}),
]

print("Early fusion results:")
for metric_name, metric_fn, metric_kwargs in early_fusion_metrics:
    print(metric_name, metric_fn(y_test, y_pred, **metric_kwargs))


Training Classifier Chain with XGBoost...
Predicting...
Early fusion results:
Accuracy: 0.39721605809965704
F1 Score (micro): 0.8028785711189145
F1 Score (macro): 0.34865536549750364
F1 Score (weighted): 0.7662468504278253
F1 Score (samples): 0.8031263879912256
Precision (micro): 0.8256062767475035
Precision (macro): 0.7517065081064888
Recall (micro): 0.7813686608725002
Recall (macro): 0.31104936797961763
Hamming Loss: 0.057330542666935644


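As we can see, the classifier gives good overall results, but macro-averaged recall and F1 are very low. This is due to the heavily imbalanced dataset: macro averaging weights every label equally, so rarely occurring labels that the model seldom predicts pull the score down. The classification could be improved with hyperparameter optimization.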

# Late fusion

In [6]:
# Late fusion: train one classifier chain per modality and keep their
# per-label probabilities for the fusion cells below.

# Use a separate, freshly configured PCA per modality (same hyperparameters as
# `pca`). Refitting one shared object would leave it fitted on the image
# embeddings only and silently change what `pca` means for other cells.
pca_text = PCA(**pca.get_params())
pca_image = PCA(**pca.get_params())

X_text_train = pca_text.fit_transform(text_embeddings_train)
X_text_test = pca_text.transform(text_embeddings_test)

X_image_train = pca_image.fit_transform(image_embeddings_train)
X_image_test = pca_image.transform(image_embeddings_test)

# Base estimator configuration shared by both chains.
base_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=3,  # up-weights positive examples of each label
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cc_text = ClassifierChain(classifier=base_xgb, require_dense=[True, True])
cc_image = ClassifierChain(classifier=base_xgb, require_dense=[True, True])

print("Training Classifier Chains for Late Fusion...")
print("Training text model...")
cc_text.fit(X_text_train, y_train)
# Per-label probabilities on the test split (not thresholded yet).
y_text_pred = cc_text.predict_proba(X_text_test)

print("Training image model...")
cc_image.fit(X_image_train, y_train)
y_image_pred = cc_image.predict_proba(X_image_test)


Training Classifier Chains for Late Fusion...
Training text model...
Training image model...


## Average fusion

In [7]:
# Average predictions
y_pred_avg = (y_text_pred + y_image_pred) / 2
y_pred_avg = (y_pred_avg >= 0.5).astype(int) # Convert probabilities to binary predictions

print("Late Fusion Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_avg))
print("F1 Score (micro):", f1_score(y_test, y_pred_avg, average='micro'))
print("F1 Score (macro):", f1_score(y_test, y_pred_avg, average='macro'))
print("F1 Score (weighted):", f1_score(y_test, y_pred_avg, average='weighted'))
print("F1 Score (samples):", f1_score(y_test, y_pred_avg, average='samples'))
print("Precision (micro):", precision_score(y_test, y_pred_avg, average='micro'))
print("Precision (macro):", precision_score(y_test, y_pred_avg, average='macro'))
print("Recall (micro):", recall_score(y_test, y_pred_avg, average='micro'))
print("Recall (macro):", recall_score(y_test, y_pred_avg, average='macro'))
print("Hamming Loss:", hamming_loss(y_test, y_pred_avg))


Late Fusion Results:
Accuracy: 0.35626386927577164
F1 Score (micro): 0.7895647659427187
F1 Score (macro): 0.2596513395533108
F1 Score (weighted): 0.7364824816274427
F1 Score (samples): 0.784368476546205
Precision (micro): 0.8101749089940513
Precision (macro): 0.5736018816231122
Recall (micro): 0.7699772171124799
Recall (macro): 0.2504638873798563
Hamming Loss: 0.06132741577567077


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Weighted average fusion

In [11]:
# Weighted average fusion: the text model contributes more than the image
# model. The weights sum to 1, so no further normalisation is required.
TEXT_WEIGHT = 0.7
IMAGE_WEIGHT = 0.3

y_proba_weighted = TEXT_WEIGHT * y_text_pred + IMAGE_WEIGHT * y_image_pred
# Threshold at 0.5 to turn the fused probabilities into binary predictions.
y_pred_weighted = (y_proba_weighted >= 0.5).astype(int)

# (printed label, metric function, keyword arguments) in display order.
# zero_division=0 yields the same scores as the default while avoiding the
# undefined-metric warning for labels that are never predicted.
weighted_fusion_metrics = [
    ("Accuracy:", accuracy_score, {}),
    ("F1 Score (micro):", f1_score, {"average": "micro", "zero_division": 0}),
    ("F1 Score (macro):", f1_score, {"average": "macro", "zero_division": 0}),
    ("F1 Score (weighted):", f1_score, {"average": "weighted", "zero_division": 0}),
    ("F1 Score (samples):", f1_score, {"average": "samples", "zero_division": 0}),
    ("Precision (micro):", precision_score, {"average": "micro", "zero_division": 0}),
    ("Precision (macro):", precision_score, {"average": "macro", "zero_division": 0}),
    ("Recall (micro):", recall_score, {"average": "micro", "zero_division": 0}),
    ("Recall (macro):", recall_score, {"average": "macro", "zero_division": 0}),
    ("Hamming Loss:", hamming_loss, {}),
]

print("Late Fusion Results:")
for metric_name, metric_fn, metric_kwargs in weighted_fusion_metrics:
    print(metric_name, metric_fn(y_test, y_pred_weighted, **metric_kwargs))

Late Fusion Results:
Accuracy: 0.3650393383094614
F1 Score (micro): 0.7909075357112308
F1 Score (macro): 0.33255306894885983
F1 Score (weighted): 0.752012242959549
F1 Score (samples): 0.7874766061723899
Precision (micro): 0.801881883618073
Precision (macro): 0.7668664985357759
Recall (micro): 0.7802295164964982
Recall (macro): 0.29990970378739545
Hamming Loss: 0.0616426265886625


## Meta-classifier fusion

In [12]:
# Meta-classifier (stacking) fusion: a second classifier chain is trained on
# the concatenated per-label probabilities of the text and image chains.

# The test-set probabilities were already computed when the base chains were
# trained; reuse them instead of running both chains over the test set again.
preds_text_test = y_text_pred
preds_image_test = y_image_pred

# NOTE(review): the meta-features for training come from the same samples the
# base chains were fitted on, so they are more confident than the test-time
# probabilities. Out-of-fold predictions would give the meta-classifier a more
# realistic training signal - consider switching before reporting results.
train_base = np.hstack((cc_text.predict_proba(X_text_train).toarray(), cc_image.predict_proba(X_image_train).toarray()))
test_base = np.hstack((preds_text_test.toarray(), preds_image_test.toarray()))

meta_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=3,  # up-weights positive examples of each label
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
cc_meta = ClassifierChain(classifier=meta_xgb, require_dense=[True, True])
print("Training Meta-classifier...")
cc_meta.fit(train_base, y_train)
print("Predicting with Meta-classifier...")
y_meta_pred = cc_meta.predict(test_base)

# (printed label, metric function, keyword arguments) in display order.
# zero_division=0 yields the same scores as the default while avoiding the
# undefined-metric warning for labels that are never predicted.
meta_fusion_metrics = [
    ("Accuracy:", accuracy_score, {}),
    ("F1 Score (micro):", f1_score, {"average": "micro", "zero_division": 0}),
    ("F1 Score (macro):", f1_score, {"average": "macro", "zero_division": 0}),
    ("F1 Score (weighted):", f1_score, {"average": "weighted", "zero_division": 0}),
    ("F1 Score (samples):", f1_score, {"average": "samples", "zero_division": 0}),
    ("Precision (micro):", precision_score, {"average": "micro", "zero_division": 0}),
    ("Precision (macro):", precision_score, {"average": "macro", "zero_division": 0}),
    ("Recall (micro):", recall_score, {"average": "micro", "zero_division": 0}),
    ("Recall (macro):", recall_score, {"average": "macro", "zero_division": 0}),
    ("Hamming Loss:", hamming_loss, {}),
]

print("Meta-classifier Fusion Results:")
for metric_name, metric_fn, metric_kwargs in meta_fusion_metrics:
    print(metric_name, metric_fn(y_test, y_meta_pred, **metric_kwargs))

Training Meta-classifier...


: 