In [None]:
import pandas as pd
import torch
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report,  precision_recall_fscore_support
import time

In [24]:
df = pd.read_pickle("..\\data\\interim\\train_interim_filtered_singlelabel_selectedcuis.pkl")

text_embedding = torch.load("..\\data\\interim\\rocov2_captions_embeddings_train_filtered_vec.pt")
image_embedding = torch.load("..\\data\\interim\\rocov2_image_embeddings_train_filtered_vec.pt") 

text_embedding = text_embedding.detach().cpu().numpy().astype("float32")
image_embedding = image_embedding.detach().cpu().numpy().astype("float32")

combined_embedding = np.concatenate((text_embedding, image_embedding), axis=1)
combined_embedding = combined_embedding.astype("float32")

In [25]:
# initialize pca and mlb
pca = PCA(n_components=300, random_state=42)
mlb = MultiLabelBinarizer()

# Early-fusion

In [None]:
# PCA for dimensionality reduction
# try without pca first
encoder = LabelEncoder()
smote = SMOTE(random_state=42, )
X = combined_embedding
y = encoder.fit_transform(df['CUI']) # since single-label now

resample = smote.fit_resample(X, y)
# X, y = resample


classes = np.unique(y)
class_weights = compute_class_weight("balanced", classes=classes, y=y)
weight_dict = {c: w for c, w in zip(classes, class_weights)}

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_prec, all_rec, all_f1, all_support = [], [], [], []
for train_idx, test_idx in k_fold.split(X, y):
    # X_train, X_test = X[train_idx], X[test_idx]
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    param_grid = {
        "n_estimators": [200, 400, 600],
        "learning_rate": [0.05, 0.1],
        "max_depth": [4, 6],
        "min_child_weight": [1, 3, 5],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "gamma": [0, 0.5],
    }

 


    base_xgb = XGBClassifier(
        objective="multi:softprob",
        num_class= len(np.unique(y)),  # number of unique labels
        eval_metric="logloss",  
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    # xgb = GridSearchCV(base_xgb, param_grid, scoring='f1-macro', cv=3, verbose=1, n_jobs=10)
    print(f"Fold {len(all_prec) + 1}")
    time_start = time.time()
    print("\tTraining Classifier Chain with XGBoost...")
    sample_weights = np.array([weight_dict[label] for label in y_train])
    base_xgb.fit(X_train, y_train, sample_weight=sample_weights)
    print("\tPredicting...")
    y_pred = base_xgb.predict(X_test)
    elapsed = time.time() - time_start
    print(f"\tTime taken: {elapsed:.2f} seconds")
    

    print(classification_report(y_test, y_pred, target_names=[str(c) for c in base_xgb.classes_]))
        
    
    prec, rec, f1, support = precision_recall_fscore_support(
        y_test, y_pred, average=None, labels=np.unique(y)
    )
    
    all_prec.append(prec)
    all_rec.append(rec)
    all_f1.append(f1)
    all_support.append(support)

report_df = pd.DataFrame({
    "class": np.unique(y),
    "precision_mean": np.mean(all_prec, axis=0),
    "precision_std": np.std(all_prec, axis=0),
    "recall_mean": np.mean(all_rec, axis=0),
    "recall_std": np.std(all_rec, axis=0),
    "f1_mean": np.mean(all_f1, axis=0),
    "f1_std": np.std(all_f1, axis=0),
    "support_mean": np.mean(all_support, axis=0),  # utile per contare i campioni mediamente visti
})

report_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>class</th>
      <th>precision_mean</th>
      <th>precision_std</th>
      <th>recall_mean</th>
      <th>recall_std</th>
      <th>f1_mean</th>
      <th>f1_std</th>
      <th>support_mean</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>0.475681</td>
      <td>0.068852</td>
      <td>0.380168</td>
      <td>0.032514</td>
      <td>0.421913</td>
      <td>0.046148</td>
      <td>34.2</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>0.500282</td>
      <td>0.043770</td>
      <td>0.577778</td>
      <td>0.073030</td>
      <td>0.535885</td>
      <td>0.056341</td>
      <td>45.0</td>
    </tr>
    <tr>
      <th>2</th>
      <td>2</td>
      <td>0.852099</td>
      <td>0.034899</td>
      <td>0.859930</td>
      <td>0.041701</td>
      <td>0.855631</td>
      <td>0.034486</td>
      <td>41.4</td>
    </tr>
    <tr>
      <th>3</th>
      <td>3</td>
      <td>0.791366</td>
      <td>0.068073</td>
      <td>0.666912</td>
      <td>0.078494</td>
      <td>0.721246</td>
      <td>0.062430</td>
      <td>16.8</td>
    </tr>
    <tr>
      <th>4</th>
      <td>4</td>
      <td>0.730129</td>
      <td>0.066258</td>
      <td>0.618182</td>
      <td>0.093597</td>
      <td>0.667518</td>
      <td>0.076724</td>
      <td>22.0</td>
    </tr>
    <tr>
      <th>5</th>
      <td>5</td>
      <td>0.946810</td>
      <td>0.015883</td>
      <td>0.904637</td>
      <td>0.033890</td>
      <td>0.924986</td>
      <td>0.022456</td>
      <td>31.4</td>
    </tr>
    <tr>
      <th>6</th>
      <td>6</td>
      <td>0.865439</td>
      <td>0.072591</td>
      <td>0.799465</td>
      <td>0.087364</td>
      <td>0.827187</td>
      <td>0.055474</td>
      <td>33.8</td>
    </tr>
    <tr>
      <th>7</th>
      <td>7</td>
      <td>0.942377</td>
      <td>0.016560</td>
      <td>0.950725</td>
      <td>0.045462</td>
      <td>0.946130</td>
      <td>0.028216</td>
      <td>69.0</td>
    </tr>
    <tr>
      <th>8</th>
      <td>8</td>
      <td>0.832184</td>
      <td>0.084595</td>
      <td>0.800000</td>
      <td>0.105497</td>
      <td>0.815517</td>
      <td>0.095076</td>
      <td>29.0</td>
    </tr>
    <tr>
      <th>9</th>
      <td>9</td>
      <td>0.853164</td>
      <td>0.065612</td>
      <td>0.918326</td>
      <td>0.037751</td>
      <td>0.883043</td>
      <td>0.041662</td>
      <td>51.4</td>
    </tr>
    <tr>
      <th>10</th>
      <td>10</td>
      <td>0.787216</td>
      <td>0.026604</td>
      <td>0.828000</td>
      <td>0.057411</td>
      <td>0.806293</td>
      <td>0.034703</td>
      <td>50.0</td>
    </tr>
    <tr>
      <th>11</th>
      <td>11</td>
      <td>0.748315</td>
      <td>0.086560</td>
      <td>0.840693</td>
      <td>0.025559</td>
      <td>0.788585</td>
      <td>0.043705</td>
      <td>21.4</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
class metrics:
    def __init__(self):
        self.precision = []
        self.recall = []
        self.f1 = []
        self.support = []

    def compute(self, y_true, y_pred):
        prec, rec, f1, support = precision_recall_fscore_support(
            y_true, y_pred, average=None, labels=np.unique(y)
        )
        
        self.precision.append(prec)
        self.recall.append(rec)
        self.f1.append(f1)
        self.support.append(support)
    
    def to_dataframe(self):
        return pd.DataFrame({
            "class": np.unique(y),
            "precision_mean": np.mean(self.precision, axis=0),
            "precision_std": np.std(self.precision, axis=0),
            "recall_mean": np.mean(self.recall, axis=0),
            "recall_std": np.std(self.recall, axis=0),
            "f1_mean": np.mean(self.f1, axis=0),
            "f1_std": np.std(self.f1, axis=0),
            "support_mean": np.mean(self.support, axis=0),  # utile per contare i campioni mediamente visti
        })
        

# Late fusion

In [29]:
X_text = text_embedding
X_image = image_embedding

encoder = LabelEncoder()
y = encoder.fit_transform(df['CUI']) 


classes = np.unique(y)
class_weights = compute_class_weight("balanced", classes=classes, y=y)
weight_dict = {c: w for c, w in zip(classes, class_weights)}

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

txt_metrics = metrics()
img_metrics = metrics()

fusion_avg_metrics = metrics()
fusion_weighted_avg_metrics = metrics()
fusion_meta_classifier_metrics = metrics()

pca = PCA(n_components=300, random_state=42)

time_start = time.time()

for train_idx, test_idx in k_fold.split(X_text, y):
    X_text_train, X_text_test = pca.fit_transform(X_text[train_idx]), pca.transform(X_text[test_idx])
    X_image_train, X_image_test = pca.fit_transform(X_image[train_idx]), pca.transform(X_image[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]
    
    base_xgb_txt = XGBClassifier(
        objective="multi:softprob",
        num_class= len(np.unique(y)),  # number of unique labels
        eval_metric="logloss",  
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    base_xgb_img = XGBClassifier(
        objective="multi:softprob",
        num_class= len(np.unique(y)),  # number of unique labels
        eval_metric="logloss",  
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    
    print(f"Fold {len(txt_metrics.precision) + 1}")
    
    sample_weights = np.array([weight_dict[label] for label in y_train])
    
    print("\tTraining Text XGBoost...")
    time_start = time.time()
    base_xgb_txt.fit(X_text_train, y_train, sample_weight=sample_weights)
    
    print("\tTraining Image XGBoost...")
    base_xgb_img.fit(X_image_train, y_train, sample_weight=sample_weights)
    
    print("\tPredicting...")
    y_pred_txt_proba = base_xgb_txt.predict_proba(X_text_test)
    y_pred_img_proba = base_xgb_img.predict_proba(X_image_test)
    
    y_pred_txt = np.argmax(y_pred_txt_proba, axis=1)
    y_pred_img = np.argmax(y_pred_img_proba, axis=1)
    

    elapsed = time.time() - time_start    
    print(f"\tTime taken: {elapsed:.2f} seconds")
    
    txt_metrics.compute(y_test, y_pred_txt)
    img_metrics.compute(y_test, y_pred_img)
    
    
    # Fusion method 1: average the probabilities
    y_pred_fusion = (y_pred_txt_proba + y_pred_img_proba) / 2
    y_pred_fusion = np.argmax(y_pred_fusion, axis=1)
    
    fusion_avg_metrics.compute(y_test, y_pred_fusion)
    
    # Fusion method 2: weighted average the probabilities based on individual model performance
    weight_txt = 0.6
    weight_img = 0.4
    
    y_pred_fusion_weighted = (weight_txt * y_pred_txt_proba + weight_img * y_pred_img_proba) / (weight_txt + weight_img)
    y_pred_fusion_weighted = np.argmax(y_pred_fusion_weighted, axis=1)
    
    fusion_weighted_avg_metrics.compute(y_test, y_pred_fusion_weighted)
    
    # Fusion method 3: meta-classifier
    X_meta_train = np.hstack((base_xgb_txt.predict_proba(X_text_train), base_xgb_img.predict_proba(X_image_train)))
    X_meta_test = np.hstack((y_pred_txt_proba, y_pred_img_proba))
    
    meta_lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    
    time_start = time.time()
    
    print("\tTraining Meta-classifier XGBoost...")
    meta_lr.fit(X_meta_train, y_train, sample_weight=sample_weights)
    
    print("\tPredicting with Meta-classifier...")
    y_pred_meta_proba = meta_lr.predict_proba(X_meta_test)
    y_pred_meta = np.argmax(y_pred_meta_proba, axis=1)
    
    elapsed = time.time() - time_start
    print(f"\tTime taken: {elapsed:.2f} seconds")
    
    
    fusion_meta_classifier_metrics.compute(y_test, y_pred_meta)
    

print("Text Model Performance:")
display(txt_metrics.to_dataframe())

print("Image Model Performance:")
display(img_metrics.to_dataframe())

print("Fusion Average Model Performance:")
display(fusion_avg_metrics.to_dataframe())

print("Fusion Weighted Average Model Performance:")
display(fusion_weighted_avg_metrics.to_dataframe())

print("Fusion Meta-classifier Model Performance:")
display(fusion_meta_classifier_metrics.to_dataframe())

Fold 1
	Training Text XGBoost...
	Training Image XGBoost...
	Predicting...
	Time taken: 41.89 seconds
	Training Meta-classifier XGBoost...
	Predicting with Meta-classifier...
	Time taken: 0.01 seconds
Fold 2
	Training Text XGBoost...
	Training Image XGBoost...
	Predicting...
	Time taken: 47.90 seconds
	Training Meta-classifier XGBoost...
	Predicting with Meta-classifier...
	Time taken: 0.01 seconds
Fold 3
	Training Text XGBoost...
	Training Image XGBoost...
	Predicting...
	Time taken: 51.00 seconds
	Training Meta-classifier XGBoost...
	Predicting with Meta-classifier...
	Time taken: 0.01 seconds
Fold 4
	Training Text XGBoost...
	Training Image XGBoost...
	Predicting...
	Time taken: 51.67 seconds
	Training Meta-classifier XGBoost...
	Predicting with Meta-classifier...
	Time taken: 0.01 seconds
Fold 5
	Training Text XGBoost...
	Training Image XGBoost...
	Predicting...
	Time taken: 51.53 seconds
	Training Meta-classifier XGBoost...
	Predicting with Meta-classifier...
	Time taken: 0.01 sec

Unnamed: 0,class,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,support_mean
0,0,0.458571,0.125317,0.315798,0.077541,0.371126,0.092687,34.2
1,1,0.540452,0.020745,0.6,0.062854,0.567081,0.032994,45.0
2,2,0.837279,0.050221,0.855168,0.026032,0.845626,0.034779,41.4
3,3,0.893613,0.075661,0.677941,0.06315,0.768805,0.055812,16.8
4,4,0.785792,0.119035,0.554545,0.078203,0.640936,0.045987,22.0
5,5,0.91428,0.051493,0.911089,0.022706,0.911332,0.019271,31.4
6,6,0.881932,0.052637,0.775223,0.047391,0.823844,0.038829,33.8
7,7,0.91921,0.034939,0.947826,0.025269,0.933122,0.027777,69.0
8,8,0.804054,0.038564,0.806897,0.103678,0.804021,0.07081,29.0
9,9,0.812641,0.052794,0.91825,0.033726,0.861938,0.04303,51.4


Image Model Performance:


Unnamed: 0,class,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,support_mean
0,0,0.289015,0.115537,0.158655,0.092497,0.196482,0.097157,34.2
1,1,0.312446,0.039983,0.324444,0.022662,0.316883,0.0241,45.0
2,2,0.422983,0.058449,0.439489,0.033793,0.430493,0.045092,41.4
3,3,0.526429,0.108556,0.558088,0.080022,0.538628,0.086122,16.8
4,4,0.418465,0.138,0.172727,0.034015,0.239424,0.043883,22.0
5,5,0.498587,0.100378,0.413508,0.076364,0.451208,0.085387,31.4
6,6,0.330736,0.095488,0.129947,0.034681,0.186079,0.049888,33.8
7,7,0.627649,0.034233,0.8,0.06176,0.702661,0.039954,69.0
8,8,0.152574,0.089667,0.117241,0.071004,0.132224,0.079104,29.0
9,9,0.452362,0.029119,0.642006,0.038414,0.530607,0.032129,51.4


Fusion Average Model Performance:


Unnamed: 0,class,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,support_mean
0,0,0.44119,0.124503,0.239832,0.065679,0.309884,0.084117,34.2
1,1,0.532671,0.031968,0.582222,0.055154,0.555534,0.038197,45.0
2,2,0.822862,0.058311,0.874564,0.022795,0.847215,0.039031,41.4
3,3,0.892647,0.108026,0.797059,0.049566,0.838289,0.056535,16.8
4,4,0.809211,0.126473,0.5,0.099586,0.610796,0.088942,22.0
5,5,0.848479,0.017021,0.923992,0.041951,0.883866,0.017538,31.4
6,6,0.939501,0.027643,0.727807,0.050517,0.819226,0.036029,33.8
7,7,0.843425,0.0448,0.947826,0.019659,0.891648,0.023096,69.0
8,8,0.84437,0.054847,0.703448,0.094056,0.764225,0.06701,29.0
9,9,0.726405,0.05972,0.910483,0.040253,0.806809,0.044332,51.4


Fusion Weighted Average Model Performance:


Unnamed: 0,class,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,support_mean
0,0,0.462977,0.092079,0.257479,0.068609,0.329172,0.075462,34.2
1,1,0.544963,0.009729,0.608889,0.06532,0.573984,0.034674,45.0
2,2,0.849172,0.055325,0.893728,0.024747,0.869711,0.031048,41.4
3,3,0.894566,0.067988,0.785294,0.061414,0.834481,0.049474,16.8
4,4,0.810836,0.117242,0.527273,0.106017,0.629137,0.080456,22.0
5,5,0.881105,0.017804,0.942944,0.023072,0.910857,0.017609,31.4
6,6,0.914055,0.041509,0.751515,0.057414,0.824001,0.046938,33.8
7,7,0.877761,0.026318,0.947826,0.0284,0.910892,0.01582,69.0
8,8,0.838342,0.039487,0.751724,0.100888,0.789586,0.065022,29.0
9,9,0.772005,0.054537,0.922323,0.042347,0.839821,0.045124,51.4


Fusion Meta-classifier Model Performance:


Unnamed: 0,class,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,support_mean
0,0,0.44119,0.124503,0.239832,0.065679,0.309884,0.084117,34.2
1,1,0.528336,0.033014,0.582222,0.055154,0.553223,0.039044,45.0
2,2,0.822862,0.058311,0.874564,0.022795,0.847215,0.039031,41.4
3,3,0.892647,0.108026,0.797059,0.049566,0.838289,0.056535,16.8
4,4,0.819505,0.129283,0.490909,0.109091,0.605113,0.098883,22.0
5,5,0.853353,0.013808,0.923992,0.041951,0.88657,0.017588,31.4
6,6,0.939747,0.027868,0.73369,0.058911,0.82281,0.041477,33.8
7,7,0.843425,0.0448,0.947826,0.019659,0.891648,0.023096,69.0
8,8,0.84437,0.054847,0.703448,0.094056,0.764225,0.06701,29.0
9,9,0.728689,0.060502,0.910483,0.040253,0.808229,0.045174,51.4
