In [2]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
import random

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
import pandas as pd
import torch
from tqdm.contrib import tzip

from joblib import Parallel, delayed
from sklearn.metrics import f1_score, hamming_loss

In [4]:
df_train = pd.read_pickle("..\\data\\processed\\df_train.pkl")
df_test = pd.read_pickle("..\\data\\processed\\df_test.pkl")

text_embeddings_train = torch.load("..\\data\\processed\\rocov2_captions_embeddings_train.pt")
text_embeddings_test = torch.load("..\\data\\processed\\rocov2_captions_embeddings_test.pt")

image_embeddings_train = torch.load("..\\data\\processed\\rocov2_image_embeddings_train.pt")
image_embeddings_test = torch.load("..\\data\\processed\\rocov2_image_embeddings_test.pt")

# optiimzation for better memory management
text_embeddings_train = text_embeddings_train.detach().cpu().numpy().astype("float32")
text_embeddings_test = text_embeddings_test.detach().cpu().numpy().astype("float32")

image_embeddings_train = image_embeddings_train.detach().cpu().numpy().astype("float32")
image_embeddings_test = image_embeddings_test.detach().cpu().numpy().astype("float32")


# get merged embeddings
combined_embeddings_train = np.concatenate((text_embeddings_train, image_embeddings_train), axis=1)
combined_embeddings_test = np.concatenate((text_embeddings_test, image_embeddings_test), axis=1)

combined_embeddings_train = combined_embeddings_train.astype("float32")
combined_embeddings_test = combined_embeddings_test.astype("float32")

In [5]:
# initialize pca and mlb
pca = PCA(n_components=300, random_state=42)
mlb = MultiLabelBinarizer()

In [6]:
# Binarize labels
y_train = mlb.fit_transform(df_train['Semantic_vec'])
y_test = mlb.transform(df_test['Semantic_vec'])

In [7]:
# PCA for dimensionality reduction
X_train = pca.fit_transform(combined_embeddings_train)
X_test = pca.transform(combined_embeddings_test)

# Ensemble of classifier chains with random undersampling
This technique is based off this paper: https://arxiv.org/pdf/1807.11393 <br>
The algorithm shown was then optimized with parallelization of the ClassifierChains and batch testing of the test data.

In [16]:
def train_CCRU(X, y, CH):
    X = np.array(X)
    y = np.array(y)
    
    Xj = X.copy()
    h = {}
    
    for j_idx, j in enumerate(CH):  # Use j_idx for iteration, j for actual label index
        
        rus = RandomUnderSampler(random_state=42)
        X_us, y_us = rus.fit_resample(Xj, y[:, j])  # Use j (actual label index)
        
        
        lr = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
        lr.fit(X_us, y_us)  
        h[j] = lr
        
        # Add true label as feature for next classifier (except for the last one)
        if j_idx < len(CH) - 1:  # Use j_idx for comparison
            y_true = y[:, j]  # Use j (actual label index) to get the true labels
            Xj = sparse.hstack([X_us, y_true.reshape(-1, 1)]).tocsr()
    
    return h

In [17]:
def sample_with_replacement(X,y):
    sample_size = len(X)

    sampled_indices = random.choices(range(sample_size), k=sample_size)
    
    X_np = np.array(X)
    y_np = np.array(y)

    X_prime = X_np[sampled_indices]
    y_prime = y_np[sampled_indices]

    return X_prime, y_prime


def get_minority(y, pos):
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    
    class_counts = np.bincount(y[:, pos])
    return np.min(class_counts)

def get_cj(y, c, j, q):
    # Ensure y is a numpy array
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    
    # Calculate the sum of minority counts for all labels
    sum_of_minorities = 0
    for k in range(q):
        counts = np.bincount(y[:, k])
        sum_of_minorities += np.min(counts[np.nonzero(counts)])

    # Calculate the minority count for the specific label j
    counts_j = np.bincount(y[:, j])
    minority_j = np.min(counts_j[np.nonzero(counts_j)])
    
    # Avoid division by zero
    if minority_j == 0:
        return 0.0
        
    return (c * sum_of_minorities) / (q * minority_j)



def train_ECCRU_optimized(X, y, labels_num, standard_chains_num, coefficient):
    # 1. Pre-generate all chain permutations and bootstrap indices.
    #    This is done once, upfront.

    c_omega_max = int(standard_chains_num * coefficient)

    classifiers_for_each_label = [min(get_cj(y, standard_chains_num, j, labels_num), c_omega_max) for j in range(labels_num)]

    permutations = []
    for i in range(c_omega_max):
        S = [j for j in range(labels_num) if classifiers_for_each_label[j] > 0]
        if len(S) < 2:
            break
        CH = np.random.permutation(S).tolist()
        permutations.append(CH)
        for j in S:
            classifiers_for_each_label[j] -= 1

    # 2. Define a training function that takes the full dataset and bootstrap indices.
    #    This allows the function to be called in parallel.
    def train_single_chain_optimized(X_full, y_full, sampled_indices, CH):
        X_sampled = X_full[sampled_indices]
        y_sampled = y_full[sampled_indices]
        return train_CCRU(X=X_sampled, y=y_sampled,CH=CH)

    # 3. Use Parallel to execute the training. The large arrays (X_full, y_full)
    #    are passed to each process, but Python's multiprocessing will often
    #    use shared memory for this, which is more efficient. The key is to
    #    pass only the indices to `train_single_chain_optimized`.

    # Pre-generate bootstrap indices for each chain
    sample_size = len(X)
    bootstrap_indices = [np.random.choice(sample_size, size=sample_size, replace=True) for _ in range(len(permutations))]

    print(f"Numbers of classifier chains: {len(permutations)}")

    H = Parallel(n_jobs=14, verbose=10)(
        delayed(train_single_chain_optimized)(X, y, indices, CH)
        for indices, CH in tzip(bootstrap_indices, permutations)
    )

    counter = len(H)
    return H, permutations, counter      

In [10]:
def test_ECCRU_optimized(X, labels_num, eccru, permutations, cc_num=None, threshold=0.5):
    X_np = np.array(X)
    
    # Sanity Check: Confirm the input is a 2D array
    if X_np.ndim != 2:
        raise ValueError(f"Input to test_ECCRU_optimized must be a 2D array, but got shape {X_np.shape}")
    
    num_samples = X_np.shape[0]
    
    sum_of_probabilities_for_batch = np.zeros((num_samples, labels_num), dtype=float)
    label_prediction_counts = np.zeros(labels_num, dtype=int)

    num_chains_to_use = len(eccru) if cc_num is None else cc_num
    if num_chains_to_use == 0:
        return np.zeros((num_samples, labels_num), dtype=int)

    for i in range(num_chains_to_use):
        if i >= len(eccru):
            break

        chain_h_dict = eccru[i]
        CH_sequence = permutations[i]
        current_X_for_chain = X_np.copy()

        for j_idx_in_chain in range(len(CH_sequence)):
            actual_label_index = CH_sequence[j_idx_in_chain]  # This is the actual label index
            
            # Use j_idx_in_chain to get the correct model from the chain
            model_for_label = chain_h_dict[j_idx_in_chain]
            
            predicted_probs_for_label_all_samples = model_for_label.predict_proba(current_X_for_chain)[:, 1]

            # Use actual_label_index to accumulate predictions for the correct label
            sum_of_probabilities_for_batch[:, actual_label_index] += predicted_probs_for_label_all_samples
            label_prediction_counts[actual_label_index] += 1
            
            binary_predictions_for_label_all_samples = (predicted_probs_for_label_all_samples >= 0.5).astype(int)

            # Add predictions as features for next classifier (except for the last one)
            if j_idx_in_chain < len(CH_sequence) - 1:
                current_X_for_chain = np.hstack([current_X_for_chain, binary_predictions_for_label_all_samples.reshape(-1, 1)])

    final_relevance_degrees_batch = np.divide(
        sum_of_probabilities_for_batch,
        label_prediction_counts,
        out=np.zeros_like(sum_of_probabilities_for_batch, dtype=float),
        where=label_prediction_counts != 0
    )

    final_binary_predictions = (final_relevance_degrees_batch >= threshold).astype(int)
    return final_binary_predictions

def test_on_chunk(X_chunk, labels_num, eccru, permutations, cc_num):
    """
    Wrapper to ensure the correct arguments are passed.
    """
    return test_ECCRU_optimized(X_chunk, labels_num, eccru, permutations, cc_num)


In [11]:
def create_balanced_chunks(X, n_jobs):
    """
    Crea chunks bilanciati che non perdono campioni
    """
    n_samples = len(X)
    chunk_size = n_samples // n_jobs
    chunks = []
    
    start_idx = 0
    for i in range(n_jobs):
        if i == n_jobs - 1:  # Ultimo chunk prende tutti i campioni rimanenti
            end_idx = n_samples
        else:
            end_idx = start_idx + chunk_size
        
        chunks.append(X[start_idx:end_idx])
        start_idx = end_idx
    
    return chunks

def safe_parallel_processing(X_test, labels_num, eccru, permutations, counter, n_jobs=None):
    """
    Processing parallelo che garantisce di non perdere campioni
    """
    if n_jobs is None:
        n_jobs = os.cpu_count()
    
    n_samples = len(X_test)
    print(f"Campioni totali da processare: {n_samples}")
    
    # Crea chunks bilanciati
    chunks = create_balanced_chunks(X_test, n_jobs)
    
    print(f"Creati {len(chunks)} chunks con dimensioni: {[len(c) for c in chunks]}")
    
    # Processing parallelo
    y_preds_chunks = Parallel(n_jobs=n_jobs, backend="loky", verbose=1)(
        delayed(test_on_chunk)(chunk, labels_num, eccru, permutations, counter)
        for chunk in chunks
    )
    
    # Concatena i risultati
    y_preds = np.concatenate(y_preds_chunks, axis=0)
    
    print(f"Risultato finale: {y_preds.shape}")
    print(f"Match con input: {len(y_preds) == n_samples}")
    
    return y_preds

def fix_existing_predictions(y_preds, target_length):
    """
    Corregge predizioni esistenti per matchare la lunghezza target
    """
    current_length = len(y_preds)
    print(f"Lunghezza attuale predizioni: {current_length}")
    print(f"Lunghezza target: {target_length}")
    
    if current_length > target_length:
        print(f"Troncando {current_length - target_length} campioni extra")
        return y_preds[:target_length]
    elif current_length < target_length:
        print(f"ERRORE: Mancano {target_length - current_length} campioni!")
        print("Dovrai rifare il processing.")
        return None
    else:
        print("Le dimensioni sono già corrette")
        return y_preds

def complete_eccru_evaluation(X_test, y_test: None, df_test: None, mlb, eccru, permutations, counter):
    """
    Valutazione completa ECCRU con controlli di sicurezza
    """
    print("=== INIZIO VALUTAZIONE ECCRU ===")
    
    # 1. Verifica dimensioni iniziali
    # print(f"X_test shape: {X_test.shape}")
    # print(f"df_test shape: {df_test.shape}")
    
    # 2. Crea y_test di riferimento
    if y_test is None:
        if df_test is not None and mlb is not None:
            y_test = mlb.transform(df_test["Semantic_vec"])
            print(f"y_test shape: {y_test.shape}")
        else:
            return None
    
    # 3. Processing sicuro
    y_preds = safe_parallel_processing(X_test, 16, eccru, permutations, counter)
    
    # 4. Verifica finale delle dimensioni
    if len(y_preds) != len(y_test):
        print(f"ERRORE: Mismatch dimensioni!")
        print(f"y_preds: {len(y_preds)}, y_test: {len(y_test)}")
        
        # Tenta correzione
        y_preds = fix_existing_predictions(y_preds, len(y_test))
        if y_preds is None:
            return None
    
    # 5. Calcola metriche
    y_preds_binary = (np.array(y_preds) >= 0.5).astype(int)
    
    # f1_micro = f1_score(y_test, y_preds_binary, average='micro')
    # f1_macro = f1_score(y_test, y_preds_binary, average='macro')
    # f1_samples = f1_score(y_test, y_preds_binary, average="samples")
    # hamming = hamming_loss(y_test, y_preds_binary)
    
    # 6. Risultati
    # print("\n=== RISULTATI FINALI ===")
    # print(f"F1 micro: {f1_micro:.4f}")
    # print(f"F1 macro: {f1_macro:.4f}")
    # print(f"F1 samples: {f1_samples:.4f}")
    # print(f"Hamming loss: {hamming:.4f}")
    
    # return {
    #     'y_preds': y_preds_binary,
    #     'y_test': y_test,
    #     'metrics': {
    #         'f1_micro': f1_micro,
    #         'f1_macro': f1_macro, 
    #         'f1_samples': f1_samples,
    #         'hamming_loss': hamming
    #     }
    # }
    return y_preds_binary

In [None]:
ecc, permutations, counter = train_ECCRU_optimized(X_train,y_train, 16, 10, 10)
results = complete_eccru_evaluation(X_test, y_test=y_test, df_test=None, mlb=None, eccru=ecc, permutations=permutations, counter=counter)

print(f"f1-micro: {f1_score(y_test,results, average="micro")}\nf1-macro: {f1_score(y_test,results, average="macro")}\nf1-samples: {f1_score(y_test,results, average="samples")}\nhamming: {hamming_loss(y_test,results)}")

Numbers of classifier chains: 100


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.


  0%|          | 0/100 [00:00<?, ?it/s]

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 59898, expected 250.

: 