In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import datasets

In [2]:
# Load the preprocessed DatasetDict from disk and display its summary.
dataset = datasets.load_from_disk(
    '../../data/preprocessed',
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 272541
    })
})

In [3]:
# Convert both splits to pandas DataFrames, keeping the text and label columns.
train, test = (
    dataset[split].select_columns(['text', 'labels']).to_pandas()
    for split in ('train', 'test')
)
train

Unnamed: 0,text,labels
0,Stability of holonomicity over quasi-projectiv...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Construction of Lumps with nontrivial interact...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Complex Recurrent Spectral Network\n\n This p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Soft X-ray to Far Infrared luminosities ratio ...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,An improved Material Mask Overlay Strategy for...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
340670,Magnetic translation groups in an n-dimensiona...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
340671,Cosmic Web Dissection in Fuzzy Dark Matter Cos...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
340672,Reduced class groups grafting relative invaria...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
340673,An Algorithmic Approach to the Asynchronous Co...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [4]:
# Sentence encoder used below to embed both the train and the test texts.
encoder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [5]:
# Embed every training text; the result is indexed in the same order as `train`.
train_embeddings = encoder.encode(
    train['text'].to_list(),
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/10647 [00:00<?, ?it/s]

In [6]:
# Allocate one accumulator row per category plus a per-category sample counter.
# The number of categories is read from the first label vector instead of being
# hard-coded, so the arrays always match the label layout of the loaded data.
num_categories = len(train['labels'].iloc[0])
category_embeddings = np.zeros((num_categories, train_embeddings.shape[1]))
category_counts = np.zeros(num_categories)

In [7]:
# Sum the embeddings of all training samples that carry each category, and
# count those samples.
#
# The label vectors are stacked once into a boolean matrix (rows = samples,
# columns = categories; any non-zero entry counts as "present", matching a
# plain truthiness check), so the work is one masked sum per category rather
# than a Python-level loop over every (sample, category) pair.
#
# Results are assigned rather than added onto the existing values, so
# re-running this cell does not double-count.
label_matrix = np.stack(train['labels'].to_numpy()) != 0
for cat_id in range(label_matrix.shape[1]):
    members = label_matrix[:, cat_id]
    # Accumulate in float64 to match the dtype of the accumulator array.
    category_embeddings[cat_id] = train_embeddings[members].sum(axis=0, dtype=np.float64)
    category_counts[cat_id] = members.sum()

In [8]:
# Normalize (mean embedding)
# Divide each category's summed vector by its sample count. Categories with no
# samples are skipped, so their rows are left untouched and no 0/0 occurs.
# The mask is derived from `category_counts` itself rather than a hard-coded
# category count.
# NOTE: the division is in place; re-running this cell without recomputing the
# sums first divides the rows a second time.
has_samples = category_counts > 0
category_embeddings[has_samples] /= category_counts[has_samples, None]

In [9]:
# Embed the test texts with the same encoder, then compare every test
# embedding against every category vector.
test_embeddings = encoder.encode(
    test['text'].to_list(),
    batch_size=128,
    show_progress_bar=True,
)
similarities = cosine_similarity(  # shape: [num_samples, num_categories]
    test_embeddings,
    category_embeddings,
)

Batches:   0%|          | 0/2130 [00:00<?, ?it/s]

In [10]:
top_k = 3  # top sim categories

# Build a multi-hot prediction matrix: for each test sample, mark the `top_k`
# categories with the highest similarity. Every sample therefore receives
# exactly `top_k` predicted labels, regardless of how many true labels it has.
preds = np.zeros_like(similarities)
for sample_scores, sample_preds in zip(similarities, preds):
    # `sample_preds` is a view into `preds`, so the assignment writes through.
    topk_indices = np.argsort(sample_scores)[::-1][:top_k]
    sample_preds[topk_indices] = 1

print(classification_report(test['labels'].tolist(), preds))

              precision    recall  f1-score   support

           0       0.00      0.50      0.00         4
           1       0.00      0.71      0.00        34
           2       0.01      0.91      0.01       144
           3       0.00      1.00      0.00         3
           4       0.86      0.90      0.88     35222
           5       0.00      0.78      0.00         9
           6       0.00      0.50      0.00         2
           7       0.02      0.78      0.03       255
           8       0.00      0.76      0.00        29
           9       0.00      1.00      0.01        87
          10       0.00      0.86      0.00        21
          11       0.77      0.74      0.76     35981
          12       0.80      0.87      0.83     62467
          13       0.00      0.93      0.01        67
          14       0.06      0.97      0.11       904
          15       0.10      0.90      0.18      6052
          16       0.00      0.64      0.00        45
          17       0.34    