# ACM Abstract Topic Classifier
TF‑IDF → LDA topic features + LinearSVC (One‑vs‑Rest)

In [7]:
# --- Imports & environment ---
import csv
import os
import pickle
import subprocess
from pathlib import Path

import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score, hamming_loss)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Running on:", device)

# Make sure the NLTK stop-word corpus is available, then keep the English list
# (it is passed to the TF-IDF vectorizer later in the notebook).
nltk.download('stopwords')
stop_words = stopwords.words('english')

Running on: cuda


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/konstanty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# --- Helper Functions ---
def flatten_if_single(x):
    """Unwrap a one-element list.

    Returns ``x[0]`` when ``x`` is a ``list`` that holds exactly one item.
    Every other value (non-lists such as strings or tuples, empty lists,
    longer lists) is handed back unchanged.
    """
    is_singleton_list = isinstance(x, list) and len(x) == 1
    return x[0] if is_singleton_list else x

def load_or_train_mlb(train_labels, path=Path('models/LDA/mlb_model.pkl'), all_labels=None):
    """Return a fitted ``MultiLabelBinarizer``, reusing the copy cached at ``path``.

    Parameters
    ----------
    train_labels : iterable of iterables of labels
        Per-document label collections used to fit the binarizer (and to
        derive the class list when ``all_labels`` is not given).
    path : pathlib.Path or str
        Location of the joblib cache. Missing parent directories are created
        before the binarizer is written.
    all_labels : sequence of labels, optional
        Explicit class list. When omitted, the sorted set of labels found in
        ``train_labels`` is used.

    Returns
    -------
    MultiLabelBinarizer
        The cached binarizer if one exists and is compatible with
        ``all_labels``; otherwise a newly fitted one (which is also saved).
    """
    path = Path(path)  # tolerate plain string paths as well

    if path.exists():
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code - only point `path` at caches you produced yourself.
        mlb = joblib.load(path)
        # A cache built for a different label set would silently yield
        # misaligned label columns, so it is only reused when the classes match.
        if all_labels is None or list(mlb.classes_) == list(all_labels):
            print('✓ MLB loaded')
            return mlb
        print('… cached MLB has a different label set; retraining')

    print('… training MultiLabelBinarizer')
    if all_labels is None:
        all_labels = sorted({lbl for sub in train_labels for lbl in sub})
    mlb = MultiLabelBinarizer(classes=all_labels)
    mlb.fit(train_labels)

    # Create the cache directory first; joblib.dump does not do it for us.
    path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(mlb, path)
    return mlb

def notify():
    """Emit a short beep to signal that a long-running cell has finished.

    Runs the external ``play`` command (presumably the SoX player) asking for
    a 0.3 s, 1000 Hz sine tone on ALSA. This is best-effort: when the
    executable is not installed the call is skipped silently.
    """
    beep_cmd = ['play', '-nq', '-t', 'alsa', 'synth', '0.3', 'sine', '1000']
    try:
        subprocess.run(beep_cmd)
    except FileNotFoundError:
        # No `play` binary on this machine - nothing to do.
        return


In [9]:
# --- Load data ---
# Both TSV files live in ./data relative to the directory the kernel runs in.
notebook_dir = os.getcwd()
data_dir = os.path.join(notebook_dir, 'data')
test_path  = os.path.join(data_dir, 'DM2023_test_docs.tsv')
train_path = os.path.join(data_dir, 'DM2023_training_docs_and_labels.tsv')

# The files are read as header-less, tab-separated, latin1 text; the same
# three column names are applied to both frames.
column_names = ['Textfile', 'Text', 'Topics']

test = pd.read_csv(test_path, sep='\t', encoding='latin1',
                   header=None, names=column_names)
train_full = pd.read_csv(train_path, sep='\t', encoding='latin1',
                         header=None, names=column_names)

# Turn the comma-separated topic string of every training row into a list of
# labels. flatten_if_single only unwraps one-element lists, so plain string
# cells pass through it unchanged before being split.
raw_topics = train_full['Topics'].apply(flatten_if_single)
train_full['Topics'] = raw_topics.str.split(r'\s*,\s*')


In [10]:
# --- Train / Validation split ---
# Deterministic 80/20 split in file order: the first `split` rows are used for
# training and the remaining rows for validation. Both parts get a fresh
# 0-based index.
split = int(len(train_full) * 0.8)
train, val = (part.reset_index(drop=True)
              for part in (train_full.iloc[:split], train_full.iloc[split:]))
print('Train:', train.shape, ' Val:', val.shape)

Train: (80000, 3)  Val: (20000, 3)


In [11]:
# --- Vectorizer ---
# Raw texts of each split, as plain Python lists.
train_texts = train["Text"].tolist()
val_texts   = val["Text"].tolist()
test_texts  = test["Text"].tolist()

# TF-IDF vectorisation. A previously fitted vectorizer is reused when its
# cache file exists; otherwise a new one is fitted on the training texts and
# written to the cache so that later runs can take the fast path.
vec_path = Path('models/LDA/vectorizer.pkl')

if vec_path.exists():
    print('\n\tFound vectorizer model!')
    # NOTE: joblib.load unpickles the file - only load caches you created.
    vectorizer = joblib.load(vec_path)
    X_train_bow = vectorizer.transform(train_texts)
else:
    vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words)
    X_train_bow = vectorizer.fit_transform(train_texts)
    # Persist the fitted vectorizer; without this the branch above is never hit.
    vec_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(vectorizer, vec_path)

# Validation and test texts are only transformed, never used for fitting.
X_val_bow   = vectorizer.transform(val_texts)
X_test_bow  = vectorizer.transform(test_texts)

In [12]:
# --- LDA ---
# Cache locations: the fitted LDA model and the topic features of each split.
lda_model_path = Path('models/LDA/lda.pkl')
lda_paths = [Path('models/LDA/X_train_lda.npy'),
             Path('models/LDA/X_val_lda.npy'),
             Path('models/LDA/X_test_lda.npy')]
lda_model_path.parent.mkdir(parents=True, exist_ok=True)

# Step 1 - obtain a fitted LDA model. A cached model is used as-is (it is NOT
# re-fitted); a new model is fitted exactly once and then saved.
if lda_model_path.exists():
    print('\n\tFound LDA model!')
    # NOTE: joblib.load unpickles the file - only load caches you created.
    lda = joblib.load(lda_model_path)
    lda_refitted = False
else:
    print("\n\tWe need to train LDA first...")
    lda = LatentDirichletAllocation(n_components=20,
                                    max_iter=10,
                                    learning_method='batch',
                                    random_state=42)
    lda.fit(X_train_bow)
    joblib.dump(lda, lda_model_path)
    lda_refitted = True

# Step 2 - obtain the per-document topic distributions. Cached arrays are only
# trusted when the model itself came from the cache; after a re-fit any
# existing .npy files belong to an older model and are regenerated.
if not lda_refitted and all(p.exists() for p in lda_paths):
    X_train_lda = np.load(lda_paths[0])
    X_val_lda   = np.load(lda_paths[1])
    X_test_lda  = np.load(lda_paths[2])
else:
    X_train_lda = lda.transform(X_train_bow)
    X_val_lda   = lda.transform(X_val_bow)
    X_test_lda  = lda.transform(X_test_bow)

    np.save(lda_paths[0], X_train_lda)
    np.save(lda_paths[1], X_val_lda)
    np.save(lda_paths[2], X_test_lda)

# Guard against stale caches produced from a different split or data set.
assert X_train_lda.shape[0] == X_train_bow.shape[0], "cached train LDA features do not match the training split"
assert X_val_lda.shape[0] == X_val_bow.shape[0], "cached validation LDA features do not match the validation split"
assert X_test_lda.shape[0] == X_test_bow.shape[0], "cached test LDA features do not match the test set"

notify()


	We need to train LDA first...


# Showing the top 10 words of each topic

In [13]:
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()

print("\nTop słowa dla każdego tematu:\n")
# One line per LDA topic (numbered from 1): the vocabulary terms with the
# largest component weights, heaviest first.
for topic_idx, word_weights in enumerate(lda.components_):
    heaviest_first = word_weights.argsort()[::-1]
    top_words = list(feature_names[heaviest_first[:n_top_words]])
    print(f"Temat {topic_idx + 1}: {', '.join(top_words)}")


Top słowa dla każdego tematu:

Temat 1: memory, performance, parallel, code, program, system, time, data, execution, programs
Temat 2: web, search, attacks, pages, user, users, information, detection, attack, page
Temat 3: classification, data, learning, clustering, based, algorithm, method, neural, recognition, feature
Temat 4: data, query, xml, database, queries, security, mining, information, databases, ontology
Temat 5: image, images, 3d, method, motion, based, shape, surface, algorithm, objects
Temat 6: problem, algorithm, graph, graphs, problems, algorithms, linear, polynomial, number, time
Temat 7: game, games, virtual, interaction, user, speech, rfid, haptic, interface, mobile
Temat 8: model, models, time, distribution, stochastic, estimation, distributions, probability, analysis, random
Temat 9: signal, protein, data, gene, noise, frequency, method, channel, signals, algorithm
Temat 10: robot, control, agents, agent, robots, controller, system, fuzzy, systems, learning
Temat 

In [14]:
# --- Label binarization ---
# Gather every label that occurs anywhere in the labelled data (training and
# validation rows alike) and hand the sorted list to the binarizer helper as
# its explicit class list.
label_pool = set()
for topic_list in train_full['Topics']:
    label_pool.update(topic_list)
all_topics = sorted(label_pool)

mlb = load_or_train_mlb(train['Topics'], all_labels=all_topics)

# Binary indicator matrices (one column per label) for both splits.
y_train, y_val = (mlb.transform(frame['Topics']) for frame in (train, val))

… training MultiLabelBinarizer


In [15]:
# --- Train or load classifier ---
clf_path = Path('models/LDA/classifier_lda.pkl')

if not clf_path.exists():
    print('\n\tWe need to train classifier first...')

    # One linear SVM per label (One-vs-Rest), trained in parallel on all cores.
    # dual=False is the faster formulation when n_samples > n_features.
    base = LinearSVC(C=1.0, dual=False)
    clf = OneVsRestClassifier(base, n_jobs=-1)
    clf.fit(X_train_lda, y_train)
    joblib.dump(clf, clf_path)
    notify()
else:
    print('\n\tFound classifier model!')
    with clf_path.open('rb') as model_file:
        clf = joblib.load(model_file)



	We need to train classifier first...


In [16]:
# --- Validation ---

# Cut-off applied to the SVM decision scores; labels scoring above it are kept.
DECISION_THRESHOLD = -0.3

scores = clf.decision_function(X_val_lda)

# Every label whose score clears the threshold is predicted ...
y_pred_bin = (scores > DECISION_THRESHOLD).astype(int)
# ... and the best-scoring label of each row is always switched on, so every
# document receives at least one label.
y_pred_bin[np.arange(scores.shape[0]), scores.argmax(axis=1)] = 1

val['PredictedTopics'] = mlb.inverse_transform(y_pred_bin)

y_val_true_bin = mlb.transform(val['Topics'])

# zero_division=0 is passed to all three ratio metrics so they treat an
# undefined per-sample value the same way (counted as 0, no warning).
print('\n\n=== SAMPLE-BASED METRICS ===')
print('Precision (samples):', precision_score(y_val_true_bin, y_pred_bin,
                                             average='samples', zero_division=0))
print('Recall    (samples):', recall_score(y_val_true_bin, y_pred_bin,
                                           average='samples', zero_division=0))
print('F1        (samples):', f1_score(y_val_true_bin, y_pred_bin,
                                       average='samples', zero_division=0))
print('Subset accuracy   :', accuracy_score(y_val_true_bin, y_pred_bin))
print('Hamming loss      :', hamming_loss(y_val_true_bin, y_pred_bin))




=== SAMPLE-BASED METRICS ===
Precision (samples): 0.3018125
Recall    (samples): 0.139052601010101
F1        (samples): 0.17555406204906204
Subset accuracy   : 0.0308
Hamming loss      : 0.008949301675977654


In [17]:
# --- Predict on test & save submission ---

scores = clf.decision_function(X_test_lda)

# Same decision rule as on the validation split: keep every label whose score
# is above -0.3, and always keep the top-scoring label so that each document
# ends up with at least one label.
y_test_bin = (scores > -0.3).astype(int)
y_test_bin[np.arange(len(scores)), scores.argmax(axis=1)] = 1

label_lists = mlb.inverse_transform(y_test_bin)

In [18]:
# One comma-separated label string per test document, in test-set order.
topic_strings = [",".join(labels) for labels in label_lists]

# Build the submission as a new frame; `submission = test` would only alias
# the test frame, so adding the column would silently modify `test` as well.
submission = test.assign(Topics=topic_strings)

# Write one line of labels per document, creating the output folder if needed.
submission_path = Path("output/submission_lda.txt")
submission_path.parent.mkdir(parents=True, exist_ok=True)
with submission_path.open("w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in topic_strings)
