# Introduction

Categorizing documents in **ACM Digital Library** example: 

(Main Class).(Subclass).(Subsubcategory)

**H.3.5**

* H. Information Systems
    *  H.3 Information Storage and Retrieval
        * H.3.5 Online Information Services

**D.3.2**

* D. Software
    * D.3 Programming Languages
        * D.3.2 Language Classifications
  

# Loading data

In [23]:
# import sklearn as sk
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import os.path
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
import os, pickle, joblib
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, hamming_loss

nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/konstanty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
test_path = os.path.join(notebook_dir, "data", "DM2023_test_docs.tsv")
train_path = os.path.join(notebook_dir, "data", "DM2023_training_docs_and_labels.tsv")


test = pd.read_csv(test_path, 
                    sep="\t", 
                    encoding="latin1", 
                    header=None,
                    names=["Textfile", "Text", "Topics"])
# test = test.drop_duplicates()
                    
                    
train_full = pd.read_csv(train_path, 
                    sep="\t", 
                    encoding="latin1", 
                    header=None,
                    names=["Textfile", "Text", "Topics"])


def flatten_if_single(x):
    """Jeśli x jest listą długości 1 – zwróć jej pierwszy element."""
    if isinstance(x, list) and len(x) == 1:
        return x[0]
    return x

# Separating topics
train_full["Topics"] = (
    train_full["Topics"]
    .apply(flatten_if_single)        
    .str.split(r"\s*,\s*")         
)

# train["Topics"] = train["Topics"].str.split(",")

unique_labels = set(label for sublist in train_full["Topics"] for label in sublist)

print(f"Number of unique topics: {len(unique_labels)}")
print("First 10 example topics: ",sorted(list(unique_labels))[:10])

Number of unique topics: 358
First 10 example topics:  ['A.0', 'A.1', 'A.2', 'A.m', 'B.0', 'B.1', 'B.1.0', 'B.1.1', 'B.1.2', 'B.1.3']


# Train LDA, MLB (Or load) and topic distribution 
(Shape of mlb binary matrix should match the number of unique topics)

In [17]:
MODELDIR = Path("models")
MODELDIR.mkdir(exist_ok=True)


def load_or_train_mlb(train_labels, *, path=Path("models/mlb_model.pkl"),
                      all_labels=None):
    if path.exists():
        print("✓ MLB loaded")
        return joblib.load(path)
    print("… training MultiLabelBinarizer")
    if all_labels is None:
        all_labels = sorted({lbl for sub in train_labels for lbl in sub})
    mlb = MultiLabelBinarizer(classes=all_labels)
    mlb.fit(train_labels)
    joblib.dump(mlb, path)
    return mlb

val   = train_full.iloc[80_000:].reset_index(drop=True)
train = train_full.iloc[:80_000].reset_index(drop=True)
print("Train:", train.shape, " Val:", val.shape)

vec_path = Path("models/vectorizer.pkl")
if vec_path.exists():
    print("Vectorizer loaded")
    vectorizer = joblib.load(vec_path)
else:
    print("training Vectorizer...")
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        max_df=0.9,
        min_df=3,
        ngram_range=(1, 2),
        sublinear_tf=True,
        max_features=100_000
    )
    vectorizer.fit(train["Text"])
    joblib.dump(vectorizer, "models/vectorizer.pkl")


X_train_topics = vectorizer.transform(train["Text"])
X_val_topics   = vectorizer.transform(val["Text"])
X_test_topics  = vectorizer.transform(test["Text"])

all_topics = sorted({lbl for sub in train_full["Topics"] for lbl in sub})

mlb = load_or_train_mlb(train["Topics"], all_labels=all_topics)

y_train = mlb.transform(train["Topics"])
y_val   = mlb.transform(val["Topics"])

print("Vectorizer vocab size:", len(vectorizer.get_feature_names_out()))
# print("LDA topics:", lda.n_components)
print("y_train shape:", y_train.shape)        # (80000, 358)
print("y_val   shape:", y_val.shape)          # (20000, 358)
print(mlb.classes_) 

Train: (80000, 3)  Val: (20000, 3)
Vectorizer loaded
✓ MLB loaded
Vectorizer vocab size: 467343
y_train shape: (80000, 358)
y_val   shape: (20000, 358)
['A.0' 'A.1' 'A.2' 'A.m' 'B.0' 'B.1' 'B.1.0' 'B.1.1' 'B.1.2' 'B.1.3'
 'B.1.4' 'B.1.5' 'B.1.m' 'B.2' 'B.2.0' 'B.2.1' 'B.2.2' 'B.2.3' 'B.2.4'
 'B.2.m' 'B.3' 'B.3.0' 'B.3.1' 'B.3.2' 'B.3.3' 'B.3.4' 'B.3.m' 'B.4'
 'B.4.0' 'B.4.1' 'B.4.2' 'B.4.3' 'B.4.4' 'B.4.5' 'B.4.m' 'B.5' 'B.5.0'
 'B.5.1' 'B.5.2' 'B.5.3' 'B.5.m' 'B.6' 'B.6.0' 'B.6.1' 'B.6.2' 'B.6.3'
 'B.6.m' 'B.7' 'B.7.0' 'B.7.1' 'B.7.2' 'B.7.3' 'B.7.m' 'B.8' 'B.8.0'
 'B.8.1' 'B.8.2' 'B.8.m' 'B.m' 'C.0' 'C.1' 'C.1.0' 'C.1.1' 'C.1.2' 'C.1.3'
 'C.1.4' 'C.1.m' 'C.2' 'C.2.0' 'C.2.1' 'C.2.2' 'C.2.3' 'C.2.4' 'C.2.5'
 'C.2.6' 'C.2.m' 'C.3' 'C.4' 'C.5' 'C.5.0' 'C.5.1' 'C.5.2' 'C.5.3' 'C.5.4'
 'C.5.5' 'C.5.m' 'C.m' 'D.0' 'D.1' 'D.1.0' 'D.1.1' 'D.1.2' 'D.1.3' 'D.1.4'
 'D.1.5' 'D.1.6' 'D.1.7' 'D.1.m' 'D.2' 'D.2.0' 'D.2.1' 'D.2.10' 'D.2.11'
 'D.2.12' 'D.2.13' 'D.2.2' 'D.2.3' 'D.2.4' 'D.2.5' 'D.2.6' 

# Check 5 topics distributions

In [18]:
# file_ids = train["Textfile"].iloc[:5].values
# topic_distributions = X_train_topics[:5]

# topics_df = pd.DataFrame(np.round(topic_distributions, 3),
#                                    columns=[f"Topic {i}" for i in range(lda.n_components)],
#                                    index=file_ids)
# topics_df.T

# Let's see words assigned to different topics with LDA

In [19]:
# feature_names = vectorizer.get_feature_names_out()

# def show_top_words(model, feature_names, n_top_words=10):
#     for topic_idx, topic in enumerate(model.components_):
#         top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
#         print(f"Topic {topic_idx}: {' '.join(top_features)}")

# show_top_words(lda, feature_names)

# Training classifier

In [20]:
if os.path.exists("models/classifier.pkl"):
    print("Found classifier model!")

    with open("models/classifier.pkl", "rb") as f:
        clf = pickle.load(f)

else:
    print("We need to train classifier first...")

    lr = LinearSVC(C=1.0, dual=False)

    # 3. Predykcja
    clf = OneVsRestClassifier(lr, n_jobs=-1)
    clf.fit(X_train_topics, y_train)


    margins = clf.decision_function(X_val_topics)

    # Przetestuj wiele progów
    best_f1, best_thr = -1, 0
    for thr in np.linspace(-0.5, 1.0, 30):
        y_pred_bin = (margins > thr).astype(int)
        f1 = f1_score(y_val, y_pred_bin, average="samples")
        if f1 > best_f1:
            best_f1, best_thr = f1, thr

    print(f"✅ Najlepszy próg = {best_thr:.3f}, F1 = {best_f1:.4f}")

    with open("models/classifier.pkl", "wb") as f:
        pickle.dump(clf, f)

We need to train classifier first...
✅ Najlepszy próg = -0.500, F1 = 0.4332


# Validation

In [24]:
print("Validation...")
# y_pred_bin = clf.predict(X_val_topics)
y_pred_bin = (clf.decision_function(X_val_topics) > best_thr).astype(int)
y_pred_labels = mlb.inverse_transform(y_pred_bin)

predicted_topics_list = [list(labels) for labels in y_pred_labels]
print(predicted_topics_list[:100])

val["PredictedTopics"] = predicted_topics_list

y_val_true_bin = mlb.transform(val["Topics"])
y_val_pred_bin = mlb.transform(val["PredictedTopics"])


print("micro-F1 :", f1_score(y_val_true_bin, y_val_pred_bin, average="micro"))
print("macro-F1 :", f1_score(y_val_true_bin, y_val_pred_bin, average="macro"))
print("sample-F1 :", f1_score(y_val_true_bin, y_val_pred_bin, average="samples"))
print("Hamming  :", hamming_loss(y_val_true_bin, y_val_pred_bin))

Validation...
[['F.2.2', 'H.2.8', 'I.2.6', 'I.5.2'], ['C.2.1', 'C.2.2', 'C.2.6', 'C.4', 'F.2.2'], ['F.1.1', 'F.1.2', 'F.4.3', 'I.2.8'], ['F.2.2', 'G.1.10', 'G.1.8', 'J.2'], ['F.1.2', 'F.2.2', 'G.3'], ['D.2.4', 'D.2.5'], [], ['F.1.2', 'G.1.0'], ['C.2.2', 'H.4.3', 'H.5.1', 'H.5.2'], ['H.4.2'], ['I.4.6', 'I.4.7', 'I.4.8', 'I.5.2'], ['C.2.0', 'C.2.3', 'C.2.4', 'D.4.6', 'K.6.5'], ['D.2.2', 'D.3.2', 'H.5.4'], ['F.1.1', 'I.2.3', 'I.2.6', 'I.5.1', 'I.6.4', 'I.6.5', 'J.2', 'J.7'], ['K.4.1', 'K.4.3', 'K.7.4'], ['K.3.2'], ['G.1.6', 'I.2.8', 'I.6.8', 'J.1'], ['D.2.5', 'D.2.7', 'F.2.2', 'I.2.8'], ['B.6.1', 'B.6.3', 'B.8.1', 'J.2', 'J.6'], ['C.2.2', 'K.6.5'], ['C.2.4', 'C.4', 'D.4.3', 'D.4.5', 'D.4.7', 'D.4.8'], ['F.2.2', 'G.1.6', 'G.2.2', 'I.2.8'], ['C.2.0', 'C.2.1', 'D.3.2', 'K.6.1', 'K.6.5'], ['H.1.2', 'H.5.2', 'K.6.1'], ['C.2.2', 'C.4', 'D.2.1'], ['G.1.2', 'I.3.5'], ['C.2.1', 'C.2.3', 'C.2.5', 'C.4'], ['C.2.1', 'C.2.4'], ['E.4', 'G.3'], ['D.2.11', 'D.2.6', 'K.6.3'], ['C.2.1', 'C.2.3', 'C.4', 'G.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


macro-F1 : 0.21185145999027402
sample-F1 : 0.43321898878898873
Hamming  : 0.00910963687150838


# Prediction

In [22]:

print("Prediction...")
y_pred_bin = clf.predict(X_test_topics)
y_pred_labels = mlb.inverse_transform(y_pred_bin)

# predicted_topics_str = [",".join(labels) if labels else "-" for labels in y_pred_labels]
predicted_topics_list = [list(labels) for labels in y_pred_labels]
for topics in predicted_topics_list[:50]:
    if topics != []:
        print(topics)

results = pd.DataFrame({
    "Textfile": test["Textfile"].values,
    "Predicted Topics": predicted_topics_list
})


# Making sure this has the same order
order = test["Textfile"]


results_sorted = (
    results.set_index("Textfile")   # <- klucz do dopasowania
           .loc[order]              # <- reindex wg referencyjnej kolejności
           .reset_index()           # <- wróć do zwykłej kolumny
)

# 3. (opcjonalnie) nadpisz `results`
results = results_sorted

print(results.head(15))

Prediction...
['K.6.1']
['H.1.2', 'H.5.1', 'H.5.2']
['G.1.0', 'G.1.7']
['C.2.1', 'C.3']
['K.6.1']
['I.2.7']
['K.6.1']
['I.2.3']
['C.1.2', 'C.4', 'G.2.2']
['G.1.1', 'I.3.5']
['I.2.11', 'I.2.4']
['I.6.5']
['C.2.1']
['K.6.1']
['F.2.2', 'I.2.8']
['B.7.1', 'B.7.2', 'B.8.2']
['F.2.2', 'J.3']
['H.3.5']
['I.3.7']
['H.5.1']
['B.7.1']
['K.3.2']
['H.3.5']
['E.4']
['J.3']
['H.5.2']
['D.1.3']
['K.3.2']
['I.4.8']
       Textfile       Predicted Topics
0    963168.txt                [K.6.1]
1   1811004.txt                     []
2    192631.txt  [H.1.2, H.5.1, H.5.2]
3   1183872.txt                     []
4   1280491.txt         [G.1.0, G.1.7]
5   1059284.txt           [C.2.1, C.3]
6   1133457.txt                [K.6.1]
7   1140350.txt                     []
8    100973.txt                [I.2.7]
9   1147150.txt                     []
10   598535.txt                     []
11  1318072.txt                [K.6.1]
12  1222053.txt                [I.2.3]
13   110442.txt    [C.1.2, C.4, G.2.2]
14  1044226.