In [18]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from habr_article_analyzer.data import DEFAULT_FULL_PATH, download_dataset
from habr_article_analyzer.data_loader import HabrDataset
from habr_article_analyzer.data_utils.split_dataset import split_dataset
from habr_article_analyzer.datasets.lazy_ranking import (
    FullLazyRankingDataset,
    SamplingLazyRankingDataset,
)
from habr_article_analyzer.metrics.ranking_metrics import RankingMetrics
from habr_article_analyzer.models.encoders.hub_averaging_encoder import HubEncoder
from habr_article_analyzer.models.encoders.tf_idf_encoder import TextEncoder
from habr_article_analyzer.models.predictors.ranking_nn import RankingModel
from habr_article_analyzer.settings import data_settings, settings
from habr_article_analyzer.targets import Target
from habr_article_analyzer.trainers.classification_trainer import Trainer

# Baseline

*Author: Sergiy Iarygin*

The goal of this notebook is to prepare the baseline. Our task: predict the probability that given hub is a match for a given text. This is how the model will work:

1. We take two inputs: text and hub, which we need to map to some vectors. Model `A`, based on bag of words (Tf-Idf bag of words), will map text to some vector in $R^n$.
2. Then for each matched pair (text_i, hub_j) in dataset we aggregate `avg(A(text_i))` as an embedding of hub. Now `hub_j` embedding is ` avg(A(text_i))`.
3. Model `C` estimates the probability based on this two vectors: embed of text and embed of hub.

As model `C` we will use classic DL with couple stacked layers.

In [2]:
download_dataset(DEFAULT_FULL_PATH)

In [None]:
split_dataset()

In [41]:
train_df = HabrDataset(
    path=settings.raw_data_dir / "train.jsonl.zst",
    batch_size=data_settings.batch_size,
).get_dataframe()
val_df = HabrDataset(
    path=settings.raw_data_dir / "val.jsonl.zst",
    batch_size=data_settings.batch_size,
).get_dataframe()
test_df = HabrDataset(
    path=settings.raw_data_dir / "test.jsonl.zst",
    batch_size=data_settings.batch_size,
).get_dataframe()

Reading dataset: 181226it [00:16, 10748.76it/s]
Reading dataset: 60409it [00:05, 11276.74it/s]
Reading dataset: 60408it [00:04, 13219.92it/s]


In [4]:
train_target = Target(train_df, "hubs", sparse=True)
val_target = Target(val_df, "hubs", sparse=True)

In [None]:
text_encoder = TextEncoder()
text_encoder.fit(train_df["text_markdown"])
text_encoder.save(settings.models_dir / "text_encoder.pt")



In [7]:
text_embeddings_train = text_encoder.transform(train_df["text_markdown"])
text_embeddings_val = text_encoder.transform(val_df["text_markdown"])

In [8]:
hub_encoder = HubEncoder()
hub_encoder.fit(train_target, text_embeddings_train)
hub_encoder.save(settings.models_dir / "hub_encoder.pt")

In [9]:
hub_embeddings_train = hub_encoder.transform(train_target.labels)
hub_embeddings_val = hub_encoder.transform(val_target.labels)

In [10]:
sampled_dataset_train = SamplingLazyRankingDataset(
    text_embeddings_train,
    hub_embeddings_train,
    train_target,
    1,
    3,
    "label_proportional",
)
sampled_dataset_val = SamplingLazyRankingDataset(
    text_embeddings_val, hub_embeddings_val, val_target, 1, 3, "label_proportional"
)

In [11]:
model = RankingModel(input_dim=10000, hidden_dims=[512, 256, 128])

trainer = Trainer(
    model,
    device="cuda",
    log_dir=settings.raw_data_dir / "runs/experiment_2",
    train_dir=settings.raw_data_dir / "train",
)

TensorBoard logs will be saved to: /home/sergiy/habr-article-analyzer/data/raw/runs/experiment_2
Run: tensorboard --logdir=/home/sergiy/habr-article-analyzer/data/raw/runs/experiment_2


In [13]:
history = trainer.fit(
    train_loader=DataLoader(
        sampled_dataset_train, batch_size=256, shuffle=True, drop_last=True
    ),
    val_loader=DataLoader(
        sampled_dataset_val, batch_size=256, shuffle=True, drop_last=True
    ),
    epochs=10,
    lr=0.001,
    weight_decay=0.0001,
    patience=2,
)

TRAINING START


Epoch 1 [Train]: 100%|██████████| 2831/2831 [05:11<00:00,  9.09it/s, loss=0.2386]
Epoch 1 [Val]: 100%|██████████| 943/943 [00:53<00:00, 17.77it/s]


Epoch 1/10 Summary (Time: 365.6s)
Train Loss: 0.2826 | Train AUC: 0.9308
Val Loss:   0.2787 | Val AUC:   0.9328 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
Best model saved. (AUC: 0.9328)


Epoch 2 [Train]: 100%|██████████| 2831/2831 [05:08<00:00,  9.19it/s, loss=0.2834]
Epoch 2 [Val]: 100%|██████████| 943/943 [00:51<00:00, 18.17it/s]


Epoch 2/10 Summary (Time: 361.2s)
Train Loss: 0.2665 | Train AUC: 0.9390
Val Loss:   0.2721 | Val AUC:   0.9362 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
Best model saved. (AUC: 0.9362)


Epoch 3 [Train]: 100%|██████████| 2831/2831 [05:07<00:00,  9.20it/s, loss=0.2561]
Epoch 3 [Val]: 100%|██████████| 943/943 [00:52<00:00, 17.89it/s]


Epoch 3/10 Summary (Time: 361.5s)
Train Loss: 0.2533 | Train AUC: 0.9452
Val Loss:   0.2706 | Val AUC:   0.9373 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
Best model saved. (AUC: 0.9373)


Epoch 4 [Train]: 100%|██████████| 2831/2831 [05:08<00:00,  9.18it/s, loss=0.2244]
Epoch 4 [Val]: 100%|██████████| 943/943 [00:52<00:00, 18.06it/s]


Epoch 4/10 Summary (Time: 361.8s)
Train Loss: 0.2437 | Train AUC: 0.9495
Val Loss:   0.2680 | Val AUC:   0.9382 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
Best model saved. (AUC: 0.9382)


Epoch 5 [Train]: 100%|██████████| 2831/2831 [05:08<00:00,  9.18it/s, loss=0.2664]
Epoch 5 [Val]: 100%|██████████| 943/943 [00:52<00:00, 17.92it/s]


Epoch 5/10 Summary (Time: 362.1s)
Train Loss: 0.2357 | Train AUC: 0.9529
Val Loss:   0.2675 | Val AUC:   0.9388 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
Best model saved. (AUC: 0.9388)


Epoch 6 [Train]: 100%|██████████| 2831/2831 [05:07<00:00,  9.20it/s, loss=0.2420]
Epoch 6 [Val]: 100%|██████████| 943/943 [00:51<00:00, 18.22it/s]


Epoch 6/10 Summary (Time: 360.4s)
Train Loss: 0.2302 | Train AUC: 0.9552
Val Loss:   0.2686 | Val AUC:   0.9382 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
No improvement for 1 epochs


Epoch 7 [Train]: 100%|██████████| 2831/2831 [05:09<00:00,  9.15it/s, loss=0.2559]
Epoch 7 [Val]: 100%|██████████| 943/943 [00:52<00:00, 17.83it/s]


Epoch 7/10 Summary (Time: 363.5s)
Train Loss: 0.2252 | Train AUC: 0.9572
Val Loss:   0.2689 | Val AUC:   0.9386 |Val PR-AUC: {val_pr_auc:.4f}
Learning Rate: 0.001000
No improvement for 2 epochs

Early stopping triggered after 7 epochswith no improvement
TRAINING COMPLETE
Total time: 42.3 minutes
Best Val AUC: 0.9388
View results: tensorboard --logdir=/home/sergiy/habr-article-analyzer/data/raw/runs/experiment_2


AUC looks impressive, while it does not guarantee anything on ranking task.

In [4]:
model = RankingModel(input_dim=10000)

In [5]:
model.load_state_dict(torch.load("best_model.pt"))
torch.save(model.state_dict(), settings.models_dir / "bow_dssm_1.pt")
model.to("cuda")

RankingModel(
  (model): Sequential(
    (0): Linear(in_features=10000, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.1, inplace=False)
    (8): Linear(in_features=256, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.1, inplace=False)
    (12): Linear(in_features=128, out_features=1, bias=True)
    (13): Sigmoid()
  )
)

In [83]:
text_encoder = TextEncoder(max_features=5000).load(
    settings.models_dir / "text_encoder.pt"
)
hub_encoder = HubEncoder(dim=5000).load(settings.models_dir / "hub_encoder.pt")

In [84]:
test_df = (
    HabrDataset(
        path=settings.raw_data_dir / "test.jsonl.zst",
        batch_size=data_settings.batch_size,
    )
    .get_dataframe()
    .sample(500)
)

Reading dataset: 60408it [00:04, 13633.68it/s]


In [85]:
test_df.head()

Unnamed: 0,id,text_markdown,hubs
45884,285760,Я руководитель небольшой дизайн-студии. Расска...,"[sales, business-laws]"
60248,364979,"> «А тот, который во мне сидит,\n> Считает, ч...","[gadgets, popular_science, multicopters, games]"
49321,201448,"На днях стало известно о том, что компания ID ...",[linux]
54135,116290,Сообщество жизненно важно любому проекту Open ...,[open_source]
17629,668160,"Привет, Хабр! Меня зовут Сергей Петровский, я ...","[sberbank, it_testing, dwh]"


In [86]:
text_embeddings_test = text_encoder.transform(test_df["text_markdown"])
test_target = Target(test_df, "hubs", sparse=True)
hub_embeddings_test = hub_encoder.transform(test_target.labels)



In [87]:
full_dataset_test = FullLazyRankingDataset(
    text_embeddings_test, hub_embeddings_test, test_target
)

In [88]:
predicts = []
loader = DataLoader(full_dataset_test, batch_size=256, shuffle=False)


with torch.no_grad():
    for features, labels in tqdm(loader, desc="applying model to test dataset"):
        gpu_features = features.to("cuda")
        gpu_predicts = model(gpu_features)
        for predict in gpu_predicts.to("cpu"):
            predicts.append(predict.item())

applying model to test dataset: 100%|██████████| 674/674 [00:25<00:00, 26.47it/s]


In [89]:
predicts = np.reshape(
    np.array(predicts), shape=(len(text_embeddings_test), len(hub_embeddings_test))
)

Let's see the predicted ranking:

In [92]:
for j, sorted_targets in enumerate(predicts.argsort(axis=1)[:, ::-1]):
    print(test_df.iloc[j]["text_markdown"][:100])
    print(test_df.iloc[j]["hubs"])
    print([test_target.labels[i] for i in sorted_targets[:10]])

Я руководитель небольшой дизайн-студии. Расскажу о некоторых признаках договорных тендеров.
К сожале
['sales', 'business-laws']
['saas', 'nanosoft', 'desktop_environment', 'i_am_advertising', 'google', 'scrumtrek', 'epam_systems', 'kelnik', 'cad_cam', 'itsumma']
> «А тот, который во мне сидит,
>  Считает, что он — истребитель.» (с) Высоцкий
>
Подарок защитникам
['gadgets', 'popular_science', 'multicopters', 'games']
['google', 'desktop_environment', 'dell_technologies', 'paysto', 'google_chrome', 'tuturu', 'service_desk', 'telebreeze', 'bigdata', 'itsumma']
На днях стало известно о том, что компания ID Software отказывается от поддержки Mac OS X и Linux дл
['linux']
['google', 'i_am_advertising', 'erp', 'css', 'kelnik', 'browsers', 'code_wtf', 'desktop_environment', 'crazydev', 'nanosoft']
Сообщество жизненно важно любому проекту Open Source. Активное и живое сообщество явлется сердцем пр
['open_source']
['unity', 'nanosoft', 'desktop_environment', 'epam_systems', 'uprock', 'erp', 'i_a

Doesn't look that great.

In [None]:
metrics = RankingMetrics(test_target, predicts)

In [95]:
print(f"Average dcg of model on test dataset: {np.mean(metrics.dcg()).item():.4f}")
print(f"Average ndcg of model on test dataset: {np.mean(metrics.ndcg()).item():.4f}")

Average dcg of model on test dataset: 0.1053
Average ndcg of model on test dataset: 0.0686


Нам явно есть куда расти! (у ndcg максимум = 1)

In [96]:
best_5 = np.argsort(metrics.ndcg())[-5:]
worst_5 = np.argsort(metrics.ndcg())[:5]
print(best_5, worst_5)

[ 67 161 310 453 436] [384 149 494 232 158]


In [98]:
def print_sample(
    id: int,
    target: Target,
    metric: RankingMetrics,
    df: pd.DataFrame,
    predicts: np.ndarray,
):
    sample_dcg = metric.dcg()[id]
    sample_ndcg = metric.ndcg()[id]
    text = df.iloc[id]["text_markdown"]
    targets = df.iloc[id]["hubs"]
    predicted_ranking = [target.labels[i] for i in predicts[id].argsort()[::-1]]

    target_predicts = [
        predicts[id][target.label_to_id(t_item)].item() for t_item in targets
    ]
    top_predicts = [
        predicts[id][target.label_to_id(item)].item() for item in predicted_ranking
    ]

    print(f"id: {id}")
    print(f"dcg: {sample_dcg}")
    print(f"ndcg: {sample_ndcg}")
    print(f"text:\n {text[:100]}")
    print(f"...")
    print(f"{text[-100:]}")
    print(f"targets: {list(zip(targets, target_predicts))}")
    print(f"predict: {list(zip(predicted_ranking[:20], top_predicts[:20]))}")

In [99]:
for id in worst_5:
    print_sample(id, test_target, metrics, test_df, predicts)

id: 384
dcg: 0.0029940119760479044
ndcg: 0.0029940119760479044
text:
 Привет Хабр! А вот и снова мы! На перекор множествам скептиков, которые нередко встречались на нашем
...
Следующим шагом будет объединение задач распознавания и управления манипулятором на реальном роботе.
targets: [('tod', 0.0004877964674960822)]
predict: [('nanosoft', 0.9293713569641113), ('cad_cam', 0.9181030988693237), ('dcmiran', 0.9159507751464844), ('comptek', 0.9130692481994629), ('fujitsu', 0.8883840441703796), ('dell_technologies', 0.8808190822601318), ('vector_graphics', 0.8611271977424622), ('dwh', 0.8495913147926331), ('galtsystems', 0.8257451057434082), ('stc_spb', 0.798514723777771), ('croc', 0.7880276441574097), ('geo', 0.771212637424469), ('engineering_systems', 0.7657341361045837), ('machine_learning', 0.7610746622085571), ('first', 0.7586255073547363), ('i_am_advertising', 0.7534132599830627), ('industrial_control_system', 0.7369108200073242), ('dataline', 0.7349568605422974), ('1cloud', 0.72746449

You can see that some rare hubs are most problematic.

In [100]:
for id in best_5:
    print_sample(id, test_target, metrics, test_df, predicts)

id: 67
dcg: 1.75
ndcg: 0.9545454545454546
text:
 Перевод статьи «Building robust web apps with React: Part 1, in-browser prototypes», Matt Hinchliffe
...
тимизации кода для браузера. Пожалуйста, комментируйте или твитните мне, я буду рад получить отзывы.
targets: [('webdev', 0.9758679866790771), ('javascript', 0.9567009210586548), ('reactjs', 0.9893564581871033)]
predict: [('reactjs', 0.9893564581871033), ('webdev', 0.9758679866790771), ('ui', 0.9587210416793823), ('javascript', 0.9567009210586548), ('nanosoft', 0.9445443749427795), ('usability', 0.9340246319770813), ('crazydev', 0.9211539030075073), ('localization', 0.9194872379302979), ('controllers', 0.9164429903030396), ('go', 0.910089910030365), ('technical_writing', 0.9059873819351196), ('vuejs', 0.8980258703231812), ('debug', 0.8955955505371094), ('erlang', 0.8737245202064514), ('programming', 0.8720690011978149), ('history', 0.8511224389076233), ('angular', 0.8485491275787354), ('paysto', 0.8331936001777649), ('otus', 0.8306916

While the models is great on most popular labels.

I think next steps are:
- using ranking losses in training
- using better embeddings 
- generating more balanced dataset