In [9]:
import numpy as np
import torch
from habr_article_analyzer.data_loader import HabrDataset
from habr_article_analyzer.datasets.lazy_ranking import FullLazyRankingDataset
from habr_article_analyzer.metrics.ranking_metrics import RankingMetrics
from habr_article_analyzer.models.baseline.baseline import BaselineWord2VecKNN
from habr_article_analyzer.models.encoders.hub_averaging_encoder import HubEncoder
from habr_article_analyzer.models.encoders.tf_idf_encoder import TextEncoder
from habr_article_analyzer.models.predictors.ranking_nn import RankingModel
from habr_article_analyzer.settings import data_settings, settings
from habr_article_analyzer.targets import Target
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
# Evaluate on a fixed random subset of the test split. Seeding the sample
# makes "Restart Kernel -> Run All" score both models on the same articles,
# so the metrics printed at the end of the notebook are reproducible.
TEST_SAMPLE_SIZE = 500
TEST_SAMPLE_SEED = 42

# NOTE(review): `random_state` assumes `get_dataframe()` returns a pandas
# DataFrame — confirm against HabrDataset.
test_df = (
    HabrDataset(
        path=settings.raw_data_dir / "test.jsonl.zst",
        batch_size=data_settings.batch_size,
    )
    .get_dataframe()
    .sample(TEST_SAMPLE_SIZE, random_state=TEST_SAMPLE_SEED)
)

Reading dataset: 60408it [00:03, 16223.23it/s]


In [3]:
# Wrap the sampled frame in a Target object keyed on "hubs" (presumably the
# column holding each article's hub labels — confirm in `Target`). The effect
# of `sparse=True` is defined inside `Target` and is not visible here.
targets = Target(test_df, "hubs", sparse=True)
# Labels exposed by the target object; the cells below encode each of them
# as a candidate hub to rank.
hubs = targets.labels
# Text column of the sampled articles, fed to the text encoders below.
texts = test_df["text_markdown"]

In [4]:
# Restore the three pieces of the first ranking pipeline from `models_dir`:
# the text encoder, the hub encoder and the ranking network.
sergiy_text_encoder = TextEncoder(max_features=5000).load(
    settings.models_dir / "text_encoder.pt"
)
sergiy_hub_encoder = HubEncoder(dim=5000).load(settings.models_dir / "hub_encoder.pt")

# input_dim=10000 — presumably the 5000-d text vector and the 5000-d hub
# vector concatenated; TODO confirm against RankingModel.
sergiy_model = RankingModel(input_dim=10000)
sergiy_model.load_state_dict(
    # map_location keeps loading independent of the device the checkpoint
    # was saved from; the weights end up on the device used right below.
    torch.load(settings.models_dir / "bow_dssm_1.pt", map_location="cuda")
)
sergiy_model.to("cuda")
# Switch to inference mode: without this, dropout / batch-norm layers (if the
# network has any) keep their training behaviour while the model is scored.
sergiy_model.eval()
sergiy_model_predictor = sergiy_model

In [5]:
# Encode the hub labels and the article texts, then wrap them in the lazy
# ranking dataset, which yields (features, labels) batches through the loader.
sergiy_hub_embeds = sergiy_hub_encoder.transform(hubs)
sergiy_text_embeds = sergiy_text_encoder.transform(texts)
sergiy_dataset = FullLazyRankingDataset(sergiy_text_embeds, sergiy_hub_embeds, targets)

sergiy_predicts = []
# shuffle=False keeps the dataset order, which the metrics cell relies on
# when it reshapes the flat scores into a per-article matrix.
loader = DataLoader(sergiy_dataset, batch_size=256, shuffle=False)

with torch.no_grad():
    for features, _labels in tqdm(loader, desc="applying model to test dataset"):
        batch_scores = sergiy_model_predictor(features.to("cuda"))
        # One device->host transfer per batch instead of one `.item()` call
        # per element; `tolist()` yields the same Python floats.
        sergiy_predicts.extend(batch_scores.flatten().tolist())

sergiy_predicts = np.array(sergiy_predicts)

applying model to test dataset: 100%|██████████| 645/645 [00:21<00:00, 30.69it/s]


In [6]:
# Restore the Word2Vec + KNN baseline from its serialized file and pull out
# the parts the evaluation below needs.
# NOTE(review): judging by the file extension this is a pickle — only load
# artifacts produced by this project.
baseline_checkpoint = settings.models_dir / "baseline_word2vec_knn.pickle"
nikita_model = BaselineWord2VecKNN.load(baseline_checkpoint)

nikita_hub_encoder, nikita_text_encoder, nikita_model_predictor = (
    nikita_model.hub_encoder,
    nikita_model.text_encoder,
    nikita_model.predictor,
)

In [7]:
# Encode every hub label and every article with the baseline's own encoders,
# one item at a time, and stack the results into arrays.
nikita_hub_embeds = np.array(list(map(nikita_hub_encoder.encode, hubs)))
nikita_text_embeds = np.array(list(map(nikita_text_encoder.encode, texts)))
nikita_dataset = FullLazyRankingDataset(nikita_text_embeds, nikita_hub_embeds, targets)

loader = DataLoader(nikita_dataset, batch_size=256, shuffle=False)
nikita_proba_chunks = []

for features, labels in tqdm(loader, desc="applying model to test dataset"):
    batch_proba = nikita_model_predictor.model.predict_proba(features)
    # Keep only column 1 of the predicted probabilities
    # (presumably the positive class — confirm against the fitted model).
    nikita_proba_chunks.append(batch_proba[:, 1])

# Join the per-batch columns into one flat array; an empty loader yields an
# empty array.
nikita_predicts = (
    np.concatenate(nikita_proba_chunks) if nikita_proba_chunks else np.array([])
)

applying model to test dataset: 100%|██████████| 645/645 [00:54<00:00, 11.89it/s]


In [11]:
# Score both models on the same sampled articles and print mean DCG / NDCG.
for name, flat_scores in [("Sergiy", sergiy_predicts), ("Nikita", nikita_predicts)]:
    # Reshape the flat scores to (len(test_df), len(targets)); this raises if
    # the number of predictions does not match. The shape is passed
    # positionally because the `shape=` keyword of `np.reshape` only exists in
    # NumPy >= 2.1. `np.array` copies, so the flat arrays stay untouched.
    score_matrix = np.array(flat_scores).reshape(len(test_df), len(targets))
    metrics = RankingMetrics(targets, score_matrix)
    print(f"{name}'s dcg score: {metrics.dcg().mean().item()}")
    print(f"{name}'s ndcg score: {metrics.ndcg().mean().item()}")

Sergiy's dcg score: 0.10124032321219213
Sergiy's ndcg score: 0.0751940535996023
Nikita's dcg score: 0.043399153681384396
Nikita's ndcg score: 0.028028323935908803


Well, this is definitely not a random ranking, but there is still a lot of room for improvement.