In [1]:
import os

import gensim.downloader as api
import numpy as np
import pandas as pd
import requests
from gensim.models import KeyedVectors
from habr_article_analyzer.data import load_dataset_from_zst
from habr_article_analyzer.data_loader import HabrDataset
from habr_article_analyzer.models.baseline.baseline import BaselineWord2VecKNN
from habr_article_analyzer.models.encoders.word2vec_encoder import (
    BilingualWord2VecEncoder,
)
from habr_article_analyzer.models.predictors.knn_predictor import KNNPredictor
from habr_article_analyzer.settings import data_settings, settings
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import KFold
from tqdm.auto import tqdm

# Baseline

*Author: Nikita Zolin*

The goal of this notebook is to prepare the baseline. Our task: predict the probability of each hub for the given text. Let's explain how the model will work:

1. We take two inputs: text and hub, which we need to map to some vectors. Model `A` will map text to some vector in $R^n$, model `B` will map hub to some vector in $R^m$.
2. We concatenate these vectors to get one vector in $R^{n+m}$.
3. Model `C` estimates the probability based on this vector.

In this notebook we will use pretrained word embeddings (word2vec for Russian, GloVe for English) for models `A` and `B`, and a KNN predictor with a tuned number of neighbors for model `C`.

However, before fitting the model we need to gather the dataset. Here, as it's just a baseline, we will simply take one positive and three random negative hubs for each text. This logic is implemented in [data_utils](../../src/habr_article_analyzer/data_utils/) and was run as a module before the code below.

Then we need some pretrained word embeddings. Let's download them:

## Encoders

We will take small pretrained models so that the notebook can be run locally.

In [2]:
# English embeddings: the pretrained "glove-wiki-gigaword-300" vectors fetched by name
# through gensim's downloader (GloVe rather than word2vec, judging by the model name).
# Passed to the bilingual encoder as `kv_en` further down.
kv_en = api.load("glove-wiki-gigaword-300")

In [3]:
# Russian embeddings: the pretrained "word2vec-ruscorpora-300" vectors fetched by name
# through gensim's downloader. Passed to the bilingual encoder as `kv_ru` further down.
# NOTE(review): this model's vocabulary keys are believed to carry POS suffixes
# (e.g. "word_NOUN") - TODO confirm the encoder accounts for that when looking up tokens.
kv_ru = api.load("word2vec-ruscorpora-300")

## Model hyperparameter optimization

Let's fix a metric, for example `ROC AUC`, and find the best hyperparameter for the model. Here we run on a small sample because of limited resources.

In [4]:
# Prepare mini-dataset for local run.
# Fraction of rows kept from every batch; small so the sample can be processed locally.
SAMPLE_FRACTION = 0.005  # 0.5%

# Seed NumPy's global RNG so the random row mask below is reproducible.
np.random.seed(data_settings.random_seed)

dataset = HabrDataset(
    path=settings.raw_data_dir / "train_with_negatives.jsonl.zst",
    columns=["text", "hub", "label"],
    batch_size=data_settings.batch_size,
)

selected_rows = []
for batch_df in dataset:
    # Keep each row independently with probability SAMPLE_FRACTION.
    mask = np.random.rand(len(batch_df)) < SAMPLE_FRACTION
    selected_rows.append(batch_df[mask])

# ignore_index=True gives the sample a fresh 0..n-1 index.
train_df_sample = pd.concat(selected_rows, ignore_index=True)

Reading dataset: 1751498it [01:27, 20086.52it/s]


In [5]:
# Sanity check: (rows, columns) of the sampled training frame.
train_df_sample.shape

(8781, 3)

In [6]:
# Grid search for n_neighbors: 3-fold cross-validated ROC AUC on the training sample.
# Candidate values; widen this list if the best value lands on its boundary.
N_NEIGHBORS_GRID = [3, 5, 7]

# The same bilingual encoder is used for both the article text and the hub name.
encoder = BilingualWord2VecEncoder(kv_ru=kv_ru, kv_en=kv_en)

kfold = KFold(n_splits=3, shuffle=True, random_state=data_settings.random_seed)

texts = train_df_sample["text"]
hubs = train_df_sample["hub"]
labels = train_df_sample["label"]

# Mean validation ROC AUC for every candidate n_neighbors.
result: dict[int, float] = {}

for n_neighbors in tqdm(N_NEIGHBORS_GRID, desc="n_neighbors"):
    knn = KNNPredictor(n_neighbors=n_neighbors)

    # NOTE(review): one model instance is refit on every fold; this assumes `fit`
    # discards the previous fold's state - confirm in BaselineWord2VecKNN.
    model = BaselineWord2VecKNN(
        text_encoder=encoder, hub_encoder=encoder, predictor=knn
    )

    scores = []
    for train_idx, val_idx in tqdm(
        kfold.split(texts, labels), total=kfold.n_splits, desc="folds", leave=False
    ):
        # KFold yields *positional* indices, so select with .iloc; plain `series[idx]`
        # is label-based and is only correct while the index is exactly 0..n-1.
        X_train_texts = texts.iloc[train_idx]
        X_train_hubs = hubs.iloc[train_idx]
        y_train = labels.iloc[train_idx]

        X_val_texts = texts.iloc[val_idx]
        X_val_hubs = hubs.iloc[val_idx]
        y_val = labels.iloc[val_idx]

        # train
        model.fit(X_train_texts, X_train_hubs, y_train)

        # predict proba, one (text, hub) pair at a time
        probas = [
            model.predict_proba(text, hub) for text, hub in zip(X_val_texts, X_val_hubs)
        ]

        # score
        scores.append(roc_auc_score(y_val, probas))

    result[n_neighbors] = np.mean(scores)

# Candidate with the highest mean ROC AUC.
best_key = max(result, key=result.get)
best_value = result[best_key]

best_key, best_value, result

n_neighbors:   0%|          | 0/3 [00:00<?, ?it/s]

folds:   0%|          | 0/3 [00:00<?, ?it/s]

folds:   0%|          | 0/3 [00:00<?, ?it/s]

folds:   0%|          | 0/3 [00:00<?, ?it/s]

(7,
 np.float64(0.7247423154966111),
 {3: np.float64(0.7062915611986679),
  5: np.float64(0.7211842984326772),
  7: np.float64(0.7247423154966111)})

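The best score is reached at the largest value we tried (`n_neighbors=7`), so a wider grid might do even better. For this baseline, let's stick to this parameter and evaluate the model.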

In [7]:
# Refit on the whole training sample with the n_neighbors chosen by the grid search,
# then persist the fitted model.
knn = KNNPredictor(n_neighbors=best_key)
model = BaselineWord2VecKNN(
    text_encoder=encoder,
    hub_encoder=encoder,
    predictor=knn,
)

train_texts = train_df_sample["text"]
train_hubs = train_df_sample["hub"]
train_labels = train_df_sample["label"]
model.fit(train_texts, train_hubs, train_labels)

model_path = settings.models_dir / "baseline_word2vec_knn.pickle"
model.save(model_path)

In [8]:
test_df = load_dataset_from_zst(settings.raw_data_dir / "test_with_negatives.jsonl.zst")

# Decrease the test sample to run it locally.
# NOTE(review): this takes the first rows of the file, not a random sample.
TEST_SAMPLE_SIZE = 1000

# Slice first and only then convert to lists, so the full columns are never
# materialized as Python lists just to keep their first TEST_SAMPLE_SIZE items.
test_sample = test_df.iloc[:TEST_SAMPLE_SIZE]
test_texts = test_sample["text"].tolist()
test_hubs = test_sample["hub"].tolist()
test_labels = test_sample["label"].tolist()


# Predicted probability for each (text, hub) pair of the test sample.
probas = [model.predict_proba(text, hub) for text, hub in zip(test_texts, test_hubs)]

# Peek at the first few (label, predicted probability) pairs.
list(zip(test_labels[:10], probas[:10]))

Reading records: 437367it [00:24, 18155.75it/s]


[(1, 0.7127623316361188),
 (1, 0.7140813997294929),
 (1, 0.7127623316361188),
 (1, 0.7127623316361188),
 (0, 0.7127623316361188),
 (0, 0.7127623316361188),
 (0, 0.7127623316361188),
 (0, 0.5625827767958712),
 (0, 0.0),
 (1, 0.0)]

# Evaluation

Let's use some standard metrics to evaluate this model:

In [9]:
# The metrics below are computed on arrays, so convert the Python lists first.
probas = np.array(probas)
labels = np.array(test_labels)

# Area under the ROC curve of the predicted probabilities.
roc_auc = roc_auc_score(y_true=labels, y_score=probas)
print(f"ROC AUC: {roc_auc:.4f}")

# Log loss of the same predicted probabilities.
ll = log_loss(y_true=labels, y_pred=probas)
print(f"Log Loss: {ll:.4f}")

ROC AUC: 0.7311
Log Loss: 1.7679


At this stage it is hard to say whether the results are good, because this is our first model, but we now have a reference point and can compare future models against these metrics. However, our main goal is to use one specific metric to compare different models, which will be presented in a different notebook.