# ML offline-validation pipeline

Achieved results by tuned and evaluated methods:

| Model     | HitRate@10 | Precision@10 | Recall@10 | MRR@10 | NDCG@10 |
|-----------|------------|--------------|-----------|--------|---------|
| Graph     | 0.836      | 0.141        | 0.513     | 0.524  | 0.414   |
| EASE      | 0.834      | 0.139        | 0.502     | 0.538  | 0.415   |
| SLIM      | 0.821      | 0.135        | 0.486     | 0.521  | 0.401   |
| item-KNN  | 0.804      | 0.129        | 0.467     | 0.491  | 0.378   |

All results are statistically significant on 5\% except for the fact that Graph model is better than EASE: the diff CI includes 0

(the results are in the end of this file)

## EASE

In [16]:
import pandas as pd
import numpy as np

data = pd.read_csv('recipes_normalized.csv')

In [17]:
stop_words_drop = ['Соль', 'Сахар-песок', 'Перец черный молотый', 'Мука пшеничная',
'Сода', 'Сода гашеная уксусом']

In [None]:
from tqdm import tqdm
import ast

interactions = []
for idx, row in tqdm(data.iterrows(), total=len(data)):
    ingredients_parsed = ast.literal_eval(row['ingredients_normalized'])
    recipe_id = row.get('url', idx) 
        
    for ingredient in ingredients_parsed.keys():
        interactions.append((recipe_id, ingredient))

interactions_df = pd.DataFrame(interactions, columns=['recipe_id', 'ingredient_id'])
print(f"Interactions {len(interactions_df)}")

unique_recipes = interactions_df['recipe_id'].unique()
all_unique_ingredients = interactions_df['ingredient_id'].unique()
unique_ingredients = [
    ingredient for ingredient in all_unique_ingredients if ingredient not in stop_words_drop
]

recipe2id = {recipe: i for i, recipe in enumerate(unique_recipes)}
item2id = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}

100%|██████████| 146581/146581 [00:10<00:00, 13919.75it/s]


Interactions 1278324


In [None]:
from validate import RecommendationValidator

validator = RecommendationValidator(
    data_path='recipes_normalized.csv',  
    recipe2id=recipe2id,         
    item2id=item2id
)

  from .autonotebook import tqdm as notebook_tqdm
Preparing interactions: 100%|██████████| 146581/146581 [00:08<00:00, 16959.92it/s]


interactions left after k-core filtering: 978048


In [22]:
matrix_train = validator.build_train_matrix(fmt='coo') 

In [None]:
from models import fit_ease, apply_linear_model

w_ease = fit_ease(matrix_train, reg_weight=77)

def ease_predict(items):
    return apply_linear_model(items, w_ease, top_k=validator.metrics.top_k)

ease_result = validator.evaluate_split(
    'test',
    predictor=ease_predict,
    preds_col='ease_preds',
    top_k=10
)
ease_result

test predictions: 100%|██████████| 24661/24661 [00:02<00:00, 8495.16it/s]


{'split': 'test',
 'metrics': {'hit_rate@10': 0.834556587324115,
  'precision@10': 0.1388832569644378,
  'recall@10': 0.502290648495182,
  'mrr@10': 0.5380562008131855,
  'ndcg@10': 0.4150810959732856}}

In [74]:
import optuna

def ease_objective(trial):
    reg = trial.suggest_float('reg_weight', 10, 2000, log=True)
    w = np.asarray(fit_ease(matrix_train, reg_weight=reg))

    def predict(items):
        return apply_linear_model(items, w, top_k=10)

    metrics = validator.evaluate_split(
        'val',
        predictor=predict,
        preds_col='ease_val_preds',
        top_k=10,
        cross_val_folds=5
    )['cross_val_mean']
    return metrics['precision@10']

study = optuna.create_study(direction='maximize')
study.optimize(ease_objective, n_trials=30)

[I 2025-11-16 18:34:25,822] A new study created in memory with name: no-name-c10e246b-7fde-44af-97a0-427ed450de5d
val predictions: 100%|██████████| 18496/18496 [00:01<00:00, 9361.62it/s]
[I 2025-11-16 18:34:28,275] Trial 0 finished with value: 0.13602923653580587 and parameters: {'reg_weight': 1186.4723566676482}. Best is trial 0 with value: 0.13602923653580587.
val predictions: 100%|██████████| 18496/18496 [00:01<00:00, 9519.66it/s]
[I 2025-11-16 18:34:30,815] Trial 1 finished with value: 0.1389595858632355 and parameters: {'reg_weight': 175.7820145116139}. Best is trial 1 with value: 0.1389595858632355.
val predictions: 100%|██████████| 18496/18496 [00:02<00:00, 8245.13it/s]
[I 2025-11-16 18:34:33,436] Trial 2 finished with value: 0.13925695476498398 and parameters: {'reg_weight': 19.07221600272422}. Best is trial 2 with value: 0.13925695476498398.
val predictions: 100%|██████████| 18496/18496 [00:02<00:00, 8873.00it/s]
[I 2025-11-16 18:34:35,930] Trial 3 finished with value: 0.13927

In [75]:
study.best_params

{'reg_weight': 58.065487933492136}

In [190]:
w_ease = fit_ease(matrix_train, reg_weight=58.065487933492136)

def ease_predict(items):
    return apply_linear_model(items, w_ease, top_k=validator.metrics.top_k)

ease_result = validator.evaluate_split(
    'test',
    predictor=ease_predict,
    preds_col='ease_preds',
    top_k=10,
    cross_val_folds=5
)

test predictions: 100%|██████████| 24661/24661 [00:01<00:00, 13405.80it/s]


In [77]:
ease_result.keys()

dict_keys(['split', 'metrics', 'cross_val_mean', 'fold_metrics'])

In [78]:
ease_result['cross_val_mean']

{'hit_rate@10': 0.8343135649495617,
 'precision@10': 0.13889141750059064,
 'recall@10': 0.5022488789955852,
 'mrr@10': 0.5384045392270117,
 'ndcg@10': 0.4153384524832188}

In [80]:
ease_result['fold_metrics']

[{'fold': 1,
  'hit_rate@10': 0.827488343807014,
  'precision@10': 0.13764443543482668,
  'recall@10': 0.4989931752145415,
  'mrr@10': 0.5301737730026802,
  'ndcg@10': 0.4107124612706702},
 {'fold': 2,
  'hit_rate@10': 0.8327250608272506,
  'precision@10': 0.13799675587996757,
  'recall@10': 0.5038335650561928,
  'mrr@10': 0.5363545810321967,
  'ndcg@10': 0.4157345923756216},
 {'fold': 3,
  'hit_rate@10': 0.8390105433901054,
  'precision@10': 0.138544201135442,
  'recall@10': 0.5008813514592103,
  'mrr@10': 0.5445224095315336,
  'ndcg@10': 0.4167768735199129},
 {'fold': 4,
  'hit_rate@10': 0.8363746958637469,
  'precision@10': 0.1399026763990268,
  'recall@10': 0.5021161607384235,
  'mrr@10': 0.5358735147208383,
  'ndcg@10': 0.4142977830933171},
 {'fold': 5,
  'hit_rate@10': 0.8359691808596919,
  'precision@10': 0.1403690186536902,
  'recall@10': 0.5054201425095586,
  'mrr@10': 0.5450984178478097,
  'ndcg@10': 0.41917055215657223}]

In [160]:
validator.inspect_user(user_id=31, split='test', preds_col='ease_preds')

RECIPE: https://www.povarenok.ru/recipes/show/39037/ (User ID: 31) | split: test
Observed Ingredients (5)
Специи, Масло растительное, Зелень, Капуста пекинская, Авокадо

Ground Truth Ingredients (3)
Креветки, Лимон, Консервы рыбные

Top Recommended Ingredients (10)
Сок лимона, Помидор, Огурец, Чеснок, Лук репчатый, Лимон, Перец болгарский, Креветки, Соевый соус, Вода


In [88]:
auto_summary, auto_details = validator.autoregressive_validation(
       split='val',
       predictor=ease_predict,
       top_k=10,
       max_steps=8,
       return_detailed=True
)
print(auto_summary)

val autoregressive: 100%|██████████| 10234/10234 [00:03<00:00, 3329.33it/s]

   revealed_items  hit_rate@10  precision@10  recall@10    mrr@10   ndcg@10
0               1     0.819621      0.125679   0.522608  0.516059  0.417855
1               2     0.642759      0.076861   0.549470  0.371791  0.383278
2               3     0.600417      0.068640   0.530532  0.339209  0.363053
3               4     0.559242      0.063033   0.500612  0.306692  0.332727
4               5     0.551136      0.059659   0.502367  0.291184  0.324069
5               6     0.500000      0.057143   0.410714  0.278671  0.280131
6               7     0.285714      0.042857   0.285714  0.214286  0.241918





## SLIM

In [129]:
%reload_ext autoreload
%autoreload 2

In [132]:
import warnings
warnings.filterwarnings('ignore')

In [134]:
from models import train_slim

w_slim = train_slim(matrix_train, l1_reg=1e-3, l2_reg=1e-3)

def slim_predict(items):
    return apply_linear_model(items, w_slim, top_k=validator.metrics.top_k)

slim_result = validator.evaluate_split(
    'test',
    predictor=slim_predict,
    preds_col='slim_preds',
    top_k=10
)
slim_result

test predictions: 100%|██████████| 24661/24661 [00:01<00:00, 23525.43it/s]


{'split': 'test',
 'metrics': {'hit_rate@10': 0.790397794087831,
  'precision@10': 0.12421637403187219,
  'recall@10': 0.4487766977355801,
  'mrr@10': 0.4954293232100296,
  'ndcg@10': 0.3719911061526999}}

In [135]:
def slim_objective(trial):
    l1 = trial.suggest_float('l1_reg', 1e-5, 1e-2, log=True)
    l2 = trial.suggest_float('l2_reg', 1e-5, 1e-2, log=True)

    w = train_slim(matrix_train, l1_reg=l1, l2_reg=l2)

    def predict(items):
        return apply_linear_model(items, w, top_k=10)

    metrics = validator.evaluate_split(
        'val',
        predictor=predict,
        preds_col='slim_val_preds',
        top_k=10
    )['metrics']
    return metrics['precision@10']

study = optuna.create_study(direction='maximize')
study.optimize(slim_objective, n_trials=30)

[I 2025-11-16 18:57:50,402] A new study created in memory with name: no-name-c35bbf32-9483-44ba-a0fa-9f1439c3445b
val predictions: 100%|██████████| 18496/18496 [00:00<00:00, 27488.83it/s]
[I 2025-11-16 18:58:03,343] Trial 0 finished with value: 0.11124567474048445 and parameters: {'l1_reg': 0.004400122136818884, 'l2_reg': 0.0007216616213845809}. Best is trial 0 with value: 0.11124567474048445.
val predictions: 100%|██████████| 18496/18496 [00:01<00:00, 15997.61it/s]
[I 2025-11-16 18:58:48,717] Trial 1 finished with value: 0.1325475778546713 and parameters: {'l1_reg': 6.252486886183154e-05, 'l2_reg': 0.004729150489798778}. Best is trial 1 with value: 0.1325475778546713.
val predictions: 100%|██████████| 18496/18496 [00:01<00:00, 16967.17it/s]
[I 2025-11-16 18:59:31,420] Trial 2 finished with value: 0.1332288062283737 and parameters: {'l1_reg': 6.23958379128446e-05, 'l2_reg': 0.0027592271180808276}. Best is trial 2 with value: 0.1332288062283737.
val predictions: 100%|██████████| 18496/1

In [136]:
study.best_params

{'l1_reg': 1.0086930699538735e-05, 'l2_reg': 9.519524623817742e-05}

In [137]:
w_slim= train_slim(matrix_train, l1_reg=study.best_params['l1_reg'], l2_reg=study.best_params['l2_reg'])

def slim_predict(items):
    return apply_linear_model(items, w_slim, top_k=validator.metrics.top_k)

slim_result = validator.evaluate_split(
    'test',
    predictor=slim_predict,
    preds_col='slim_preds',
    top_k=10,
    cross_val_folds=5
)

test predictions: 100%|██████████| 24661/24661 [00:02<00:00, 11923.04it/s]


In [138]:
slim_result['cross_val_mean']

{'hit_rate@10': 0.8209726145433974,
 'precision@10': 0.1345890512757405,
 'recall@10': 0.48611325101602787,
 'mrr@10': 0.5209067806251786,
 'ndcg@10': 0.4007724875225985}

In [139]:
slim_result['fold_metrics']

[{'fold': 1,
  'hit_rate@10': 0.8153253598216096,
  'precision@10': 0.13399554023920537,
  'recall@10': 0.48572200824380024,
  'mrr@10': 0.5133929576966269,
  'ndcg@10': 0.397572752595016},
 {'fold': 2,
  'hit_rate@10': 0.818734793187348,
  'precision@10': 0.13290754257907542,
  'recall@10': 0.4844900166068049,
  'mrr@10': 0.5182396464939043,
  'ndcg@10': 0.399079139316803},
 {'fold': 3,
  'hit_rate@10': 0.8262368207623683,
  'precision@10': 0.13540145985401458,
  'recall@10': 0.488445558001519,
  'mrr@10': 0.5240411662096577,
  'ndcg@10': 0.40312901882358104},
 {'fold': 4,
  'hit_rate@10': 0.8199513381995134,
  'precision@10': 0.13580697485806975,
  'recall@10': 0.485753147568841,
  'mrr@10': 0.5209637579011058,
  'ndcg@10': 0.40114853386459276},
 {'fold': 5,
  'hit_rate@10': 0.8246147607461476,
  'precision@10': 0.13483373884833738,
  'recall@10': 0.4861555246591743,
  'mrr@10': 0.5278963748245987,
  'ndcg@10': 0.4029329930129994}]

In [157]:
validator.inspect_user(user_id=31, split='test', preds_col='slim_preds')

RECIPE: https://www.povarenok.ru/recipes/show/39037/ (User ID: 31) | split: test
Observed Ingredients (5)
Специи, Масло растительное, Зелень, Капуста пекинская, Авокадо

Ground Truth Ingredients (3)
Креветки, Лимон, Консервы рыбные

Top Recommended Ingredients (10)
Сок лимона, Чеснок, Лук репчатый, Огурец, Помидор, Креветки, Перец болгарский, Картофель, Салат, Лимон


In [141]:
auto_summary, auto_details = validator.autoregressive_validation(
       split='val',
       predictor=slim_predict,
       top_k=10,
       max_steps=8,
       return_detailed=True
)
print(auto_summary)

val autoregressive: 100%|██████████| 10234/10234 [00:02<00:00, 4460.77it/s]


   revealed_items  hit_rate@10  precision@10  recall@10    mrr@10   ndcg@10
0               1     0.804671      0.121214   0.503691  0.499140  0.401544
1               2     0.619406      0.073402   0.524602  0.354242  0.364215
2               3     0.576317      0.065159   0.504453  0.318979  0.341600
3               4     0.549763      0.061019   0.490936  0.284847  0.314002
4               5     0.500000      0.054545   0.455966  0.269111  0.298134
5               6     0.571429      0.067857   0.500000  0.306746  0.327860
6               7     0.428571      0.057143   0.428571  0.261905  0.307274


## Item KNN

In [145]:
from models import train_itemknn

W_itemknn = train_itemknn(matrix_train, topk=200, shrink=100.0, use_binary=True)

def itemknn_predict(items):
    return apply_linear_model(items, W_itemknn, top_k=validator.metrics.top_k)

itemknn_result = validator.evaluate_split(
    'test',
    predictor=itemknn_predict,
    preds_col='itemknn_preds',
    top_k=10
)
itemknn_result

test predictions: 100%|██████████| 24661/24661 [00:08<00:00, 2917.81it/s]


{'split': 'test',
 'metrics': {'hit_rate@10': 0.7963586229268886,
  'precision@10': 0.12664531040914806,
  'recall@10': 0.45800592542817115,
  'mrr@10': 0.4873719541748008,
  'ndcg@10': 0.3722513429362359}}

In [146]:
def itemknn_objective(trial):
    topk = trial.suggest_int('topk', 50, 400)
    shrink = trial.suggest_float('shrink', 1.0, 500.0, log=True)
    use_binary = trial.suggest_categorical('use_binary', [True, False])

    W = train_itemknn(matrix_train, topk=topk, shrink=shrink, use_binary=use_binary)

    def predict(items):
        return apply_linear_model(items, W, top_k=10)

    metrics = validator.evaluate_split(
        'val',                        
        predictor=predict,
        preds_col='itemknn_val_preds',
        top_k=10,
    )['metrics']
    return metrics['precision@10']   

study = optuna.create_study(direction='maximize')
study.optimize(itemknn_objective, n_trials=30)

[I 2025-11-16 19:22:34,770] A new study created in memory with name: no-name-bb3cc4ae-9a72-4d96-b03b-6f3a969cc6b8
val predictions: 100%|██████████| 18496/18496 [00:05<00:00, 3389.45it/s]
[I 2025-11-16 19:22:40,560] Trial 0 finished with value: 0.1268003892733564 and parameters: {'topk': 364, 'shrink': 85.44837178858262, 'use_binary': True}. Best is trial 0 with value: 0.1268003892733564.
val predictions: 100%|██████████| 18496/18496 [00:05<00:00, 3384.93it/s]
[I 2025-11-16 19:22:46,265] Trial 1 finished with value: 0.12799524221453287 and parameters: {'topk': 388, 'shrink': 22.059126177060794, 'use_binary': True}. Best is trial 1 with value: 0.12799524221453287.
val predictions: 100%|██████████| 18496/18496 [00:06<00:00, 2955.52it/s]
[I 2025-11-16 19:22:52,805] Trial 2 finished with value: 0.12688148788927336 and parameters: {'topk': 139, 'shrink': 80.89488366587021, 'use_binary': True}. Best is trial 1 with value: 0.12799524221453287.
val predictions: 100%|██████████| 18496/18496 [00:

In [147]:
print(study.best_params, study.best_value)

{'topk': 83, 'shrink': 1.2726236420696848, 'use_binary': False} 0.1290062716262976


In [181]:
best_params = study.best_params
W_itemknn = train_itemknn(matrix_train, **best_params)
itemknn_result = validator.evaluate_split(
    'test', 
    predictor=itemknn_predict, 
    preds_col='itemknn_preds', 
    top_k=10,
    cross_val_folds=5)

test predictions: 100%|██████████| 24661/24661 [00:18<00:00, 1360.07it/s]


In [149]:
itemknn_result['cross_val_mean']

{'hit_rate@10': 0.8042660704535669,
 'precision@10': 0.12916350960124387,
 'recall@10': 0.4670517856507077,
 'mrr@10': 0.4913607057990087,
 'ndcg@10': 0.3777596238388176}

In [150]:
itemknn_result['fold_metrics']

[{'fold': 1,
  'hit_rate@10': 0.7987026150415568,
  'precision@10': 0.12785323332657614,
  'recall@10': 0.4640769646597744,
  'mrr@10': 0.48705848850791067,
  'ndcg@10': 0.37408461553456424},
 {'fold': 2,
  'hit_rate@10': 0.7942011354420113,
  'precision@10': 0.12745336577453367,
  'recall@10': 0.4643861275248136,
  'mrr@10': 0.4847001763668429,
  'ndcg@10': 0.3749797169585925},
 {'fold': 3,
  'hit_rate@10': 0.8116382806163828,
  'precision@10': 0.13006893755068938,
  'recall@10': 0.47106634354201266,
  'mrr@10': 0.49476684496453355,
  'ndcg@10': 0.38091724683795636},
 {'fold': 4,
  'hit_rate@10': 0.8043390105433901,
  'precision@10': 0.13065693430656936,
  'recall@10': 0.4687038968060866,
  'mrr@10': 0.49268809781279366,
  'ndcg@10': 0.37881525047666426},
 {'fold': 5,
  'hit_rate@10': 0.8124493106244931,
  'precision@10': 0.1297850770478508,
  'recall@10': 0.4670255957208512,
  'mrr@10': 0.49758992134296276,
  'ndcg@10': 0.3800012893863107}]

In [159]:
validator.inspect_user(user_id=31, split='test', preds_col='itemknn_preds')

RECIPE: https://www.povarenok.ru/recipes/show/39037/ (User ID: 31) | split: test
Observed Ingredients (5)
Специи, Масло растительное, Зелень, Капуста пекинская, Авокадо

Ground Truth Ingredients (3)
Креветки, Лимон, Консервы рыбные

Top Recommended Ingredients (10)
Лук репчатый, Чеснок, Морковь, Помидор, Яйцо куриное, Перец болгарский, Картофель, Соевый соус, Сыр твердый, Вода


## Graph model (weighted_score)

вайбкодингом упрощали эту модель вместе, а то там многовато и сильно на других данных

In [176]:
from validate import RecommendationValidator
from graph_model import create_graph_predictor
from graph_builder_simple import SimpleBipartiteGraph, SimpleGraphRecommender

validator = RecommendationValidator(data_path='recipes_normalized.csv')

recipes = pd.read_csv('recipes_normalized.csv')

graph_builder = SimpleBipartiteGraph(recipes, min_ingredient_freq=5)
graph_builder.prepare_data()          
graph_builder.build_graph()            
graph_builder.compute_ingredient_weights() 

recommender = SimpleGraphRecommender(graph_builder)

graph_predictor = create_graph_predictor(
    recommender=recommender,
    id2item=validator.id2item,
    top_k=10,
    method='weighted_scoring_v2',
)

graph_result = validator.evaluate_split(
    split='test',
    predictor=graph_predictor,
    preds_col='graph_preds',
    top_k=10,
    cross_val_folds=5
)

Preparing interactions: 100%|██████████| 146581/146581 [00:08<00:00, 16791.87it/s]


interactions left after k-core filtering: 978048


Подсчет частот: 100%|██████████| 145372/145372 [00:00<00:00, 548011.79it/s]


   Всего уникальных ингредиентов: 979
   Частых ингредиентов (>=5): 875


Добавление рецептов: 100%|██████████| 145362/145362 [00:05<00:00, 25588.85it/s]


Узлов рецептов: 145362
Узлов ингредиентов: 875
Ребер: 1275847


Вычисление весов: 100%|██████████| 875/875 [00:01<00:00, 647.22it/s]



 SimpleGraphRecommender инициализирован
   Уникальных ингредиентов: 875
   Топ-5 популярных: [('Соль', 77978), ('Яйцо куриное', 58192), ('Масло растительное', 57529), ('Мука пшеничная', 50675), ('Сахар-песок', 50375)]


test predictions: 100%|██████████| 24661/24661 [2:50:22<00:00,  2.41it/s]  


In [178]:
graph_result['cross_val_mean']

{'hit_rate@10': 0.8360165635575101,
 'precision@10': 0.1414784848519225,
 'recall@10': 0.5125739049036515,
 'mrr@10': 0.5237337318745215,
 'ndcg@10': 0.414380906659573}

In [179]:
graph_result['fold_metrics']

[{'fold': 1,
  'hit_rate@10': 0.8315426718021488,
  'precision@10': 0.14056355159132372,
  'recall@10': 0.5116320600812796,
  'mrr@10': 0.5194259425508158,
  'ndcg@10': 0.4117463638431067},
 {'fold': 2,
  'hit_rate@10': 0.8294809407948094,
  'precision@10': 0.13943633414436335,
  'recall@10': 0.5088484339396747,
  'mrr@10': 0.5212404575239125,
  'ndcg@10': 0.4125342779496018},
 {'fold': 3,
  'hit_rate@10': 0.8440794809407948,
  'precision@10': 0.14211273317112735,
  'recall@10': 0.5155260752584353,
  'mrr@10': 0.5304247286911521,
  'ndcg@10': 0.41893833727206103},
 {'fold': 4,
  'hit_rate@10': 0.833941605839416,
  'precision@10': 0.1425993511759935,
  'recall@10': 0.5115564534816359,
  'mrr@10': 0.5236736280075696,
  'ndcg@10': 0.41351803354078714},
 {'fold': 5,
  'hit_rate@10': 0.8410381184103812,
  'precision@10': 0.14268045417680456,
  'recall@10': 0.5153065017572317,
  'mrr@10': 0.523903902599158,
  'ndcg@10': 0.4151675206923084}]

## Check stat significance

##### Что тестируем

$H_0$: две модели не различаются по качеству (средние метрики равны) 

$H_1$: модели различаются (средние метрики не равны)

используем Wilcoxon signed-rank test (не делает предположения о том, что разности распределены нормально, в отличие от paired t-test, а также о независимости наблюдений)

1. Для каждого пользователя считаем метрику у обеих моделей
2. Вычисляем разницу (модель A − модель B)
3. Ранжируем абсолютные разницы и суммируем ранги с учётом знака
4. Если модели одинаковы, сумма рангов близка к нулю. Если одна модель стабильно лучше, сумма заметно отклоняется

напоминание всем читающим: $p_{val} < 0.05 \Rightarrow$ отклоняем $H_0$

у нас достаточно большая тестовая выборка (24к корзин), получили хоть и маленькие, но стат значимые на 5\% различия между моделями (EASE лучше SLIM, SLIM лучше ItemKNN): но это вообще говоря не говорит о практически важном различии результатов, в общем, надо мерять онлайн

**бутстрап сделан для проверки ДИ для разницы значения метрики, по которой сравниваем модели

In [None]:
from stats import compare_models_paired

#per-user метрики для каждой модели
per_user_graph = validator.get_per_user_metrics('test', 'graph_preds', top_k=10)
per_user_ease = validator.get_per_user_metrics('test', 'ease_preds', top_k=10)

result = compare_models_paired(
    per_user_graph,
    per_user_ease,
    metric_name='precision@10',
    test_type='wilcoxon',
    alpha=0.05
)

print(f"p-value: {result['p_value']:.4f}")
print(f"Significant: {result['significant']}")
print(f"Mean difference (Graph - EASE) by precision@10: {result['mean_diff']:.4f}")

p-value: 0.0000
Significant: True
Mean difference (Graph - EASE): 0.1394


In [161]:
from stats import compare_models_paired

#per-user метрики для каждой модели
per_user_ease = validator.get_per_user_metrics('test', 'ease_preds', top_k=10)
per_user_slim = validator.get_per_user_metrics('test', 'slim_preds', top_k=10)

result = compare_models_paired(
    per_user_ease,
    per_user_slim,
    metric_name='precision@10',
    test_type='wilcoxon',
    alpha=0.05,
)

print(f"p-value: {result['p_value']:.4f}")
print(f"Significant: {result['significant']}")
print(f"Mean difference (EASE - SLIM): {result['mean_diff']:.4f}")

p-value: 0.0000
Significant: True
Mean difference (EASE - SLIM): 0.0043


In [162]:
from stats import bootstrap_confidence_interval

ci = bootstrap_confidence_interval(
    per_user_ease,
    per_user_slim,
    metric_name='precision@10',
    n_bootstrap=1000,
    confidence=0.95,
)

print(f"Mean difference: {ci['mean_diff']:.4f}")
print(f"95% CI: [{ci['lower_bound']:.4f}, {ci['upper_bound']:.4f}]")

Mean difference: 0.0043
95% CI: [0.0038, 0.0048]


In [None]:
from stats import bootstrap_confidence_interval

per_user_graph = validator.get_per_user_metrics('test', 'graph_preds', top_k=10)

ci = bootstrap_confidence_interval(
    per_user_graph,
    per_user_ease,
    metric_name='precision@10',
    n_bootstrap=1000,
    confidence=0.95
)

print(f"Mean difference: {ci['mean_diff']:.4f}")
print(f"95% CI: [{ci['lower_bound']:.4f}, {ci['upper_bound']:.4f}]")

Mean difference: 0.0008
95% CI: [-0.0021, 0.0038]


In [183]:
from stats import compare_multiple_models

per_user_itemknn = validator.get_per_user_metrics('test', 'itemknn_preds', top_k=10)
per_user_graph = validator.get_per_user_metrics('test', 'graph_preds', top_k=10)

all_models = {
    'graph': per_user_graph,
    'EASE': per_user_ease,
    'SLIM': per_user_slim,
    'item-kNN': per_user_itemknn
}

comparisons = compare_multiple_models(
    all_models,
    metric_name='precision@10',
    alpha=0.05,
)

comparisons

Unnamed: 0,model_a,model_b,p_value,significant,mean_diff,mean_a,mean_b
2,graph,item-kNN,0.0,True,0.138628,0.141478,0.002851
4,EASE,item-kNN,0.0,True,0.136791,0.13973,0.00294
5,SLIM,item-kNN,0.0,True,0.132559,0.135499,0.00294
3,EASE,SLIM,2.9792640000000002e-52,True,0.004302,0.138891,0.134589
1,graph,SLIM,0.001010994,True,0.005074,0.140573,0.135499
0,graph,EASE,0.5131311,False,0.000843,0.140573,0.13973


In [187]:
metrics_to_test = ['hit_rate@10', 'precision@10', 'recall@10', 'mrr@10', 'ndcg@10']

for metric in metrics_to_test:
    result = compare_models_paired(per_user_graph, per_user_ease, metric)
    print(f"{metric}:")
    print(f"p-value: {result['p_value']:.4f}")
    print(f"Significant: {result['significant']}")
    print(f"Graph mean: {result['mean_a']:.4f}")
    print(f"EASE mean: {result['mean_b']:.4f}")
    print(f"Difference: {result['mean_diff']:.4f}")
    print()

hit_rate@10:
p-value: 0.7305
Significant: False
Graph mean: 0.8373
EASE mean: 0.8395
Difference: -0.0022

precision@10:
p-value: 0.5131
Significant: False
Graph mean: 0.1406
EASE mean: 0.1397
Difference: 0.0008

recall@10:
p-value: 0.1504
Significant: False
Graph mean: 0.5108
EASE mean: 0.5041
Difference: 0.0067

mrr@10:
p-value: 0.0159
Significant: True
Graph mean: 0.5238
EASE mean: 0.5404
Difference: -0.0167

ndcg@10:
p-value: 0.6100
Significant: False
Graph mean: 0.4136
EASE mean: 0.4160
Difference: -0.0025



In [165]:
metrics_to_test = ['hit_rate@10', 'precision@10', 'recall@10', 'mrr@10', 'ndcg@10']

for metric in metrics_to_test:
    result = compare_models_paired(per_user_ease, per_user_slim, metric)
    print(f"{metric}:")
    print(f"p-value: {result['p_value']:.4f}")
    print(f"Significant: {result['significant']}")
    print(f"EASE mean: {result['mean_a']:.4f}")
    print(f"SLIM mean: {result['mean_b']:.4f}")
    print(f"Difference: {result['mean_diff']:.4f}")
    print()

hit_rate@10:
p-value: 0.0000
Significant: True
EASE mean: 0.8343
SLIM mean: 0.8210
Difference: 0.0133

precision@10:
p-value: 0.0000
Significant: True
EASE mean: 0.1389
SLIM mean: 0.1346
Difference: 0.0043

recall@10:
p-value: 0.0000
Significant: True
EASE mean: 0.5022
SLIM mean: 0.4861
Difference: 0.0161

mrr@10:
p-value: 0.0000
Significant: True
EASE mean: 0.5384
SLIM mean: 0.5209
Difference: 0.0175

ndcg@10:
p-value: 0.0000
Significant: True
EASE mean: 0.4153
SLIM mean: 0.4008
Difference: 0.0146



Все таки прочекаем нормальное ли распределение разностей между метриками различных моделей и результаты paired t-testа (на всякий случай)

In [None]:
from stats import compare_both_tests

comparison = compare_both_tests(
    per_user_ease,
    per_user_slim,
    metric_name='precision@10',
    alpha=0.05,
)

comparison

Unnamed: 0,metric,n_users,is_normal,normality_p_value,skewness,kurtosis,wilcoxon_p_value,wilcoxon_significant,ttest_p_value,ttest_significant,mean_diff,mean_a,mean_b
0,precision@10,24661,False,0.0,0.423609,4.226005,2.9792640000000002e-52,True,3.5118e-60,True,0.004302,0.138891,0.134589


In [None]:
comparison = compare_both_tests(
    per_user_graph,
    per_user_ease,
    metric_name='precision@10',
    alpha=0.05
)

comparison

Unnamed: 0,metric,n_users,is_normal,normality_p_value,skewness,kurtosis,wilcoxon_p_value,wilcoxon_significant,ttest_p_value,ttest_significant,mean_diff,mean_a,mean_b
0,precision@10,24661,False,0.0,0.391198,0.289232,0.0,True,0.0,True,0.139394,0.141478,0.002084


In [169]:
from stats import check_normality

metrics_to_test = ['hit_rate@10', 'precision@10', 'recall@10', 'mrr@10', 'ndcg@10']

all_comparisons = []
for metric in metrics_to_test:
    comp = compare_both_tests(per_user_ease, per_user_slim, metric)
    all_comparisons.append(comp)

results_df = pd.concat(all_comparisons, ignore_index=True)
results_df[['metric', 'is_normal', 'wilcoxon_p_value', 'ttest_p_value', 
                  'wilcoxon_significant', 'ttest_significant', 'mean_diff']]

Unnamed: 0,metric,is_normal,wilcoxon_p_value,ttest_p_value,wilcoxon_significant,ttest_significant,mean_diff
0,hit_rate@10,False,6.424025000000001e-22,5.89936e-22,True,True,0.013341
1,precision@10,False,2.9792640000000002e-52,3.5118e-60,True,True,0.004302
2,recall@10,False,2.3207800000000002e-54,3.9659770000000004e-60,True,True,0.016136
3,mrr@10,False,2.590262e-111,2.665938e-67,True,True,0.017498
4,ndcg@10,False,2.7654219999999996e-168,2.8287600000000003e-128,True,True,0.014566
