In [None]:
# Presumably toggles a quick integrity-check mode; no cell visible in this
# section reads it -- TODO confirm it is still needed.
just_checking_integrity = False

In [None]:
import gc
import pandas as pd
import scipy.sparse as sps
import numpy as np

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read relative paths under
# ./drive/MyDrive, which only resolve when the working directory is /content
# (presumably the Colab default -- confirm if this is run elsewhere).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Axes of the evaluation grid: each (ranker, model set, dataset split) triple
# identifies one `<dataset>_sub.csv` file under the rankers/ directory.
ranker_types = ['xgboost', 'lightgbm', 'catboost']
model_types = ['graph_and_lstm', 'all_traditional_and_lstm']
dataset_types = ['leaderboard', 'final']

# Maps '<ranker>_<model>_<dataset>' to the DataFrame read from that file.
models_ranking_dict = {}

for ranker_type in ranker_types:
    for model_type in model_types:
        # This single pairing is excluded from the grid. `continue` skips only
        # this pairing; a `break` here would also drop every model type listed
        # after it if `model_types` were ever reordered or extended.
        if ranker_type == 'catboost' and model_type == 'all_traditional_and_lstm':
            continue
        for dataset_type in dataset_types:
            key = f'{ranker_type}_{model_type}_{dataset_type}'
            models_ranking_dict[key] = pd.read_csv(f'./drive/MyDrive/Recommendation_system/rankers/{ranker_type}/{model_type}/{dataset_type}/{dataset_type}_sub.csv')

In [None]:
# One purchases table per dataset split, keyed by split name. The 'date'
# column is discarded right after loading.
purchased_in_sessions_dict = {}
for dataset_type in dataset_types:
    purchases_csv_path = f'./drive/MyDrive/Recommendation_system/dataset/original_data/test_{dataset_type}_purchases.csv'
    purchased_in_sessions_dict[dataset_type] = pd.read_csv(purchases_csv_path).drop(columns='date')

In [None]:
# For every dataset split, left-join each model's ranking onto the purchases
# table so that each model contributes one column (named '<ranker>_<model>')
# holding the rank it gave to the purchased item. Pairs with no match in a
# ranking keep NaN in that model's column.
for dataset_type in dataset_types:
    purchased_df = purchased_in_sessions_dict[dataset_type]

    for ranker_type in ranker_types:
        for model_type in model_types:
            # Skip only the excluded pairing; `break` would also drop any model
            # types listed after it.
            if ranker_type == 'catboost' and model_type == 'all_traditional_and_lstm':
                continue

            purchased_df_col_name = f'{ranker_type}_{model_type}'
            ranking_df_key = f'{purchased_df_col_name}_{dataset_type}'

            # Keep the cell idempotent: re-running it would otherwise merge the
            # same ranking again and create a second column with the same name.
            if purchased_df_col_name in purchased_df.columns:
                continue

            ranking_df = models_ranking_dict[ranking_df_key]

            purchased_df = pd.merge(purchased_df, ranking_df, on=['session_id', 'item_id'], how='left')
            purchased_df = purchased_df.rename(columns={'rank': purchased_df_col_name})

    purchased_in_sessions_dict[dataset_type] = purchased_df

In [None]:
# Print the mean reciprocal rank (MRR) of every single model, per dataset split.
for dataset_type in dataset_types:
    print('dataset type:', dataset_type)
    print()
    for ranker_type in ranker_types:
        for model_type in model_types:
            # Skip only the excluded pairing; `break` would also drop any model
            # types listed after it.
            if ranker_type == 'catboost' and model_type == 'all_traditional_and_lstm':
                continue

            purchased_df_col_name = f'{ranker_type}_{model_type}'

            model_ranking = np.array(purchased_in_sessions_dict[dataset_type][purchased_df_col_name])
            # NaN ranks add nothing to the sum (nansum) but still count in the
            # denominator, so a purchase without a rank scores 0.
            mrr = np.nansum(1 / model_ranking) / len(model_ranking)

            print(f'mrr for model {purchased_df_col_name}:', mrr)

    print('-----------------------------------------------')

dataset type: leaderboard

mrr for model xgboost_graph_and_lstm: 0.18970638849931315
mrr for model xgboost_all_traditional_and_lstm: 0.19261181912533581
mrr for model lightgbm_graph_and_lstm: 0.1918612865955328
mrr for model lightgbm_all_traditional_and_lstm: 0.19465534076005214
mrr for model catboost_graph_and_lstm: 0.19149389407521758
-----------------------------------------------
dataset type: final

mrr for model xgboost_graph_and_lstm: 0.19146008249415708
mrr for model xgboost_all_traditional_and_lstm: 0.19459936818624246
mrr for model lightgbm_graph_and_lstm: 0.19299816460638367
mrr for model lightgbm_all_traditional_and_lstm: 0.19635226966331254
mrr for model catboost_graph_and_lstm: 0.1928642719579282
-----------------------------------------------


In [None]:
# Oracle upper bound: the MRR we would obtain if, for every purchase, we knew
# in advance which model gives the purchased item its best (lowest) rank.

for dataset_type in dataset_types:
    print('dataset type:', dataset_type)
    print()
    # Every column except the two key columns is treated as a rank column.
    rank_columns_df = purchased_in_sessions_dict[dataset_type].drop(columns=['session_id', 'item_id'])
    # pandas' row-wise min skips NaN, so a purchase ranked by at least one model
    # keeps its best rank. ndarray.min would propagate NaN and score such a row
    # as 0, understating the bound. Rows that are NaN in every column stay NaN
    # and contribute 0 through nansum below.
    best_combination_off_all_models_together = rank_columns_df.min(axis=1).to_numpy()
    mrr = np.nansum(1 / best_combination_off_all_models_together) / len(best_combination_off_all_models_together)

    print('best_combination_off_all_models_together', mrr)
    print('-----------------------------------------------')

dataset type: leaderboard

best_combination_off_all_models_together 0.2135278655465982
-----------------------------------------------
dataset type: final

best_combination_off_all_models_together 0.21521620795792307
-----------------------------------------------


In [None]:
from itertools import combinations

# Rank-sum ensembling: for every subset of two or more models, add up the ranks
# each model gives to a (session_id, item_id) pair, re-rank within each session
# by that sum (lower is better), keep the first `top_k` positions, and attach
# the resulting rank of the purchased item to the purchases table as a new
# column named after the joined model keys.
top_k = 100

for dataset_type in dataset_types:
    model_options = []
    for ranker_type in ranker_types:
        for model_type in model_types:
            # Skip only the excluded pairing; `break` would also drop any model
            # types listed after it.
            if ranker_type == 'catboost' and model_type == 'all_traditional_and_lstm':
                continue

            purchased_df_col_name = f'{ranker_type}_{model_type}'
            ranking_df_key = f'{purchased_df_col_name}_{dataset_type}'

            model_options.append(ranking_df_key)

    # All subsets containing at least two models.
    all_combinations = []
    for r in range(2, len(model_options) + 1):
        all_combinations.extend(combinations(model_options, r))

    purchased_df = purchased_in_sessions_dict[dataset_type]

    for combination in all_combinations:
        combined_models = '_&_'.join(combination)

        # Keep the cell idempotent: a re-run must not append duplicate columns.
        if combined_models in purchased_df.columns:
            continue

        ranks_df = None
        # Total penalty owed by a pair that none of the already-merged models listed.
        penalty_so_far = 0

        for model_key in combination:
            models_ranking_df = models_ranking_dict[model_key]
            # A pair missing from a model's list is charged one position worse
            # than the worst rank that model assigns anywhere. Filling with 0
            # would instead treat "not recommended" as better than rank 1 and
            # favour items that fewer models recommend.
            model_penalty = models_ranking_df['rank'].max() + 1

            if ranks_df is None:
                ranks_df = models_ranking_df
            else:
                ranks_df = pd.merge(ranks_df, models_ranking_df, on=['session_id', 'item_id'], how='outer')
                # rank_x is the running sum over earlier models, rank_y the new model.
                ranks_df['rank'] = (
                    ranks_df['rank_x'].fillna(penalty_so_far)
                    + ranks_df['rank_y'].fillna(model_penalty)
                )
                ranks_df = ranks_df.drop(columns=['rank_x', 'rank_y'])

            penalty_so_far += model_penalty

        # Turn summed ranks into positions 1..n per session; ties are broken by row order.
        ranks_df['rank'] = ranks_df.groupby('session_id')['rank'].rank(method='first')
        ranks_df = ranks_df[ranks_df['rank'] <= top_k]

        purchased_df = pd.merge(purchased_df, ranks_df, on=['session_id', 'item_id'], how='left')
        purchased_df = purchased_df.rename(columns={'rank': combined_models})

    purchased_in_sessions_dict[dataset_type] = purchased_df

In [None]:
# Key columns that must not be scored as if they were model rank columns.
values_to_remove = ['session_id', 'item_id']

# Per dataset split: the name and MRR of the highest-scoring rank column.
best_model = {}

for dataset_type in dataset_types:
    split_df = purchased_in_sessions_dict[dataset_type]
    model_combinations = [column for column in split_df.keys() if column not in values_to_remove]

    print('dataset type:', dataset_type)
    print()

    best_model_name, best_model_mrr = 'None', 0
    for model_combination in model_combinations:
        model_ranking = np.array(split_df[model_combination])
        # NaN ranks are ignored by nansum yet still counted in the denominator.
        reciprocal_ranks = 1 / model_ranking
        mrr = np.nansum(reciprocal_ranks) / len(model_ranking)

        print(f'mrr for model {model_combination}:', mrr)

        # Strict comparison: on a tie the earlier column stays the winner.
        if mrr > best_model_mrr:
            best_model_name, best_model_mrr = model_combination, mrr

    best_model[dataset_type] = dict(name=best_model_name, mrr=best_model_mrr)

    print('---------------------------------------------------------------------------------------------------------------------------------------------')

dataset type: leaderboard

mrr for model xgboost_graph_and_lstm: 0.18970638849931315
mrr for model xgboost_all_traditional_and_lstm: 0.19261181912533581
mrr for model lightgbm_graph_and_lstm: 0.1918612865955328
mrr for model lightgbm_all_traditional_and_lstm: 0.19465534076005214
mrr for model catboost_graph_and_lstm: 0.19149389407521758
mrr for model xgboost_graph_and_lstm_leaderboard_&_xgboost_all_traditional_and_lstm_leaderboard: 0.18961156509107527
mrr for model xgboost_graph_and_lstm_leaderboard_&_lightgbm_graph_and_lstm_leaderboard: 0.1902036315294364
mrr for model xgboost_graph_and_lstm_leaderboard_&_lightgbm_all_traditional_and_lstm_leaderboard: 0.19069652369104065
mrr for model xgboost_graph_and_lstm_leaderboard_&_catboost_graph_and_lstm_leaderboard: 0.1901533160746555
mrr for model xgboost_all_traditional_and_lstm_leaderboard_&_lightgbm_graph_and_lstm_leaderboard: 0.19175721502532048
mrr for model xgboost_all_traditional_and_lstm_leaderboard_&_lightgbm_all_traditional_and_lstm

In [None]:
# Summarise the winning rank column and its MRR for each dataset split.
for dataset_type in dataset_types:
    winner = best_model[dataset_type]
    print('dataset type:', dataset_type)
    print()
    print('best model:', winner['name'], '\nmrr:', winner['mrr'])
    print('---------------------------------------------------------------------------------------------------------------------------------------------')

dataset type: leaderboard

best model: lightgbm_all_traditional_and_lstm 
mrr: 0.19465534076005214
---------------------------------------------------------------------------------------------------------------------------------------------
dataset type: final

best model: lightgbm_all_traditional_and_lstm 
mrr: 0.19635226966331254
---------------------------------------------------------------------------------------------------------------------------------------------
