In [46]:
import numpy as np
from sklearn.metrics import roc_auc_score
# from tqdm import tqdm
from tqdm.notebook import tqdm
import torch
from config import model_name
from torch.utils.data import Dataset, DataLoader
from os import path
import sys
import pandas as pd
from ast import literal_eval
import importlib
from multiprocessing import Pool

# model_name: str = 'NRMS'
# model_name: str = 'NAML'
# model_name: str = 'TANR'
# model_name: str = 'LSTUR'
# model_name: str = 'DKN'
# model_name: str = 'HiFiArk'
# model_name: str = 'Exp1'

try:
    Model = getattr(importlib.import_module(f"model.{model_name}"), model_name)
    config = getattr(importlib.import_module('config'), f"{model_name}Config")
except AttributeError:
    print(f"{model_name} not included!")
    exit()

device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

RESULT_CSV = 'results.csv'
norm = lambda x: (x-np.min(x)) / (np.max(x)-np.min(x))

In [47]:
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2**y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)


def value2rank(d):
    values = list(d.values())
    ranks = [sorted(values, reverse=True).index(x) for x in values]
    return {k: ranks[i] + 1 for i, k in enumerate(d.keys())}


class NewsDataset(Dataset):
    """
    Load news for evaluation.
    """
    def __init__(self, news_path):
        super(NewsDataset, self).__init__()
        self.news_parsed = pd.read_table(
            news_path,
            usecols=['id'] + config.dataset_attributes['news'],
            converters={
                attribute: literal_eval
                for attribute in set(config.dataset_attributes['news']) & set([
                    'title', 'abstract', 'title_entities', 'abstract_entities'
                ])
            })
        self.news2dict = self.news_parsed.to_dict('index')
        for key1 in self.news2dict.keys():
            for key2 in self.news2dict[key1].keys():
                if type(self.news2dict[key1][key2]) != str:
                    self.news2dict[key1][key2] = torch.tensor(
                        self.news2dict[key1][key2])

    def __len__(self):
        return len(self.news_parsed)

    def __getitem__(self, idx):
        item = self.news2dict[idx]
        return item


class UserDataset(Dataset):
    """
    Load users for evaluation, duplicated rows will be dropped
    """
    def __init__(self, behaviors_path, user2int_path):
        super(UserDataset, self).__init__()
        self.behaviors = pd.read_table(behaviors_path,
                                       header=None,
                                       usecols=[1, 3],
                                       names=['user', 'clicked_news'])
        self.behaviors.clicked_news.fillna(' ', inplace=True)
        self.behaviors.drop_duplicates(inplace=True)
        user2int = dict(pd.read_table(user2int_path).values.tolist())
        user_total = 0
        user_missed = 0
        for row in self.behaviors.itertuples():
            user_total += 1
            if row.user in user2int:
                self.behaviors.at[row.Index, 'user'] = user2int[row.user]
            else:
                user_missed += 1
                self.behaviors.at[row.Index, 'user'] = 0
        if model_name == 'LSTUR':
            print(f'User miss rate: {user_missed/user_total:.4f}')

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        row = self.behaviors.iloc[idx]
        item = {
            "user":
            row.user,
            "clicked_news_string":
            row.clicked_news,
            "clicked_news":
            row.clicked_news.split()[:config.num_clicked_news_a_user]
        }
        item['clicked_news_length'] = len(item["clicked_news"])
        repeated_times = config.num_clicked_news_a_user - len(
            item["clicked_news"])
        assert repeated_times >= 0
        item["clicked_news"] = ['PADDED_NEWS'
                                ] * repeated_times + item["clicked_news"]

        return item


class BehaviorsDataset(Dataset):
    """
    Load behaviors for evaluation, (user, time) pair as session
    """
    def __init__(self, behaviors_path):
        super(BehaviorsDataset, self).__init__()
        self.behaviors = pd.read_table(behaviors_path,
                                       header=None,
                                       usecols=range(5),
                                       names=[
                                           'impression_id', 'user', 'time',
                                           'clicked_news', 'impressions'
                                       ])
        self.behaviors.clicked_news.fillna(' ', inplace=True)
        self.behaviors.impressions = self.behaviors.impressions.str.split()

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        row = self.behaviors.iloc[idx]
        item = {
            "impression_id": row.impression_id,
            "user": row.user,
            "time": row.time,
            "clicked_news_string": row.clicked_news,
            "impressions": row.impressions
        }
        return item


def calculate_single_user_metric(pair):
    try:
        auc = roc_auc_score(*pair)
        mrr = mrr_score(*pair)
        ndcg5 = ndcg_score(*pair, 5)
        ndcg10 = ndcg_score(*pair, 10)
        return [auc, mrr, ndcg5, ndcg10]
    except ValueError:
        return [np.nan] * 4


@torch.no_grad()
def evaluate(model, directory, num_workers, max_count=sys.maxsize, mode='test'):
    """
    Evaluate model on target directory.
    Args:
        model: model to be evaluated
        directory: the directory that contains two files (behaviors.tsv, news_parsed.tsv)
        num_workers: processes number for calculating metrics
    Returns:
        AUC
        MRR
        nDCG@5
        nDCG@10
    """
    news_dataset = NewsDataset(path.join(directory, 'news_parsed.tsv'))
    news_dataloader = DataLoader(news_dataset,
                                 batch_size=config.batch_size * 16,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 drop_last=False,
                                 pin_memory=True)

    news2vector = {}
    for minibatch in tqdm(news_dataloader,
                          desc="Calculating vectors for news"):
        news_ids = minibatch["id"]
        if any(id not in news2vector for id in news_ids):
            news_vector = model.get_news_vector(minibatch)
            for id, vector in zip(news_ids, news_vector):
                if id not in news2vector:
                    news2vector[id] = vector

    news2vector['PADDED_NEWS'] = torch.zeros(
        list(news2vector.values())[0].size())

    user_dataset = UserDataset(path.join(directory, 'behaviors.tsv'),
                               'data/train/user2int.tsv')
    user_dataloader = DataLoader(user_dataset,
                                 batch_size=config.batch_size * 16,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 drop_last=False,
                                 pin_memory=True)

    user2vector = {}
    for minibatch in tqdm(user_dataloader,
                          desc="Calculating vectors for users"):
        user_strings = minibatch["clicked_news_string"]
        if any(user_string not in user2vector for user_string in user_strings):
            clicked_news_vector = torch.stack([
                torch.stack([news2vector[x].to(device) for x in news_list],
                            dim=0) for news_list in minibatch["clicked_news"]
            ],
                                              dim=0).transpose(0, 1)
            if model_name == 'LSTUR':
                user_vector = model.get_user_vector(
                    minibatch['user'], minibatch['clicked_news_length'],
                    clicked_news_vector)
            else:
                user_vector = model.get_user_vector(clicked_news_vector)
            for user, vector in zip(user_strings, user_vector):
                if user not in user2vector:
                    user2vector[user] = vector

    behaviors_dataset = BehaviorsDataset(path.join(directory, 'behaviors.tsv'))
    behaviors_dataloader = DataLoader(behaviors_dataset,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=config.num_workers)

    count = 0

    tasks = []
    result_dict = {}

    for minibatch in tqdm(behaviors_dataloader,
                          desc="Calculating probabilities"):
        count += 1
        if count == max_count:
            break

        candidate_news_vector = torch.stack([
            news2vector[news[0].split('-')[0]]
            for news in minibatch['impressions']
        ],
                                            dim=0)
        user_vector = user2vector[minibatch['clicked_news_string'][0]]
        click_probability = model.get_prediction(candidate_news_vector,
                                                 user_vector)

        y_pred = click_probability.tolist()
        if mode == 'train':
            y_true = [
                int(news[0].split('-')[1]) for news in minibatch['impressions']
            ]
            tasks.append((y_true, y_pred))
        elif mode == 'test':
            result_dict[f'{count-1}'] = norm(y_pred)

    if mode == 'train':
        with Pool(processes=num_workers) as pool:
            results = pool.map(calculate_single_user_metric, tasks)

        aucs, mrrs, ndcg5s, ndcg10s = np.array(results).T
        return np.nanmean(aucs), np.nanmean(mrrs), np.nanmean(ndcg5s), np.nanmean(
            ndcg10s)
    elif mode == 'test':
        return result_dict


In [48]:
print('Using device:', device)
print(f'Evaluating model {model_name}')
# Don't need to load pretrained word/entity/context embedding
# since it will be loaded from checkpoint later
model = Model(config).to(device)
from train import latest_checkpoint  # Avoid circular imports
checkpoint_path = latest_checkpoint(path.join('./checkpoint', model_name))
if checkpoint_path is None:
    print('No checkpoint file found!')
    exit()
print(f"Load saved parameters in {checkpoint_path}")
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
auc, mrr, ndcg5, ndcg10 = evaluate(model, './data/val',
                                   config.num_workers, mode='train')
print(
    f'AUC: {auc:.4f}\nMRR: {mrr:.4f}\nnDCG@5: {ndcg5:.4f}\nnDCG@10: {ndcg10:.4f}'
)

y_preds = evaluate(model, './data/test', config.num_workers, mode='test')

Using device: cuda:3
Evaluating model TANR
Load saved parameters in ./checkpoint/TANR/ckpt-8000.pth


Calculating vectors for news:   0%|          | 0/25 [00:00<?, ?it/s]

Calculating vectors for users:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/28531 [00:00<?, ?it/s]

AUC: 0.7373
MRR: 0.4098
nDCG@5: 0.4903
nDCG@10: 0.5820


Calculating vectors for news:   0%|          | 0/18 [00:00<?, ?it/s]

Calculating vectors for users:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/46332 [00:00<?, ?it/s]

In [49]:
results_to_submit = pd.DataFrame(y_preds).T
# results_to_submit.columns = ["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"]
# results_to_submit

results_to_submit.to_csv(
  'results.csv',
  header=["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"],
  index_label='index'
  )
pd.read_csv(RESULT_CSV, )

Unnamed: 0,index,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15
0,0,0.965313,0.204751,0.035420,1.000000,0.159142,0.000000,0.124784,0.151610,0.136660,0.193609,0.398131,0.265642,0.128594,0.372625,0.641762
1,1,0.556415,0.373567,0.358529,0.000000,0.270856,0.229947,0.308473,0.762070,1.000000,0.600530,0.113563,0.383396,0.488201,0.440063,0.491090
2,2,0.752334,1.000000,0.607291,0.002380,0.338792,0.193911,0.555606,0.072123,0.530488,0.739669,0.158514,0.474470,0.496779,0.268669,0.000000
3,3,0.494238,0.235841,0.402977,0.228699,1.000000,0.438163,0.311420,0.381508,0.000000,0.377915,0.207145,0.268098,0.249367,0.330327,0.658996
4,4,0.054635,1.000000,0.309762,0.682028,0.552010,0.589548,0.633458,0.907023,0.952393,0.000000,0.071843,0.415229,0.920338,0.928573,0.661943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46327,46327,0.062091,0.157472,1.000000,0.739967,0.436768,0.390147,0.000000,0.424773,0.448743,0.467676,0.224073,0.919485,0.489150,0.112152,0.070785
46328,46328,0.000000,0.543937,0.791482,0.214576,0.721250,0.214195,0.519369,0.569935,0.960954,0.417863,0.496947,1.000000,0.314287,0.098834,0.193188
46329,46329,0.390703,0.000000,1.000000,0.608238,0.075430,0.558741,0.310884,0.317546,0.972612,0.187740,0.406213,0.111873,0.069026,0.454600,0.437820
46330,46330,0.885707,0.044853,0.369273,0.725043,0.563176,0.103416,0.445261,0.927044,0.561305,0.000000,0.439365,0.863669,0.277340,0.582923,1.000000


### 自定義需要的log

In [50]:
from config import BaseConfig

num_epochs = BaseConfig.num_epochs
# Number of batchs to show loss
num_batches_show_loss = BaseConfig.num_batches_show_loss
# Number of batchs to check metrics on validation dataset
num_batches_validate = BaseConfig.num_batches_validate
batch_size = BaseConfig.batch_size
learning_rate = BaseConfig.learning_rate
# Number of workers for data loading
num_workers = BaseConfig.num_workers
# Number of sampled click history for each user
num_clicked_news_a_user = BaseConfig.num_clicked_news_a_user
num_words_title = BaseConfig.num_words_title
num_words_abstract = BaseConfig.num_words_abstract
word_freq_threshold = BaseConfig.word_freq_threshold
entity_freq_threshold = BaseConfig.entity_freq_threshold
entity_confidence_threshold = BaseConfig.entity_confidence_threshold
# K
negative_sampling_ratio = BaseConfig.negative_sampling_ratio
dropout_probability = BaseConfig.dropout_probability
# Modify the following by the output of `src/dataprocess.py`
num_words = BaseConfig.num_words
num_categories = BaseConfig.num_categories
num_entities = BaseConfig.num_entities
num_users = BaseConfig.num_users
word_embedding_dim = BaseConfig.word_embedding_dim
category_embedding_dim = BaseConfig.category_embedding_dim
# Modify the following only if you use another dataset
entity_embedding_dim = BaseConfig.entity_embedding_dim
# For additive attention
query_vector_dim = BaseConfig.query_vector_dim

In [51]:
EXTRA_MSG: str = ('' + \
  # f'SMOTE+RANDOM stacking ' + \
  f'{num_epochs=}, '
  f'{batch_size=}, '
  f'{learning_rate=}, '
  f'{num_clicked_news_a_user=}, '
  f'{num_words_title=}, '
  f'{num_words_abstract=}, '
  f'{word_freq_threshold=}, '
  f'{entity_freq_threshold=}, '
  f'{entity_confidence_threshold=}, '
  f'{negative_sampling_ratio=}, '
  f'{dropout_probability=}, '
  # f'take away age>=90 from training data ' + \
  # f'ratio=(8, 2) ' + \
  # f'with normalization ({norm_mode=}) ' + \
  # f'Logistic Regression!' + \
  '')

# if REMOVE_MISMATCH:
#   EXTRA_MSG += f' | {REMOVE_MISMATCH=}, '
# if REFINE_CAPITAL_DIFF:
#   EXTRA_MSG += f' | {REFINE_CAPITAL_DIFF=}, '
# if REFINE_AGE:
#   EXTRA_MSG += f' | {REFINE_AGE=}, '
# if REFINE_HPWEEK:
#   EXTRA_MSG += f' | {REFINE_HPWEEK=}, '
# if REFINE_RACE:
#   EXTRA_MSG += f' | {REFINE_RACE=}, '

log = (
  f"kaggle competitions submit -c 2023-datamining-final-project -f {RESULT_CSV} -m "
  # f'''"Features: {best_config['feature']}. INFO: '''
  f'''"[{model_name}] AUC: {auc:.4f}, MRR: {mrr:.4f}, nDCG@5: {ndcg5:.4f}, nDCG@10: {ndcg10:.4f}''' 
  # [Acc={acc:.4f}, iteration={best_config['iteration']}, lr={best_config['lr']:.6f}, {l2_lambda=:.3f}] 
  f'''  *EXTRA: [{EXTRA_MSG}]."'''
)
print(log)

kaggle competitions submit -c 2023-datamining-final-project -f results.csv -m "[TANR] AUC: 0.7373, MRR: 0.4098, nDCG@5: 0.4903, nDCG@10: 0.5820  *EXTRA: [num_epochs=5, batch_size=256, learning_rate=0.0001, num_clicked_news_a_user=50, num_words_title=20, num_words_abstract=50, word_freq_threshold=1, entity_freq_threshold=2, entity_confidence_threshold=0.5, negative_sampling_ratio=2, dropout_probability=0.2, ]."


### Submmit to the Kaggle

In [53]:
# For safty.
import os
raise KeyError('Are you sure you want to submit the result?')
_ = os.system(log)

100%|██████████| 11.7M/11.7M [00:04<00:00, 2.82MB/s]


Successfully submitted to 2023 Data Mining Final Project