In [21]:
import numpy as np
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import torch
from config import model_name
from torch.utils.data import Dataset, DataLoader
from os import path
import sys
import pandas as pd
from ast import literal_eval
import importlib
from multiprocessing import Pool

# Resolve the model class and its config class dynamically from `model_name`:
# module `model.<model_name>` must expose an attribute named `<model_name>`,
# and module `config` must expose `<model_name>Config`.
try:
    Model = getattr(importlib.import_module(f"model.{model_name}"), model_name)
    config = getattr(importlib.import_module('config'), f"{model_name}Config")
except AttributeError:
    # Only a missing attribute is handled here; import errors raised by
    # import_module itself are not caught and will propagate.
    print(f"{model_name} not included!")
    exit()

# Use CUDA device index 3 when CUDA is available, otherwise fall back to CPU.
# NOTE(review): the GPU index is hard-coded — confirm the host exposes a
# device with index 3.
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

# File name of the predictions CSV that later cells read back and submit.
RESULT_CSV = 'results.csv'

In [22]:
def dcg_score(y_true, y_score, k=10):
    """
    Discounted cumulative gain of the `k` highest-scored items.

    Args:
        y_true: relevance labels, one per item
        y_score: predicted scores, one per item (higher means ranked earlier)
        k: number of top-ranked items taken into account
    Returns:
        Sum over the top-k items of (2**relevance - 1) / log2(position + 1),
        with positions counted from 1.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)
    positions = np.arange(len(relevance))
    return np.sum((2**relevance - 1) / np.log2(positions + 2))


def ndcg_score(y_true, y_score, k=10):
    """
    Normalized DCG@k: the DCG of the predicted ranking divided by the DCG of
    the ideal ranking (items ordered by their true labels).

    Args:
        y_true: relevance labels, one per item
        y_score: predicted scores, one per item
        k: number of top-ranked items taken into account
    Returns:
        dcg_score(y_true, y_score, k) / dcg_score(y_true, y_true, k)
    """
    ideal_dcg = dcg_score(y_true, y_true, k)
    return dcg_score(y_true, y_score, k) / ideal_dcg


def mrr_score(y_true, y_score):
    """
    Mean reciprocal rank of the positive items under the predicted ranking.

    Args:
        y_true: labels, one per item
        y_score: predicted scores, one per item (higher means ranked earlier)
    Returns:
        Sum of label / rank over the ranked items, divided by the sum of labels.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    ranks = np.arange(len(ranked_labels)) + 1
    return np.sum(ranked_labels / ranks) / np.sum(ranked_labels)


def value2rank(d):
    """
    Map every key of `d` to the 1-based rank of its value, largest value first.

    Equal values share the same rank (the position of their first occurrence
    in the descending ordering).

    Args:
        d: dict whose values are mutually comparable
    Returns:
        dict with the same keys, in the same order, mapped to integer ranks
    """
    # Sort once up front; the original re-sorted the whole list per element.
    descending = sorted(d.values(), reverse=True)
    return {key: descending.index(value) + 1 for key, value in d.items()}


class NewsDataset(Dataset):
    """
    Evaluation-time dataset over the parsed news table.

    Reads the `id` column plus the attributes listed in
    `config.dataset_attributes['news']`, and keeps every row as a dict in which
    each value that is not a `str` has been converted to a tensor.
    """
    def __init__(self, news_path):
        super().__init__()
        news_attributes = config.dataset_attributes['news']
        # These columns are stored as Python literals in the TSV and need
        # parsing; only the ones actually requested get a converter.
        literal_columns = {
            'title', 'abstract', 'title_entities', 'abstract_entities'
        }
        self.news_parsed = pd.read_table(
            news_path,
            usecols=['id'] + news_attributes,
            converters={
                name: literal_eval
                for name in set(news_attributes) & literal_columns
            })
        self.news2dict = self.news_parsed.to_dict('index')
        for record in self.news2dict.values():
            for field, value in record.items():
                if type(value) is not str:
                    record[field] = torch.tensor(value)

    def __len__(self):
        return len(self.news_parsed)

    def __getitem__(self, idx):
        return self.news2dict[idx]


class UserDataset(Dataset):
    """
    Load users for evaluation; duplicated (user, clicked_news) rows are dropped.

    Args:
        behaviors_path: headerless TSV; column 1 is the user and column 3 the
            space-separated clicked-news history
        user2int_path: TSV with a header row whose two columns map a user to
            an integer index
    Each item is a dict with keys `user`, `clicked_news_string`,
    `clicked_news` and `clicked_news_length`.
    """
    def __init__(self, behaviors_path, user2int_path):
        super().__init__()
        behaviors = pd.read_table(behaviors_path,
                                  header=None,
                                  usecols=[1, 3],
                                  names=['user', 'clicked_news'])
        # Assign the filled column back explicitly: calling
        # fillna(inplace=True) on an attribute-selected column does not update
        # the frame under pandas Copy-on-Write, which would leave NaN
        # histories that break `.split()` in __getitem__.
        behaviors = behaviors.assign(
            clicked_news=behaviors['clicked_news'].fillna(' ')
        ).drop_duplicates()

        user2int = dict(pd.read_table(user2int_path).values.tolist())
        # Users missing from the mapping become NaN here and index 0 below.
        mapped_users = behaviors['user'].map(user2int)
        user_total = len(mapped_users)
        user_missed = int(mapped_users.isna().sum())
        # assumes user2int values are integer indices — TODO confirm
        self.behaviors = behaviors.assign(
            user=mapped_users.fillna(0).astype(int))

        # Guard against an empty behaviors file (division by zero).
        if model_name == 'LSTUR' and user_total > 0:
            print(f'User miss rate: {user_missed/user_total:.4f}')

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        row = self.behaviors.iloc[idx]
        # Keep at most num_clicked_news_a_user entries from the history.
        clicked_news = row.clicked_news.split(
        )[:config.num_clicked_news_a_user]
        repeated_times = config.num_clicked_news_a_user - len(clicked_news)
        assert repeated_times >= 0
        # Left-pad with the placeholder so every history has the same length.
        return {
            "user": row.user,
            "clicked_news_string": row.clicked_news,
            "clicked_news": ['PADDED_NEWS'] * repeated_times + clicked_news,
            'clicked_news_length': len(clicked_news),
        }


class BehaviorsDataset(Dataset):
    """
    Load behaviors for evaluation, (user, time) pair as session.

    Args:
        behaviors_path: headerless TSV whose first five columns are
            impression_id, user, time, clicked_news and impressions
    Each item is a dict with keys `impression_id`, `user`, `time`,
    `clicked_news_string` and `impressions` (the impressions field split on
    whitespace into a list).
    """
    def __init__(self, behaviors_path):
        super().__init__()
        behaviors = pd.read_table(behaviors_path,
                                  header=None,
                                  usecols=range(5),
                                  names=[
                                      'impression_id', 'user', 'time',
                                      'clicked_news', 'impressions'
                                  ])
        # Assign the transformed columns back explicitly: calling
        # fillna(inplace=True) on an attribute-selected column does not update
        # the frame under pandas Copy-on-Write.
        self.behaviors = behaviors.assign(
            clicked_news=behaviors['clicked_news'].fillna(' '),
            impressions=behaviors['impressions'].str.split())

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        row = self.behaviors.iloc[idx]
        return {
            "impression_id": row['impression_id'],
            "user": row['user'],
            "time": row['time'],
            "clicked_news_string": row['clicked_news'],
            "impressions": row['impressions']
        }


def calculate_single_user_metric(pair):
    """
    Compute the ranking metrics for one (y_true, y_score) pair.

    Args:
        pair: sequence that is unpacked positionally into each metric
            function as (y_true, y_score)
    Returns:
        [AUC, MRR, nDCG@5, nDCG@10], or four NaNs when any metric raises
        ValueError.
    """
    try:
        return [
            roc_auc_score(*pair),
            mrr_score(*pair),
            ndcg_score(*pair, 5),
            ndcg_score(*pair, 10),
        ]
    except ValueError:
        return [np.nan for _ in range(4)]


@torch.no_grad()
def evaluate(model, directory, num_workers, max_count=sys.maxsize, mode='test'):
    """
    Evaluate model on target directory.
    Args:
        model: model to be evaluated; must provide get_news_vector,
            get_user_vector and get_prediction
        directory: the directory that contains two files (behaviors.tsv, news_parsed.tsv)
        num_workers: processes number for calculating metrics
        max_count: scoring stops as soon as the 1-based behavior counter
            equals this value, so at most max_count - 1 behaviors are scored
        mode: 'train' to compute metrics from the labels embedded in the
            impressions, 'test' to return the raw predictions
    Returns:
        mode == 'train': tuple of NaN-ignoring means (AUC, MRR, nDCG@5, nDCG@10)
        mode == 'test': dict mapping the 0-based behavior position (as a
            string) to the list of predicted scores for its impressions
    Raises:
        ValueError: if mode is neither 'train' nor 'test', or if no news
            vector could be computed
    """
    # Fail fast instead of silently returning None after the whole run.
    if mode not in ('train', 'test'):
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

    news_dataset = NewsDataset(path.join(directory, 'news_parsed.tsv'))
    news_dataloader = DataLoader(news_dataset,
                                 batch_size=config.batch_size * 16,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 drop_last=False,
                                 pin_memory=True)

    # Stage 1: encode every news item once, keyed by its id.
    news2vector = {}
    for minibatch in tqdm(news_dataloader,
                          desc="Calculating vectors for news"):
        news_ids = minibatch["id"]
        if any(news_id not in news2vector for news_id in news_ids):
            news_vector = model.get_news_vector(minibatch)
            for news_id, vector in zip(news_ids, news_vector):
                if news_id not in news2vector:
                    news2vector[news_id] = vector

    if not news2vector:
        raise ValueError('No news vectors were computed; '
                         'news_parsed.tsv appears to be empty')

    # Placeholder vector used for left-padded click histories.
    news2vector['PADDED_NEWS'] = torch.zeros(
        next(iter(news2vector.values())).size())

    user_dataset = UserDataset(path.join(directory, 'behaviors.tsv'),
                               'data/train/user2int.tsv')
    user_dataloader = DataLoader(user_dataset,
                                 batch_size=config.batch_size * 16,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 drop_last=False,
                                 pin_memory=True)

    # Stage 2: encode every distinct click history, keyed by its raw string.
    # NOTE(review): for LSTUR the user vector also receives the user id, yet
    # the cache key is the click history only — confirm that users sharing a
    # history are meant to reuse the first computed vector.
    user2vector = {}
    for minibatch in tqdm(user_dataloader,
                          desc="Calculating vectors for users"):
        user_strings = minibatch["clicked_news_string"]
        if any(user_string not in user2vector for user_string in user_strings):
            clicked_news_vector = torch.stack([
                torch.stack([news2vector[x].to(device) for x in news_list],
                            dim=0) for news_list in minibatch["clicked_news"]
            ],
                                              dim=0).transpose(0, 1)
            if model_name == 'LSTUR':
                user_vector = model.get_user_vector(
                    minibatch['user'], minibatch['clicked_news_length'],
                    clicked_news_vector)
            else:
                user_vector = model.get_user_vector(clicked_news_vector)
            for user, vector in zip(user_strings, user_vector):
                if user not in user2vector:
                    user2vector[user] = vector

    behaviors_dataset = BehaviorsDataset(path.join(directory, 'behaviors.tsv'))
    behaviors_dataloader = DataLoader(behaviors_dataset,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=config.num_workers)

    count = 0

    tasks = []
    result_dict = {}

    # Stage 3: score the candidates of every behavior (batch size is 1, so
    # each collated field holds a single element at index 0).
    for minibatch in tqdm(behaviors_dataloader,
                          desc="Calculating probabilities"):
        count += 1
        if count == max_count:
            break

        # Impressions look like "<news_id>-<label>"; the part before the
        # first '-' is the news id.
        candidate_news_vector = torch.stack([
            news2vector[news[0].split('-')[0]]
            for news in minibatch['impressions']
        ],
                                            dim=0)
        user_vector = user2vector[minibatch['clicked_news_string'][0]]
        click_probability = model.get_prediction(candidate_news_vector,
                                                 user_vector)

        y_pred = click_probability.tolist()
        if mode == 'train':
            y_true = [
                int(news[0].split('-')[1]) for news in minibatch['impressions']
            ]
            tasks.append((y_true, y_pred))
        else:
            result_dict[f'{count-1}'] = y_pred

    if mode == 'train':
        with Pool(processes=num_workers) as pool:
            results = pool.map(calculate_single_user_metric, tasks)

        aucs, mrrs, ndcg5s, ndcg10s = np.array(results).T
        return np.nanmean(aucs), np.nanmean(mrrs), np.nanmean(ndcg5s), np.nanmean(
            ndcg10s)
    return result_dict


In [23]:
print('Using device:', device)
print(f'Evaluating model {model_name}')
# Don't need to load pretrained word/entity/context embedding
# since it will be loaded from checkpoint later
model = Model(config).to(device)
from train import latest_checkpoint  # Avoid circular imports
checkpoint_path = latest_checkpoint(path.join('./checkpoint', model_name))
if checkpoint_path is None:
    print('No checkpoint file found!')
    exit()
print(f"Load saved parameters in {checkpoint_path}")
# map_location places the stored tensors on the evaluation device, so a
# checkpoint saved on a different GPU index (or on a GPU-only host) still loads.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Validation split: labels are available, so report the ranking metrics.
auc, mrr, ndcg5, ndcg10 = evaluate(model, './data/val',
                                   config.num_workers, mode='train')
print(
    f'AUC: {auc:.4f}\nMRR: {mrr:.4f}\nnDCG@5: {ndcg5:.4f}\nnDCG@10: {ndcg10:.4f}'
)

# Test split: keep the raw per-impression predictions for the submission file.
y_preds = evaluate(model, './data/test', config.num_workers, mode='test')

Using device: cuda:3
Evaluating model NRMS
Load saved parameters in ./checkpoint/NRMS/ckpt-7000.pth


Calculating vectors for news:   0%|          | 0/50 [00:00<?, ?it/s]

Calculating vectors for users:   0%|          | 0/105 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/285297 [00:00<?, ?it/s]

AUC: 0.7214
MRR: 0.3908
nDCG@5: 0.4670
nDCG@10: 0.5635


Calculating vectors for news:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating vectors for users:   0%|          | 0/22 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/46332 [00:00<?, ?it/s]

In [24]:
# y_preds maps each behavior position to its list of scores; transposing
# gives one row per behavior and one column per candidate.
results_to_submit = pd.DataFrame(y_preds).T
# Write to RESULT_CSV (not a repeated literal) so the file written here is
# always the one read back below and submitted later.
# The header names 15 prediction columns (p1..p15); to_csv rejects the frame
# if its column count differs.
results_to_submit.to_csv(
    RESULT_CSV,
    header=[f"p{position}" for position in range(1, 16)],
    index_label='index',
)

# Read the file back to eyeball what will be submitted.
pd.read_csv(RESULT_CSV)

Unnamed: 0,index,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15
0,0,1.183794,-0.797354,-0.536473,2.165243,-1.153278,0.298964,0.202679,0.011402,-0.706279,-0.686258,-0.701948,-0.136578,-0.735354,0.078419,0.946975
1,1,0.206575,-0.225023,-0.013934,-0.543442,0.342222,-0.418650,0.058411,-0.142107,0.327512,0.308264,-0.313714,0.035650,-0.097447,-0.326808,-0.135655
2,2,0.073174,0.357276,0.168952,-0.342357,-0.113186,-0.101083,-0.079423,-0.317790,0.096257,0.160641,0.187640,-0.139982,0.153774,-0.267926,-0.383316
3,3,0.258743,-0.202310,-0.040910,0.029026,-0.165550,0.129389,-0.019053,-0.112855,-0.392836,-0.017779,-0.301390,0.012729,0.085155,-0.301782,0.120205
4,4,-1.103027,0.468785,-0.191371,0.376598,-0.245778,-0.015998,-0.164391,0.301599,-0.297742,-1.032542,-1.205259,-0.276209,0.342670,0.125423,-0.317054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46327,46327,-0.458541,-0.149874,0.318461,0.045217,-0.128579,0.131929,-0.297343,-0.279712,-0.204995,-0.005381,-0.285336,0.289681,0.048688,-0.187876,-0.360425
46328,46328,-0.224928,-0.050704,-0.032955,0.035137,-0.103648,0.018052,-0.053892,-0.038986,-0.062810,0.036746,-0.083936,0.264135,-0.352102,-0.076280,0.016065
46329,46329,-0.306582,-0.486910,1.141995,0.058732,-0.785894,0.585979,-0.485143,0.024705,1.272189,-0.688220,0.417800,-0.340864,-0.777608,0.144943,-0.004685
46330,46330,0.576961,-0.886301,-0.066326,0.387231,-0.802487,-0.799435,-0.278686,-0.222543,-0.072449,-0.762129,-1.171261,-0.125202,-0.942808,-0.049369,-0.323195


### 自定義需要的log

In [25]:
# Copy the hyperparameters defined on BaseConfig into notebook-level
# variables so later cells can refer to them by their short names.
from config import BaseConfig

num_epochs = BaseConfig.num_epochs
# Number of batches to show loss
num_batches_show_loss = BaseConfig.num_batches_show_loss
# Number of batches to check metrics on validation dataset
num_batches_validate = BaseConfig.num_batches_validate
batch_size = BaseConfig.batch_size
learning_rate = BaseConfig.learning_rate
# Number of workers for data loading
num_workers = BaseConfig.num_workers
# Number of sampled click history for each user
num_clicked_news_a_user = BaseConfig.num_clicked_news_a_user
num_words_title = BaseConfig.num_words_title
num_words_abstract = BaseConfig.num_words_abstract
word_freq_threshold = BaseConfig.word_freq_threshold
entity_freq_threshold = BaseConfig.entity_freq_threshold
entity_confidence_threshold = BaseConfig.entity_confidence_threshold
# K (presumably the number of negative samples drawn per positive sample —
# confirm against the config module)
negative_sampling_ratio = BaseConfig.negative_sampling_ratio
dropout_probability = BaseConfig.dropout_probability
# Modify the following by the output of `src/dataprocess.py`
num_words = BaseConfig.num_words
num_categories = BaseConfig.num_categories
num_entities = BaseConfig.num_entities
num_users = BaseConfig.num_users
word_embedding_dim = BaseConfig.word_embedding_dim
category_embedding_dim = BaseConfig.category_embedding_dim
# Modify the following only if you use another dataset
entity_embedding_dim = BaseConfig.entity_embedding_dim
# For additive attention
query_vector_dim = BaseConfig.query_vector_dim

In [45]:
# Free-form note appended to the submission message; empty when there is
# nothing extra to record.
EXTRA_MSG: str = ''

# The validation metrics become the human-readable submission message.
submission_message = (
    f'AUC: {auc:.4f}, MRR: {mrr:.4f}, nDCG@5: {ndcg5:.4f}, nDCG@10: {ndcg10:.4f}'
    f'  *EXTRA: [{EXTRA_MSG}].'
)
# Full Kaggle CLI command; the message is wrapped in double quotes for the shell.
log = (
    'kaggle competitions submit -c 2023-datamining-final-project '
    f'-f {RESULT_CSV} -m "{submission_message}"'
)
print(log)

kaggle competitions submit -c 2023-datamining-final-project -f results.csv -m "AUC: 0.7214, MRR: 0.3908, nDCG@5: 0.4670, nDCG@10: 0.5635  *EXTRA: []."


### Submit to Kaggle

In [46]:
# Safety guard: uncomment the raise below to block an accidental submission.
import os
# raise KeyError('Are you sure you want to submit the result?')
# Keep the status in a named variable: assigning to `_` would shadow
# IPython's last-output variable, and discarding the status hides failures.
submit_status = os.system(log)
if submit_status != 0:
    print(f'Submission command exited with status {submit_status}')

100%|██████████| 13.6M/13.6M [00:03<00:00, 3.66MB/s]


Successfully submitted to 2023 Data Mining Final Project