In [None]:
import argparse
import json
import os

import pandas as pd
from sklearn.metrics import ndcg_score

In [None]:
# Argument parser mirroring a command-line interface: both options are
# mandatory strings (the file to evaluate and the directory to write into).
parser = argparse.ArgumentParser()
for _option in ("--input_file", "--output_dir"):
    parser.add_argument(_option, required=True, type=str)

In [None]:
# Experiment / run whose predictions are being evaluated.
EXPERIMENT_NAME = "predict_helpful_votes"
RUN_NAME = "cl-tohoku_bert-base-japanese_lr1e-5"

# Sub-path shared by the prediction input and the evaluation output.
run_subdir = EXPERIMENT_NAME + "/" + RUN_NAME + "/"

# Arguments are supplied in-notebook instead of on a real command line.
# Swap "training-val.jsonl" for "leader_board.jsonl" or "final_result.jsonl"
# to process the other prediction files of the same run.
args_list = [
    "--input_file",
    "../data/predict/" + run_subdir + "training-val.jsonl",
    "--output_dir",
    "../data/evaluation/" + run_subdir,
]
args = parser.parse_args(args_list)

In [None]:
# Make sure the output directory (including missing parents) exists.
# exist_ok=True keeps the cell safe to re-run and removes the gap between a
# separate isdir() check and the directory creation.
os.makedirs(args.output_dir, exist_ok=True)

In [None]:
# Load the input as JSON Lines: each line of the file is one record.
df = pd.read_json(args.input_file, lines=True, orient="records")

In [None]:
def convert_to_submit_format(df, score_column, mode="pred"):
    """Group per-review scores by product into the nested submission layout.

    Args:
        df: DataFrame with ``product_idx`` and ``review_idx`` columns plus
            the column named by ``score_column``.
        score_column: Name of the column whose values are exported as scores.
        mode: Prefix for the generated keys: each review entry gets a
            ``<mode>_score`` key and the per-product column is ``<mode>_list``.

    Returns:
        DataFrame with one row per distinct ``product_idx`` (ascending order)
        and the columns ``product_idx`` and ``<mode>_list``. Each list holds
        ``{"review_idx": ..., "<mode>_score": ...}`` dicts in the row order of
        ``df``. An empty ``df`` yields an empty frame that still has both
        columns, so it can be merged on ``product_idx``. Rows whose
        ``product_idx`` is missing (NaN) are not included.
    """
    score_key = mode + "_score"
    list_key = mode + "_list"

    output_list = []
    # One groupby pass instead of re-filtering the whole frame per product;
    # sort=True keeps products in ascending order and groupby preserves the
    # original row order inside each group.
    for product_idx, df_product in df.groupby("product_idx", sort=True):
        scores = [
            {"review_idx": i, score_key: s}
            for i, s in zip(df_product["review_idx"], df_product[score_column])
        ]
        output_list.append({"product_idx": product_idx, list_key: scores})

    # Fixing the columns guarantees a stable schema even with no rows.
    return pd.DataFrame(output_list, columns=["product_idx", list_key])

In [None]:
# Convert the predicted scores to the nested per-product layout.
df_pred = convert_to_submit_format(df, "pred_helpful_votes", "pred")

# Build the path with os.path so it is correct whether or not output_dir ends
# with a separator; the file keeps the input's name with a "submit_" prefix.
output_pred_file = os.path.join(
    args.output_dir, "submit_" + os.path.basename(args.input_file)
)

# Write one JSON object per product per line, keeping non-ASCII text as-is.
df_pred.to_json(output_pred_file, orient="records", force_ascii=False, lines=True)

In [None]:
# Evaluation runs only when the input contains the ground-truth column.
if "helpful_votes" in df.columns:
    # Ranking cutoff; used for both the metric call and the reported key so
    # the two cannot drift apart.
    NDCG_K = 5

    # Put the ground truth in the same nested layout as the predictions and
    # pair both lists per product.
    df_true = convert_to_submit_format(df, "helpful_votes", "true")
    df_merge = pd.merge(df_pred, df_true, on="product_idx")

    sum_ndcg = 0
    for df_dict in df_merge.to_dict("records"):
        # Align predicted and true scores of one product by review_idx.
        df_eval = pd.merge(
            pd.DataFrame(df_dict["pred_list"]),
            pd.DataFrame(df_dict["true_list"]),
            on="review_idx",
        )
        # ndcg_score expects 2-D input; each product is passed as one query.
        # NOTE(review): newer scikit-learn releases appear to reject a query
        # with a single document - confirm every product has >1 review.
        ndcg = ndcg_score(
            [df_eval["true_score"]], [df_eval["pred_score"]], k=NDCG_K
        )
        sum_ndcg += ndcg

    # Build the path with os.path so it is correct whether or not output_dir
    # ends with a separator; ".jsonl" in the input name becomes ".json".
    output_eval_file = os.path.join(
        args.output_dir,
        "eval_" + os.path.basename(args.input_file).replace(".jsonl", ".json"),
    )
    # Explicit UTF-8 because ensure_ascii=False may emit non-ASCII characters.
    with open(output_eval_file, "w", encoding="utf-8") as f:
        # Report the mean NDCG over all products.
        json.dump(
            {"ndcg@" + str(NDCG_K): sum_ndcg / len(df_merge)},
            f,
            indent=4,
            ensure_ascii=False,
        )