In [None]:
import argparse
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl

from _my_lightning_modules import ReviewDataModule, ReviewRegressionNet

In [None]:
# Names of the experiment and run; both are used as path components for the
# prediction output directory and the checkpoint file below.
EXPERIMENT_NAME = "predict_helpful_votes"
RUN_NAME = "cl-tohoku_bert-base-japanese_lr1e-5"

# Script-style options, parsed from an explicit list (args_list) rather than
# from sys.argv so the notebook can be run without a command line.
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument("--ckpt_file", type=str)
parser.add_argument("--model_name", type=str, default="cl-tohoku/bert-base-japanese")
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--gpu", type=int, default=0)

# Alternative values for --input_file that were kept as commented-out
# argument lists (same --output_dir and --ckpt_file):
#   ../data/dataset_shared_initial/leader_board.jsonl
#   ../data/dataset_shared/final_result.jsonl
args_list = [
    "--input_file", "../data/preprocessing_shared/training-val.jsonl",
    "--output_dir", f"../data/predict/{EXPERIMENT_NAME}/{RUN_NAME}/",
    "--ckpt_file", f"../data/train/model/{EXPERIMENT_NAME}/{RUN_NAME}.ckpt",
]
args = parser.parse_args(args_list)

In [None]:
# Create the output directory (including parents) if it is missing.
# exist_ok=True replaces the separate isdir() check, so there is no window
# in which another process can create the directory between check and create.
os.makedirs(args.output_dir, exist_ok=True)

In [None]:
# The data module and the network are both configured from the parsed args.
dm = ReviewDataModule(args)
net = ReviewRegressionNet(args)

# The --gpu value is passed as a one-element `gpus` list; no logger is attached.
trainer_options = {"gpus": [args.gpu], "logger": False}
trainer = pl.Trainer(**trainer_options)

In [None]:
# Only forward ckpt_path when a checkpoint file was supplied; otherwise the
# call is made without that keyword at all, exactly as when --ckpt_file is
# omitted.
predict_options = {} if args.ckpt_file is None else {"ckpt_path": args.ckpt_file}
pred = trainer.predict(net, dm, **predict_options)

In [None]:
# Reload the input records so that predictions can be attached row by row.
df = pd.read_json(args.input_file, orient="records", lines=True)

# Flatten every batch of predictions and join them in a single pass
# (np.concatenate is linear, unlike repeated list addition via sum(..., [])).
# assumes each element of `pred` is a CPU tensor exposing .numpy() — TODO confirm
batch_preds = [p.numpy().ravel() for p in pred]
if batch_preds:
    # Cast to float64 so the column dtype and the precision of the inverse
    # transform below do not depend on the tensor dtype.
    flat_pred = np.concatenate(batch_preds).astype(np.float64)
else:
    # np.concatenate rejects an empty sequence; an empty input file is valid.
    flat_pred = np.empty(0, dtype=np.float64)

# Fail with an explicit message if predictions and input rows do not line up.
if len(flat_pred) != len(df):
    raise ValueError(
        f"Got {len(flat_pred)} predictions for {len(df)} input rows "
        f"in {args.input_file!r}"
    )

df["pred"] = flat_pred
# Invert with exp(x) - 1; np.expm1 is vectorized and more accurate near zero.
# NOTE(review): presumably the model was trained on log(1 + helpful_votes) — confirm.
df["pred_helpful_votes"] = np.expm1(df["pred"])

# Write next to the other predictions under the input file's own name.
# os.path.join works whether or not --output_dir ends with a path separator.
output_file = os.path.join(args.output_dir, os.path.basename(args.input_file))
df.to_json(output_file, orient="records", force_ascii=False, lines=True)