In [1]:
import pandas as pd
import lightgbm
import numpy as np
import math
import os

from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
    precision_recall_curve,
    log_loss,
)
from sklearn.model_selection import StratifiedKFold, GroupKFold
from typing import List, Dict

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import LGBMRanker

from tqdm import tqdm

import json
from bs4 import BeautifulSoup
import optuna

from catboost import CatBoostClassifier, Pool, CatBoostRanker
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

  from optuna import progress_bar as pbar_module


In [2]:
# Directory that receives the generated feature CSVs (used by the cell that
# calls `to_csv` at the end of this notebook).
output_dir = "../output/feat_without_emb"

In [3]:
class Config:
    """Static notebook settings: the random seed and every input CSV path.

    Paths located under ``base_dir`` are formed by appending ``"/<file name>"``
    to it. Because ``base_dir`` itself ends with a slash, those paths contain a
    doubled ``/`` (e.g. ``../data//df_train_dblp.csv``).
    """

    seed = 42
    base_dir = "../data/"

    # dblp
    train_dblp_path = base_dir + "/df_train_dblp.csv"
    test_pub_dblp_path = base_dir + "/df_test_pub_dblp.csv"

    # context
    train_context_path = base_dir + "/train_context.csv"
    train_context_filled_path = base_dir + "/train_context_filled_citation.csv"
    test_pub_context_path = base_dir + "/test_pub_gen_context_filled_citation.csv"

    # scibert encode
    train_abstract_title_vector_path = base_dir + "/train_title_abstract_vector.csv"
    test_pub_abstract_title_vector_path = base_dir + "/test_pub_title_abstract_vector.csv"

    train_abstract_vector_path = base_dir + "/train_abstract_vector.csv"
    test_pub_abstract_vector_path = base_dir + "/test_pub_abstract_vector.csv"

    train_title_vector_path = base_dir + "/train_title_vector.csv"
    test_pub_title_vector_path = base_dir + "/test_pub_title_vector.csv"

    train_ref_title_ref_abstract_vector_path = base_dir + "/train_ref_title_ref_abstract_vector.csv"
    test_pub_ref_title_ref_abstract_vector_path = base_dir + "/test_pub_ref_title_ref_abstract_vector.csv"

    train_ref_title_vector_path = base_dir + "/train_ref_title_vector.csv"
    test_pub_ref_title_vector_path = base_dir + "/test_pub_ref_title_vector.csv"

    train_ref_abstract_vector_path = base_dir + "/train_ref_abstract_vector.csv"
    test_pub_ref_abstract_vector_path = base_dir + "/test_pub_ref_abstract_vector.csv"

    train_context_vector_path = base_dir + "/train_context_vector.csv"
    test_pub_context_vector_path = base_dir + "/test_pub_context_vector.csv"

    train_scibert_sim_vector_path = base_dir + "/train_scibert_sim.csv"
    test_pub_scibert_sim_vector_path = base_dir + "/test_pub_scibert_sim.csv"

    train_keyword_vector_path = base_dir + "/train_keywords_vector.csv"
    test_pub_keyword_vector_path = base_dir + "/test_pub_keywords_vector.csv"

    train_ref_keyword_vector_path = base_dir + "/train_ref_keywords_vector.csv"
    test_pub_ref_keyword_vector_path = base_dir + "/test_pub_ref_keywords_vector.csv"

    train_org_vector_path = base_dir + "/train_org_vector.csv"
    test_pub_org_vector_path = base_dir + "/test_pub_org_vector.csv"

    train_ref_org_vector_path = base_dir + "/train_ref_org_vector.csv"
    test_pub_ref_org_vector_path = base_dir + "/test_pub_ref_org_vector.csv"

    train_venue_vector_path = base_dir + "/train_venue_vector.csv"
    test_pub_venue_vector_path = base_dir + "/test_pub_venue_vector.csv"

    train_ref_venue_vector_path = base_dir + "/train_ref_venue_vector.csv"
    test_pub_ref_venue_vector_path = base_dir + "/test_pub_ref_venue_vector.csv"

    train_title_abst_context_kw_vector_path = (
        base_dir + "/train_title_abstract_context_keyword_vector.csv"
    )
    test_pub_title_abst_context_kw_vector_path = (
        base_dir + "/test_pub_title_abstract_context_keyword_vector.csv"
    )

    train_ref_title_ref_abstract_ref_kw_vector_path = (
        base_dir + "/train_ref_title_ref_abstract_ref_keyword_vector.csv"
    )
    test_pub_ref_title_ref_abstract_ref_kw_vector_path = (
        base_dir + "/test_pub_ref_title_ref_abstract_ref_keyword_vector.csv"
    )

    # scibert score (these live outside base_dir)
    train_ce_score_path = "../ce/default/oof_pred_scibert.csv"
    test_pub_ce_score_path = "../ce/default/df_test_pub_score_all_scibert.csv"

    # Disabled inputs, kept for reference:
    # train_context_filled_citation_path = "../script/data_citation/train_context_filled_citation.csv"
    # test_context_filled_citation_path = "../script/data_citation/test_context_filled_citation.csv"

    # oag (hard-coded under ../data rather than derived from base_dir)
    train_oag_bert_sim = "../data/train_oagbert_title_sim.csv"
    test_pub_oag_bert_sim = "../data/test_pub_oagbert_title_sim.csv"

    # Disabled inputs, kept for reference:
    # train_oag_bert_vector = "../data/train_oagbert_title_vector.csv"
    # test_oag_bert_vector = "../data/test_pub_oagbert_title_vector.csv"
    # train_oag_bert_ref_vector = "../data/train_oagbert_ref_vector.csv"
    # test_oag_bert_ref_vector = "../data/test_pub_oagbert_ref_vector.csv"



In [4]:
from os.path import join


def load_json(rfdir, rfname):
    """Read a UTF-8 JSON file and return the decoded object.

    This replaces two back-to-back definitions of the same function (the
    second silently shadowed the first). The progress messages of the surviving
    definition are kept, but the file name is now actually interpolated:
    ``print("loading %s ...", rfname)`` printed a literal ``%s`` followed by the
    name, because ``print`` does not apply %-formatting.

    Args:
        rfdir: Directory containing the file.
        rfname: File name inside ``rfdir``.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    print(f"loading {rfname} ...")
    with open(join(rfdir, rfname), "r", encoding="utf-8") as rf:
        data = json.load(rf)
    print(f"{rfname} loaded")
    return data

In [5]:
# Load every input CSV named in Config into its own DataFrame. Train and test
# ("test_pub") frames are read in pairs.

# --- DBLP tables ---
df_train_dblp = pd.read_csv(Config.train_dblp_path)
df_test_pub_dblp = pd.read_csv(Config.test_pub_dblp_path)

# --- Context tables ---
# The train context is read from two files; the citation-count columns of the
# "filled" file are appended column-wise.
# NOTE(review): pd.concat(axis=1) aligns on the row index, so this assumes both
# CSVs have the same number of rows in the same order — TODO confirm.
df_train_context = pd.read_csv(Config.train_context_path)
df_train_context_fill = pd.read_csv(Config.train_context_filled_path)
df_train_context = pd.concat([df_train_context, df_train_context_fill[["n_citation", "ref_n_citation"]]], axis=1)

# The test context is read directly from its "filled citation" file.
df_test_pub_context = pd.read_csv(Config.test_pub_context_path)

# --- Per-field vector tables (listed under "scibert encode" in Config) ---
df_train_abstract_title_vector = pd.read_csv(Config.train_abstract_title_vector_path)
df_test_pub_abstract_title_vector = pd.read_csv(Config.test_pub_abstract_title_vector_path)

df_train_abstract_vector = pd.read_csv(Config.train_abstract_vector_path)
df_test_pub_abstract_vector = pd.read_csv(Config.test_pub_abstract_vector_path)

df_train_title_vector = pd.read_csv(Config.train_title_vector_path)
df_test_pub_title_vector = pd.read_csv(Config.test_pub_title_vector_path)

df_train_ref_title_ref_abstract_vector = pd.read_csv(
    Config.train_ref_title_ref_abstract_vector_path
)
df_test_pub_ref_title_ref_abstract_vector = pd.read_csv(
    Config.test_pub_ref_title_ref_abstract_vector_path
)

df_train_ref_title_vector = pd.read_csv(Config.train_ref_title_vector_path)
df_test_pub_ref_title_vector = pd.read_csv(Config.test_pub_ref_title_vector_path)

df_train_ref_abstract_vector = pd.read_csv(Config.train_ref_abstract_vector_path)
df_test_pub_ref_abstract_vector = pd.read_csv(Config.test_pub_ref_abstract_vector_path)

df_train_context_vector = pd.read_csv(Config.train_context_vector_path)
df_test_pub_context_vector = pd.read_csv(Config.test_pub_context_vector_path)


# --- Similarity table (one of the frames that is actually concatenated into
# the feature table further down) ---
df_train_scibert_sim_vector = pd.read_csv(Config.train_scibert_sim_vector_path)
df_test_pub_scibert_sim_vector = pd.read_csv(Config.test_pub_scibert_sim_vector_path)


# --- Keyword vector tables ---
df_train_scibert_keyword_vector = pd.read_csv(Config.train_keyword_vector_path)
df_test_pub_scibert_keyword_vector = pd.read_csv(Config.test_pub_keyword_vector_path)

df_train_scibert_ref_keyword_vector = pd.read_csv(Config.train_ref_keyword_vector_path)
df_test_pub_scibert_ref_keyword_vector = pd.read_csv(Config.test_pub_ref_keyword_vector_path)


# --- Organisation and venue vector tables ---
df_train_org_vector = pd.read_csv(Config.train_org_vector_path)
df_test_pub_org_vector = pd.read_csv(Config.test_pub_org_vector_path)

df_train_ref_org_vector = pd.read_csv(Config.train_ref_org_vector_path)
df_test_pub_ref_org_vector = pd.read_csv(Config.test_pub_ref_org_vector_path)

df_train_venue_vector = pd.read_csv(Config.train_venue_vector_path)
df_test_pub_venue_vector = pd.read_csv(Config.test_pub_venue_vector_path)

df_train_ref_venue_vector = pd.read_csv(Config.train_ref_venue_vector_path)
df_test_pub_ref_venue_vector = pd.read_csv(Config.test_pub_ref_venue_vector_path)

# --- Combined-field vector tables ---
df_train_title_abst_context_kw_vector = pd.read_csv(
    Config.train_title_abst_context_kw_vector_path
)
df_test_pub_title_abst_context_kw_vector = pd.read_csv(
    Config.test_pub_title_abst_context_kw_vector_path
)

df_train_ref_title_ref_abstract_ref_kw_vector = pd.read_csv(
    Config.train_ref_title_ref_abstract_ref_kw_vector_path
)
df_test_pub_ref_title_ref_abstract_ref_kw_vector = pd.read_csv(
    Config.test_pub_ref_title_ref_abstract_ref_kw_vector_path
)

# --- Score tables ---
# Both frames end up with a "scibert_ce_score" column: the train file's "pred"
# column is renamed, and the test file's "score_mean" column is renamed after
# restricting the read to pid / bid / score_mean.
df_train_ce_score = pd.read_csv(Config.train_ce_score_path).rename(columns={"pred": "scibert_ce_score"})
df_test_pub_ce_score = pd.read_csv(Config.test_pub_ce_score_path, usecols=["pid", "bid", "score_mean"]).rename(
    columns={"score_mean": "scibert_ce_score"}
)


# --- OAG-BERT similarity tables ---
df_train_oagbert_sim = pd.read_csv(Config.train_oag_bert_sim)
df_test_pub_oagbert_sim = pd.read_csv(Config.test_pub_oag_bert_sim)

# Disabled loads below; the Config attributes they reference are commented out too.
# df_train_oagbert_ref_vector = pd.read_csv(Config.train_oag_bert_ref_vector)
# df_test_oagbert_ref_vector = pd.read_csv(Config.test_oag_bert_ref_vector)

# df_train_oagbert_vector = pd.read_csv(Config.train_oag_bert_vector)
# df_test_oagbert_vector = pd.read_csv(Config.test_oag_bert_vector)


# df_train_citation = pd.read_csv(
#     Config.train_context_filled_citation_path, usecols=["ref_n_citation"]
# )
# df_test_citation = pd.read_csv(
#     Config.test_context_filled_citation_path, usecols=["ref_n_citation"]
# )


In [6]:
# Sanity check: print the (rows, cols) shape of every loaded train frame on one
# line, then the same for the test frames, so row-count mismatches stand out.
print(
    *[
        frame.shape
        for frame in (
            df_train_context,
            df_train_abstract_title_vector,
            df_train_abstract_vector,
            df_train_title_vector,
            df_train_ref_title_ref_abstract_vector,
            df_train_ref_title_vector,
            df_train_ref_abstract_vector,
            df_train_context_vector,
            df_train_scibert_sim_vector,
            df_train_scibert_keyword_vector,
            df_train_scibert_ref_keyword_vector,
            df_train_org_vector,
            df_train_ref_org_vector,
            df_train_venue_vector,
            df_train_ref_venue_vector,
            df_train_title_abst_context_kw_vector,
            df_train_ref_title_ref_abstract_ref_kw_vector,
            df_train_dblp,
        )
    ]
)

print(
    *[
        frame.shape
        for frame in (
            df_test_pub_context,
            df_test_pub_abstract_title_vector,
            df_test_pub_abstract_vector,
            df_test_pub_title_vector,
            df_test_pub_ref_title_ref_abstract_vector,
            df_test_pub_ref_title_vector,
            df_test_pub_ref_abstract_vector,
            df_test_pub_context_vector,
            df_test_pub_scibert_sim_vector,
            df_test_pub_scibert_keyword_vector,
            df_test_pub_scibert_ref_keyword_vector,
            df_test_pub_org_vector,
            df_test_pub_ref_org_vector,
            df_test_pub_venue_vector,
            df_test_pub_ref_venue_vector,
            df_test_pub_title_abst_context_kw_vector,
            df_test_pub_ref_title_ref_abstract_ref_kw_vector,
            df_test_pub_dblp,
        )
    ]
)

(30893, 34) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 434) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (30893, 22) (24076, 15)
(21003, 33) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 434) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (21003, 22) (15703, 14)


In [7]:
def normalize_by_pid_feature(df):

    dfs = []
    for col in df.columns:
        if (
            ("count" in col)
            or ("ce_score" in col)
            or ("num" in col)
            or ("oag_bert_cos_sim" in col)
            or ("year_diff" in col)
            or ("n_citation" in col)
        ) and ("norm" not in col):

            normilized_by_group = (
                df.groupby("pid")[col]
                .transform(lambda x: x / x.sum() if x.sum() != 0 else x)
                .tolist()
            )
            sub_mean_by_group = (
                df.groupby("pid")[col]
                .transform(lambda x: x - x.mean() if x.mean() != 0 else x)
                .tolist()
            )

            sub_max_by_group = (
                df.groupby("pid")[col]
                .transform(lambda x: x - x.max() if x.max() != 0 else x)
                .tolist()
            )

            sub_min_by_group = (
                df.groupby("pid")[col]
                .transform(lambda x: x - x.min() if x.min() != 0 else x)
                .tolist()
            )

            _df = pd.DataFrame(
                {
                    f"{col}_normalized_by_group": normilized_by_group,
                    f"{col}_sub_mean_by_group": sub_mean_by_group,
                    f"{col}_sub_max_by_group": sub_max_by_group,
                    f"{col}_sub_min_by_group": sub_min_by_group,
                }
            )

            dfs.append(_df)

    df = pd.concat([df] + dfs, axis=1)
    return df


def apply_log(df):
    """Append ``log1p`` versions of the per-group normalised count/score columns.

    Every column whose name contains ``count_normalized_by_group`` or
    ``ce_score_normalized_by_group`` gets a companion column ``<col>_log``
    holding ``np.log1p`` of its values.

    The previous implementation computed the same log twice and stored both
    results under the same dict key (``f"{col}_log"``), so only one column was
    ever produced; the redundant computation is removed and the element-wise
    ``apply`` is replaced by the vectorised ufunc. The output is unchanged.

    Note: a ``<col>_log`` column still contains the matching substring, so
    calling this function on its own output adds ``<col>_log_log`` columns.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new frame: ``df`` followed by the ``*_log`` columns.
    """
    targets = ("count_normalized_by_group", "ce_score_normalized_by_group")

    log_frames = [
        np.log1p(df[col]).to_frame(f"{col}_log")
        for col in df.columns
        if any(target in col for target in targets)
    ]

    return pd.concat([df] + log_frames, axis=1)

In [11]:
def fill_missing_group_mean(df):
    """Impute missing values with the per-``pid`` mean, then with 0.

    For every non-object column that has at least one null and is not an
    identifier column, nulls are replaced by the mean of that column within
    the same ``pid``; anything still null afterwards (e.g. a group with no
    observed value) becomes 0.

    The original exclusion list named ``"ref_id"`` twice and never excluded
    ``"ref_pid"``, which is the reference-id column used elsewhere in this
    notebook. ``"ref_pid"`` is now skipped as well (``"ref_id"`` is kept for
    backward compatibility) so an identifier is never mean-imputed.

    Args:
        df: Frame with a ``pid`` column. NOTE: it is modified in place.

    Returns:
        The same ``df`` object, after imputation.
    """
    id_cols = {"pid", "bid", "ref_id", "ref_pid"}

    for col in df.columns:
        if col in id_cols or df[col].dtype == "object":
            continue
        if not df[col].isnull().any():
            continue

        df[col] = df.groupby("pid")[col].transform(lambda x: x.fillna(x.mean()))
        # Groups whose values are all missing have a NaN mean; fall back to 0.
        df[col] = df[col].fillna(0)

    return df

In [12]:
def merge_dblp(df, df_dblp):
    """Left-join the DBLP columns onto ``df`` using ``pid`` and ``ref_pid``.

    A ``label`` column in ``df_dblp`` is removed before joining (and its
    absence is tolerated). Every row of ``df`` is preserved.
    """
    dblp_without_label = df_dblp.drop("label", axis=1, errors="ignore")
    merged = df.merge(dblp_without_label, on=["pid", "ref_pid"], how="left")
    return merged


def merge_feature_title_context_ref_title_abstract(
    df_context,
    df_feature_title_vector,
    df_feature_ref_title_vector,
    df_feature_context_vector,
    df_feature_keyword_vector,
    df_feature_ref_keyword_vector,
    df_feature_abstract_vector,
    df_feature_ref_abstract_vector,
    df_feature_org_vector,
    df_feature_ref_org_vector,
    df_feature_venue_vector,
    df_feature_ref_venue_vector,
    df_feature_title_abst_context_kw_vector,
    df_feature_ref_title_ref_abstract_vector,
    df_feature_scibert_sim,
    df_feature_oagbert_sim,
    df_feature_ce_score,
    df_feature_dblp,
    mode="train",
):
    """Build the feature table by placing the selected frames side by side.

    Only four inputs are currently used: the base columns of ``df_context``
    followed by ``df_feature_scibert_sim``, ``df_feature_ce_score`` and
    ``df_feature_oagbert_sim`` (each without its ``pid`` / ``bid`` columns).
    All other frame parameters, including ``df_feature_dblp``, are accepted
    for call compatibility but are not read; their concatenation / merge
    steps are switched off in this version.

    The frames are combined with ``pd.concat(axis=1)``, i.e. aligned on the
    row index. A ``sim_multiple`` column is then added holding the row-wise
    product of every column whose name contains ``cos_sim``.

    Args:
        df_context: Context frame providing ``pid``, ``bid``, ``ref_pid``,
            ``ref_n_citation`` and, in train mode, ``label``.
        mode: ``"train"`` keeps ``label`` among the base columns;
            ``"test"`` omits it. Any other value fails the assertion.

    Returns:
        The assembled feature DataFrame.
    """
    assert mode in ["train", "test"]

    id_cols = ["pid", "bid"]

    base_col = ["pid", "bid", "ref_pid", "ref_n_citation"]
    if mode == "train":
        base_col = base_col + ["label"]

    # Order matters: it determines the column order of the result.
    pieces = [df_context[base_col]]
    for extra in (df_feature_scibert_sim, df_feature_ce_score, df_feature_oagbert_sim):
        pieces.append(extra.drop(id_cols, axis=1))

    df_feature = pd.concat(pieces, axis=1)

    cos_sim_cols = [name for name in df_feature.columns if "cos_sim" in name]
    df_feature["sim_multiple"] = df_feature[cos_sim_cols].prod(axis=1)

    return df_feature

In [13]:
# Assemble the train and test feature tables. Every frame is passed by keyword
# so it is bound to the intended parameter regardless of its position.
df_train_feature = merge_feature_title_context_ref_title_abstract(
    df_context=df_train_context,
    df_feature_title_vector=df_train_title_vector,
    df_feature_ref_title_vector=df_train_ref_title_vector,
    df_feature_context_vector=df_train_context_vector,
    df_feature_keyword_vector=df_train_scibert_keyword_vector,
    df_feature_ref_keyword_vector=df_train_scibert_ref_keyword_vector,
    df_feature_abstract_vector=df_train_abstract_vector,
    df_feature_ref_abstract_vector=df_train_ref_abstract_vector,
    df_feature_org_vector=df_train_org_vector,
    df_feature_ref_org_vector=df_train_ref_org_vector,
    df_feature_venue_vector=df_train_venue_vector,
    df_feature_ref_venue_vector=df_train_ref_venue_vector,
    df_feature_title_abst_context_kw_vector=df_train_title_abst_context_kw_vector,
    df_feature_ref_title_ref_abstract_vector=df_train_ref_title_ref_abstract_vector,
    df_feature_scibert_sim=df_train_scibert_sim_vector,
    df_feature_oagbert_sim=df_train_oagbert_sim,
    df_feature_ce_score=df_train_ce_score,
    df_feature_dblp=df_train_dblp,
    mode="train",
)

df_test_feature = merge_feature_title_context_ref_title_abstract(
    df_context=df_test_pub_context,
    df_feature_title_vector=df_test_pub_title_vector,
    df_feature_ref_title_vector=df_test_pub_ref_title_vector,
    df_feature_context_vector=df_test_pub_context_vector,
    df_feature_keyword_vector=df_test_pub_scibert_keyword_vector,
    df_feature_ref_keyword_vector=df_test_pub_scibert_ref_keyword_vector,
    df_feature_abstract_vector=df_test_pub_abstract_vector,
    df_feature_ref_abstract_vector=df_test_pub_ref_abstract_vector,
    df_feature_org_vector=df_test_pub_org_vector,
    df_feature_ref_org_vector=df_test_pub_ref_org_vector,
    df_feature_venue_vector=df_test_pub_venue_vector,
    df_feature_ref_venue_vector=df_test_pub_ref_venue_vector,
    df_feature_title_abst_context_kw_vector=df_test_pub_title_abst_context_kw_vector,
    df_feature_ref_title_ref_abstract_vector=df_test_pub_ref_title_ref_abstract_vector,
    df_feature_scibert_sim=df_test_pub_scibert_sim_vector,
    df_feature_oagbert_sim=df_test_pub_oagbert_sim,
    df_feature_ce_score=df_test_pub_ce_score,
    df_feature_dblp=df_test_pub_dblp,
    mode="test",
)

# Add the per-pid relative features first, then the log1p variants that are
# derived from the normalised columns.
df_train_feature = apply_log(normalize_by_pid_feature(df_train_feature))
df_test_feature = apply_log(normalize_by_pid_feature(df_test_feature))

In [14]:
# Display the shapes of the train / test context frames for reference.
df_train_context.shape, df_test_pub_context.shape

((30893, 34), (21003, 33))

In [15]:
# Display the shapes of the assembled feature tables; the row counts should
# match those of the context frames they were built from.
df_train_feature.shape, df_test_feature.shape

((30893, 881), (21003, 880))

In [16]:
# Echo the destination directory before writing anything to it.
output_dir

'../output/feat_without_emb'

In [17]:

# Persist the feature tables. exist_ok=True keeps this cell re-runnable: a bare
# os.makedirs(output_dir) raises FileExistsError once the directory exists,
# i.e. on every run after the first.
os.makedirs(output_dir, exist_ok=True)
df_train_feature.to_csv(f"{output_dir}/df_train_feature.csv", index=False)
df_test_feature.to_csv(f"{output_dir}/df_test_feature.csv", index=False)