In [1]:
import pandas as pd
import lightgbm
import numpy as np
import math
import os

from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
    precision_recall_curve,
    log_loss,
)
from sklearn.model_selection import StratifiedKFold, GroupKFold
from typing import List, Dict

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import LGBMRanker

from tqdm import tqdm

import json
from bs4 import BeautifulSoup
import optuna

from catboost import CatBoostClassifier, Pool, CatBoostRanker
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import boxcox

  from optuna import progress_bar as pbar_module


In [2]:
# Load the pre-computed feature tables for the train and test splits.
# The cosine-similarity columns in these frames are cleaned further below.
df_feature = pd.read_csv("../output/dblp_title_2_oag/df_train_feature.csv")
df_test_feature = pd.read_csv("../output/dblp_title_2_oag/df_test_feature.csv")


#df_feature = pd.read_csv("../output/dblp_title_2_oag2/df_train_feature.csv")
#df_test_feature = pd.read_csv("../output/dblp_title_2_oag2/df_test_feature.csv")


In [3]:
# Load the context tables for the train and test splits. clean_cos_sim checks
# their columns for missing values to decide which similarity features to mask.
df_context = pd.read_csv("../data/train_context.csv")
df_test_context = pd.read_csv("../data/test_pub_gen_context_filled_citation.csv")

In [4]:
# Similarity feature names with the shared "scibert_cos_sim_" prefix removed.
cols = [
    feature_name.replace("scibert_cos_sim_", "")
    for feature_name in df_feature.columns
    if "cos_sim" in feature_name
]

# Partition the names by whether they mention a reference ("ref") field.
target_ref_cols = [name for name in cols if "ref" in name]
target_target_cols = [name for name in cols if "ref" not in name]

# "<target>_ref_<field>" -> [<target>, "ref_<field>"], cut at the first "_ref_".
target_ref_cols_split = [
    [parts[0], "ref_" + parts[1]]
    for parts in (name.split("_ref_", 1) for name in target_ref_cols)
]

# "<first>_<rest>" -> [<first>, <rest>], cut at the first underscore only, so
# multi-word leading fields end up split after their first word.
target_target_col_split = [
    [parts[0], parts[1]]
    for parts in (name.split("_", 1) for name in target_target_cols)
]

# target_ref_cols = [[split[0], "ref_" + split[1], col] for split, col in zip(target_ref_cols_split, target_ref_cols)]
# target_target_cols = [[split[0], split[1], col] for split, col in zip(target_target_col_split, target_target_cols)]

In [5]:
# Inspect the [target field, reference field] pairs derived from the column names.
target_ref_cols_split

[['title', 'ref_title'],
 ['title', 'ref_abstract'],
 ['title', 'ref_keywords'],
 ['title', 'ref_venue'],
 ['title', 'ref_org'],
 ['title', 'ref_introduction'],
 ['title', 'ref_title_ref_abstract'],
 ['title', 'ref_title_ref_abstract_ref_keyword'],
 ['abstract', 'ref_title'],
 ['abstract', 'ref_abstract'],
 ['abstract', 'ref_keywords'],
 ['abstract', 'ref_venue'],
 ['abstract', 'ref_org'],
 ['abstract', 'ref_introduction'],
 ['abstract', 'ref_title_ref_abstract'],
 ['abstract', 'ref_title_ref_abstract_ref_keyword'],
 ['context', 'ref_title'],
 ['context', 'ref_abstract'],
 ['context', 'ref_keywords'],
 ['context', 'ref_venue'],
 ['context', 'ref_org'],
 ['context', 'ref_introduction'],
 ['context', 'ref_title_ref_abstract'],
 ['context', 'ref_title_ref_abstract_ref_keyword'],
 ['keywords', 'ref_title'],
 ['keywords', 'ref_abstract'],
 ['keywords', 'ref_keywords'],
 ['keywords', 'ref_venue'],
 ['keywords', 'ref_org'],
 ['keywords', 'ref_introduction'],
 ['keywords', 'ref_title_ref_abstr

In [6]:
# Inspect the pairs derived from the feature names that do not contain "ref".
target_target_col_split

[['title', 'context'],
 ['abstract', 'context'],
 ['keywords', 'context'],
 ['venue', 'context'],
 ['org', 'context'],
 ['introduction', 'context'],
 ['conclusion', 'context'],
 ['related', 'work_context'],
 ['title', 'abstract_context'],
 ['title', 'abstract_keyword_context'],
 ['title', 'abstract_context_keyword_context'],
 ['c', 'sentence_inspired'],
 ['c', 'sentence_core'],
 ['c', 'sentence_essential'],
 ['c', 'sentence_all']]

In [7]:
# "related_work_context" was cut at its first underscore, which produced
# ["related", "work_context"]; swap in the intended ["related_work", "context"].
# The membership guard makes the cell safe to re-run: without it a second
# execution raises ValueError because the pair has already been removed.
mis_split_pair = ["related", "work_context"]
if mis_split_pair in target_target_col_split:
    target_target_col_split.remove(mis_split_pair)
    target_target_col_split.append(["related_work", "context"])
target_target_col_split

[['title', 'context'],
 ['abstract', 'context'],
 ['keywords', 'context'],
 ['venue', 'context'],
 ['org', 'context'],
 ['introduction', 'context'],
 ['conclusion', 'context'],
 ['title', 'abstract_context'],
 ['title', 'abstract_keyword_context'],
 ['title', 'abstract_context_keyword_context'],
 ['c', 'sentence_inspired'],
 ['c', 'sentence_core'],
 ['c', 'sentence_essential'],
 ['c', 'sentence_all'],
 ['related_work', 'context']]

In [8]:
# Sanity check: smallest value of one similarity feature before cleaning.
df_feature["scibert_cos_sim_context_ref_title"].min()

0.098871544

In [9]:
def clean_cos_sim(df_context, df_feature, cols_split):
    """Mask and re-impute similarity features that were built from missing text.

    For every ``[target_col, ref_col]`` pair, the feature column
    ``scibert_cos_sim_<target_col>_<ref_col>`` is considered unreliable on the
    rows where either source column of ``df_context`` is null. Those values are
    replaced by the mean of the remaining values within the same ``pid`` group,
    or by 0 when the group has no remaining value.

    A pair is skipped when one of its source columns is missing from
    ``df_context``, when its feature column is missing from ``df_feature``, or
    when no row needs masking.

    Args:
        df_context: DataFrame holding the source columns named in
            ``cols_split``. A boolean mask built from it is applied to
            ``df_feature``, so both frames must share the same index.
        df_feature: DataFrame with a ``pid`` column and the
            ``scibert_cos_sim_*`` feature columns. It is modified in place.
        cols_split: Iterable of ``[target_col, ref_col]`` pairs.

    Returns:
        The same ``df_feature`` object, after cleaning.
    """
    for cols in cols_split:
        target_col = cols[0]
        ref_col = cols[1]

        # A target named "c" is mapped onto the "context" feature prefix.
        # NOTE(review): such a pair only gets past the column check below when
        # df_context really has a column called "c" - confirm this is intended.
        feat_prefix = "context" if target_col == "c" else target_col
        feat_col = "scibert_cos_sim_" + feat_prefix + "_" + ref_col

        if target_col not in df_context.columns or ref_col not in df_context.columns:
            continue

        # Skip pairs without a matching feature column instead of failing with
        # KeyError, mirroring how missing context columns are handled above.
        if feat_col not in df_feature.columns:
            continue

        # Rows where at least one of the two source fields is missing.
        invalid = df_context[target_col].isnull() | df_context[ref_col].isnull()

        masked_shape = df_feature.loc[invalid, feat_col].shape
        if masked_shape[0] == 0:
            continue

        print(feat_col)
        print(masked_shape)

        df_feature.loc[invalid, feat_col] = np.nan

        # Impute with the per-pid mean of the values that survived masking;
        # groups left without any value fall back to 0.
        group_mean = df_feature.groupby("pid")[feat_col].transform("mean")
        df_feature[feat_col] = df_feature[feat_col].fillna(group_mean).fillna(0)

        print(df_feature[feat_col].isnull().sum())

    return df_feature

In [10]:
# Clean the training features for every column pair. clean_cos_sim modifies
# df_feature in place and prints, per processed column, its name, the shape of
# the masked selection and the remaining null count.
df_feature = clean_cos_sim(df_context, df_feature, target_ref_cols_split + target_target_col_split)

scibert_cos_sim_title_ref_title
(470,)
0
scibert_cos_sim_title_ref_abstract
(8738,)
0
scibert_cos_sim_title_ref_keywords
(7661,)
0
scibert_cos_sim_title_ref_venue
(8291,)
0
scibert_cos_sim_title_ref_org
(12002,)
0
scibert_cos_sim_title_ref_introduction
(28082,)
0
scibert_cos_sim_abstract_ref_title
(510,)
0
scibert_cos_sim_abstract_ref_abstract
(8749,)
0
scibert_cos_sim_abstract_ref_keywords
(7674,)
0
scibert_cos_sim_abstract_ref_venue
(8304,)
0
scibert_cos_sim_abstract_ref_org
(12005,)
0
scibert_cos_sim_abstract_ref_introduction
(28082,)
0
scibert_cos_sim_context_ref_title
(2509,)
0
scibert_cos_sim_context_ref_abstract
(10059,)
0
scibert_cos_sim_context_ref_keywords
(9057,)
0
scibert_cos_sim_context_ref_venue
(9646,)
0
scibert_cos_sim_context_ref_org
(13082,)
0
scibert_cos_sim_context_ref_introduction
(28229,)
0
scibert_cos_sim_keywords_ref_title
(470,)
0
scibert_cos_sim_keywords_ref_abstract
(8738,)
0
scibert_cos_sim_keywords_ref_keywords
(7661,)
0
scibert_cos_sim_keywords_ref_venue
(

In [11]:
# Apply the same cleaning to the test features, using the test context table
# and the column pairs that were derived from the training feature names.
df_test_feature = clean_cos_sim(df_test_context, df_test_feature, target_ref_cols_split + target_target_col_split)

scibert_cos_sim_title_ref_title
(348,)
0
scibert_cos_sim_title_ref_abstract
(5713,)
0
scibert_cos_sim_title_ref_keywords
(5155,)
0
scibert_cos_sim_title_ref_venue
(6596,)
0
scibert_cos_sim_title_ref_org
(7901,)
0
scibert_cos_sim_title_ref_introduction
(18045,)
0
scibert_cos_sim_abstract_ref_title
(378,)
0
scibert_cos_sim_abstract_ref_abstract
(5739,)
0
scibert_cos_sim_abstract_ref_keywords
(5181,)
0
scibert_cos_sim_abstract_ref_venue
(6622,)
0
scibert_cos_sim_abstract_ref_org
(7922,)
0
scibert_cos_sim_abstract_ref_introduction
(18049,)
0
scibert_cos_sim_context_ref_title
(2567,)
0
scibert_cos_sim_context_ref_abstract
(7161,)
0
scibert_cos_sim_context_ref_keywords
(6676,)
0
scibert_cos_sim_context_ref_venue
(7958,)
0
scibert_cos_sim_context_ref_org
(9166,)
0
scibert_cos_sim_context_ref_introduction
(18260,)
0
scibert_cos_sim_keywords_ref_title
(348,)
0
scibert_cos_sim_keywords_ref_abstract
(5713,)
0
scibert_cos_sim_keywords_ref_keywords
(5155,)
0
scibert_cos_sim_keywords_ref_venue
(6596

In [12]:
# Directory that receives the cleaned train/test feature tables.
dir_path = "../output/dblp_title_2_oag_clean"

In [13]:

import re

# Make sure the destination directory exists before anything is written.
os.makedirs(dir_path, exist_ok=True)

# Every run of characters other than ASCII letters, digits and underscores is
# removed from the column names.
# NOTE(review): presumably needed because a downstream model rejects special
# characters in feature names - confirm.
_disallowed_chars = re.compile("[^A-Za-z0-9_]+")


def _sanitize_column_name(name):
    """Return ``name`` with every disallowed character run removed."""
    return _disallowed_chars.sub("", name)


df_feature = df_feature.rename(columns=_sanitize_column_name)
df_test_feature = df_test_feature.rename(columns=_sanitize_column_name)

# Persist both cleaned tables (the DataFrame index is written as well).
df_feature.to_csv(f"{dir_path}/df_train_feature.csv")
df_test_feature.to_csv(f"{dir_path}/df_test_feature.csv")