In [1]:
import pandas as pd
import numpy as np

import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,GroupKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import gc
import re
from sklearn.metrics import roc_auc_score
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
import polars as pl
from pathlib import Path
from glob import glob
import json
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_word2vec_feature(seq,emb,feat,ikx,ext='',prex = '',feature=[]):
    """Load the cached Word2Vec model for these sequences, or train and cache one.

    Parameters
    ----------
    seq : iterable of iterables
        One token sequence per group; every token is passed through ``str``.
    emb : int
        ``vector_size`` used when a model has to be trained.
    feat : list of str
        Joined with ``_`` to form part of the cache file name.
    ikx : any
        Not used by this function; kept so existing calls keep working.
    ext, prex : str
        Further components of the cache file name.
    feature : list
        Not used (and never mutated) here; kept so existing calls keep working.

    Returns
    -------
    The loaded or freshly trained ``Word2Vec`` model.
    """
    model_path = 'w2v/w2v_model_{}_{}_{}.model'.format(prex,'_'.join(feat),ext)

    # A cached model always wins, even if `seq` or `emb` differ from the run that
    # produced it -- delete the file to force a retrain.
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)

    sentence = [[str(token) for token in tokens] for tokens in seq]
    model = Word2Vec(sentence, vector_size=emb, window=5, min_count=1, workers=8, epochs=10, sg=1, seed=42)
    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    return model

def generate_w2v_feat(df,prex,col_name,dim):
    """Build mean Word2Vec embeddings of `col_name` per `prex` group and dump them.

    Rows of `df` are grouped by `prex`; each group's `col_name` values form one
    "sentence". The per-group feature vector is the mean of the token vectors.
    The frame ``[prex, col_name, <dim embedding columns>]`` is written with
    joblib to ``feats/w2v_{prex}_{col_name}_emb.pkl``. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `prex` and `col_name`.
    prex : str
        Grouping column.
    col_name : str
        Column whose values are the tokens.
    dim : int
        Embedding dimension.
    """
    grouped = df.groupby(prex)[col_name].apply(list).reset_index()
    model = get_word2vec_feature(grouped[col_name].values,dim,[prex,col_name],col_name,ext='{}'.format(dim),prex = prex,feature=[])

    emb_rows = []
    for tokens in tqdm(grouped[col_name].values):
        # A model loaded from the cache may not know every token; skip those
        # instead of raising KeyError.
        vectors = [model.wv[str(tok)] for tok in tokens if str(tok) in model.wv]
        if vectors:
            emb_rows.append(np.mean(vectors,axis = 0))
        else:
            emb_rows.append(np.zeros(dim, dtype=np.float32))
    emb_matrix = np.array(emb_rows)

    feature = []
    for i in range(dim):
        emb_col = '{}_{}_{}'.format(prex,col_name + '_emb_mean',i)
        grouped[emb_col] = emb_matrix[:,i]
        feature.append(emb_col)

    # The output directory may not exist on a fresh checkout.
    os.makedirs('feats', exist_ok=True)
    joblib.dump(grouped[[prex,col_name] +feature ],'feats/w2v_{}_{}_emb.pkl'.format(prex,col_name))

In [3]:
def tfidf_char(input_values, output_num, output_prefix, seed=1024):
    """Character n-gram TF-IDF of `input_values`, reduced to `output_num` SVD components.

    Uses ``char_wb`` n-grams of length 1 to 4. Returns a DataFrame with a fresh
    0..n-1 index and columns ``{output_prefix}_tfidf_char_{i}``.
    """
    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 4))
    sparse_matrix = vectorizer.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_tfidf_char_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(sparse_matrix), columns=column_names)

def tfidf_word(input_values, output_num, output_prefix, seed=1024):
    """Word n-gram TF-IDF of `input_values`, reduced to `output_num` SVD components.

    Uses n-grams of length 1 to 4 with ``sublinear_tf=True``. Returns a
    DataFrame with a fresh 0..n-1 index and columns
    ``{output_prefix}_tfidf_word_{i}``.
    """
    vectorizer = TfidfVectorizer(sublinear_tf = True, ngram_range=(1, 4))
    sparse_matrix = vectorizer.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_tfidf_word_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(sparse_matrix), columns=column_names)

def count2vec(input_values, output_num, output_prefix, seed=1024):
    """Raw n-gram counts of `input_values`, reduced to `output_num` SVD components.

    Uses ``CountVectorizer`` with n-grams of length 1 to 4. Returns a DataFrame
    with a fresh 0..n-1 index and columns ``{output_prefix}_countvec_{i}``.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 4))
    count_matrix = vectorizer.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_countvec_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(count_matrix), columns=column_names)


def get_tfidf(tmp,group_id, group_target, num):
    """Return `tmp[group_id]` next to TF-IDF and count SVD features of one text column.

    Parameters
    ----------
    tmp : pandas.DataFrame
        Holds the id column(s) and the text column.
    group_id : list of str
        Id column name(s) carried through unchanged.
    group_target : str
        Text column; also used as the prefix of the feature column names.
    num : int
        Number of SVD components for each of the two feature families.

    Returns
    -------
    pandas.DataFrame
        Indexed like `tmp`: id column(s), then `num` word-TF-IDF columns, then
        `num` count-vector columns.
    """
    texts = tmp[group_target]
    tfidf_part = tfidf_word(texts, num, group_target)
    count_part = count2vec(texts, num, group_target)

    # The feature frames come back with a fresh 0..n-1 index, while concat aligns
    # on index labels. Re-label them with the input's index so rows match by
    # position even when `tmp` does not have a default index.
    tfidf_part.index = tmp.index
    count_part.index = tmp.index
    return pd.concat([tmp[group_id], tfidf_part, count_part], axis=1)

In [4]:
# Raw paper metadata; later cells index it by paper id (e.g. pid[PID]['keywords']).
with open('../IND-WhoIsWho/pid_to_info_all.json', 'r') as file:
    pid = json.load(file)
# The three splits; later cells read their 'autherID', 'PID' and 'autherName' columns.
train = pd.read_feather('data/train.feather')
valid = pd.read_feather('data/valid.feather')
test = pd.read_feather('data/test.feather')

# NOTE(review): joblib.load unpickles the file -- only load a pid_df.pkl you produced yourself.
piddf = joblib.load('data/pid_df.pkl')

# All splits stacked into one frame with a fresh 0..n-1 index.
data = pd.concat([train,valid,test]).reset_index(drop = True)

In [5]:
# Characters removed outright by text_clean (ASCII and full-width punctuation).
_TEXT_CLEAN_DROP = str.maketrans('', '', '-?=～—/_？）￥:#\\\'.》”^>$]}|+)、（&{`《,(%!“<’"】；【‘~*@…：，。[;')


def text_clean(x,mode = 1):
    """Lower-case a string, delete punctuation and collapse runs of spaces.

    Parameters
    ----------
    x : str
        Text to clean.
    mode : any
        Only the value ``'venue'`` has an effect: digit runs are removed too.

    Returns
    -------
    str
        The cleaned text. Punctuation is deleted, not replaced by a space, so
        ``'self-adjusting'`` becomes ``'selfadjusting'``. Leading and trailing
        spaces are kept (collapsed to at most one).
    """
    x = x.lower().translate(_TEXT_CLEAN_DROP)

    if mode == 'venue':
        x = re.sub(r'\d+', '', x)

    # Collapse every run of spaces to a single space. A fixed number of
    # replace('  ', ' ') passes would leave doubles behind in long runs.
    return re.sub(r' {2,}', ' ', x)
# Clean the free-text columns in place. text_clean calls x.lower(), so a missing
# (non-string) title or abstract would raise here; only venue is filled first.
piddf['title'] = piddf['title'].apply(text_clean)
piddf['abstract'] = piddf['abstract'].apply(text_clean)

piddf['venue'] = piddf['venue'].fillna('')
# mode='venue' additionally strips digit runs.
piddf['venue'] = piddf['venue'].apply(lambda x:text_clean(x,mode = 'venue'))

In [6]:
# Title TF-IDF / count-vector SVD features, one row per paper.
piddf = piddf.reset_index(drop = True)
piddf['title'] = piddf['title'].apply(lambda x:x.lower())
# 1-based row id handed to get_tfidf as the id column.
piddf['index'] = piddf.index + 1

# Names of the feature columns added below. They are tracked explicitly rather
# than sliced by position (piddf.columns[8:]), which silently depends on how
# many columns piddf happens to have.
title_feat_cols = []
for f in ['title']:
    print(f)
    piddf[f] = piddf[f].fillna('')
    tmp = piddf[['index',f]]

    tfidf_df = get_tfidf(tmp, ['index'], f, 32)
    new_cols = list(tfidf_df.columns[1:])
    # A separate loop name keeps the outer `f` intact.
    for col in new_cols:
        piddf[col] = tfidf_df[col]
    title_feat_cols.extend(new_cols)

    del tmp,tfidf_df

temp = piddf[['id'] + title_feat_cols].rename(columns={'id': 'PID'})
temp.to_feather('feats/title_tfidf2vec_feat.feather')

title


In [7]:
# PID -> title feature vector (every column of `temp` after 'PID').
pid_title_dict = dict(zip(temp['PID'],temp[temp.columns[1:]].values))
# Column-wise mean vector, used below as the fallback for PIDs missing from the dict.
mean_vec = temp[temp.columns[1:]].values.mean(axis = 0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and each row of `embedding`.

    Parameters
    ----------
    target : 1-D array-like
        The reference vector.
    embedding : sequence of 1-D array-likes (or 2-D array)
        Vectors to compare against; may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per row of `embedding` (empty if `embedding` is empty).
        A zero-norm vector on either side yields NaN for that entry.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    # An empty list becomes a shape-(0,) array, which cannot broadcast against
    # `target`; there is simply nothing to compare.
    if embedding.size == 0:
        return np.empty(0, dtype=float)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# For every (author, paper) row: cosine similarity of the paper's title vector
# to the title vectors of the same author's other papers.
#
# Results are keyed by row label and then laid out in `data`'s own row order.
# Appending to a flat list in author-first-appearance order would only line up
# with the rows if each author's rows were contiguous.
assert data.index.is_unique
sim_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    for row_label, f1 in zip(pid_series.index, pid_list):
        x1 = pid_title_dict.get(f1,mean_vec)
        x2 = [pid_title_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with a single paper has nothing to compare against.
        sim_by_row[row_label] = list(cos_similarity(x1, x2)) if x2 else []
ans = [sim_by_row[row_label] for row_label in data.index]
data['pid_title_sim'] = ans


# Summary statistics; an empty similarity list gives NaN (same guard as the
# abstract / keyword / venue cells) instead of raising in np.max / np.min.
for f in ['pid_title_sim']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
    data[f + '_median'] = data[f].apply(lambda x:np.median(x) if len(x) > 0 else np.nan)
data = data.drop(['autherName'],axis = 1)

data.to_feather('feats/title_tfidf_sim.feather')

100%|██████████| 15/15 [00:01<00:00,  9.20it/s]


In [12]:
# Abstract TF-IDF / count-vector SVD features, one row per paper.
piddf = piddf.reset_index(drop = True)
piddf['abstract'] = piddf['abstract'].apply(lambda x:x.lower())
# 1-based row id handed to get_tfidf as the id column.
piddf['index'] = piddf.index + 1

for f in [
       'abstract']:
    print(f)
    # Runs after .lower() above, so it cannot protect that call from missing values.
    piddf[f] = piddf[f].fillna('')
    tmp = piddf[['index',f]]

    tfidf_df = get_tfidf(tmp, ['index'], f, 32)
    # NOTE: this inner loop rebinds the outer loop variable `f`; harmless only
    # because `f` is not read again in the outer body.
    for f in tfidf_df.columns[1:]:
        piddf[f] = tfidf_df[f]

    del tmp,tfidf_df
    


# Feature columns are selected by the substring 'abstract_' in their name.
temp = piddf[['id']  +[f for f in piddf.columns if 'abstract_' in f]]
temp.columns = ['PID'] +  list(temp.columns[1:])
temp.to_feather('feats/abstract_abstract2vec_feat.feather')

abstract


In [13]:
# PID -> abstract feature vector (every column of `temp` after 'PID').
pid_abstract_dict = dict(zip(temp['PID'],temp[temp.columns[1:]].values))
# Column-wise mean vector, used below as the fallback for PIDs missing from the dict.
mean_vec = temp[temp.columns[1:]].values.mean(axis = 0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and each row of `embedding`.

    NOTE: this function is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.

    Parameters
    ----------
    target : 1-D array-like
        The reference vector.
    embedding : sequence of 1-D array-likes (or 2-D array)
        Vectors to compare against; may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per row of `embedding` (empty if `embedding` is empty).
        A zero-norm vector on either side yields NaN for that entry.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    # An empty list becomes a shape-(0,) array, which cannot broadcast against
    # `target`; there is simply nothing to compare.
    if embedding.size == 0:
        return np.empty(0, dtype=float)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# For every (author, paper) row: cosine similarity of the paper's abstract
# vector to the abstract vectors of the same author's other papers.
#
# Results are keyed by row label and then laid out in `data`'s own row order.
# Appending to a flat list in author-first-appearance order would only line up
# with the rows if each author's rows were contiguous.
assert data.index.is_unique
sim_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    for row_label, f1 in zip(pid_series.index, pid_list):
        x1 = pid_abstract_dict.get(f1,mean_vec)
        x2 = [pid_abstract_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with a single paper has nothing to compare against.
        sims = cos_similarity(x1, x2) if x2 else []
        # `f > -10` is False for NaN, so this drops NaN similarities (zero-norm
        # vectors); real cosine values are never below -1.
        sim_by_row[row_label] = [f for f in sims if f >-10]
ans = [sim_by_row[row_label] for row_label in data.index]

data['pid_abstract_sim'] = ans


# Summary statistics; NaN when a row has no usable similarity.
for f in ['pid_abstract_sim']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
    data[f + '_median'] = data[f].apply(lambda x:np.median(x) if len(x) > 0 else np.nan)


# `data` still carries whatever columns earlier cells added to it.
data.to_feather('feats/abstract_tfidf_sim.feather')

100%|██████████| 15/15 [00:01<00:00,  7.93it/s]


In [14]:
123

123

In [15]:
# Keyword TF-IDF / count-vector SVD features, one row per paper.
# Keywords are joined into one lower-cased string. A non-sequence entry (e.g. a
# missing value) becomes '' instead of crashing ' '.join(); the fillna that
# follows would come too late for that.
piddf['keyword_text'] = piddf['keywords'].apply(
    lambda x:(' '.join(x)).lower() if isinstance(x, (list, tuple, np.ndarray)) else ''
)
piddf['keyword_text'] = piddf['keyword_text'].fillna('')
piddf = piddf.reset_index(drop = True)
# 1-based row id handed to get_tfidf as the id column.
piddf['index'] = piddf.index + 1

for f in ['keyword_text']:
    print(f)
    piddf[f] = piddf[f].fillna('')
    tmp = piddf[['index',f]]

    tfidf_df = get_tfidf(tmp, ['index'], f, 16)
    # A separate loop name keeps the outer `f` intact.
    for col in tfidf_df.columns[1:]:
        piddf[col] = tfidf_df[col]

    del tmp,tfidf_df

# Feature columns are selected by the substring 'keyword_text_' in their name.
temp = piddf[['id']  +[c for c in piddf.columns if 'keyword_text_' in c]]
temp.columns = ['PID'] +  list(temp.columns[1:])
temp.to_feather('feats/keyword_text_keyword2vec_feat.feather')

keyword_text


In [16]:
# PID -> keyword-text feature vector (every column of `temp` after 'PID').
pid_keyword_text_dict = dict(zip(temp['PID'],temp[temp.columns[1:]].values))
# Column-wise mean vector, used below as the fallback for PIDs missing from the dict.
mean_vec = temp[temp.columns[1:]].values.mean(axis = 0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and each row of `embedding`.

    NOTE: this function is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.

    Parameters
    ----------
    target : 1-D array-like
        The reference vector.
    embedding : sequence of 1-D array-likes (or 2-D array)
        Vectors to compare against; may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per row of `embedding` (empty if `embedding` is empty).
        A zero-norm vector on either side yields NaN for that entry.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    # An empty list becomes a shape-(0,) array, which cannot broadcast against
    # `target`; there is simply nothing to compare.
    if embedding.size == 0:
        return np.empty(0, dtype=float)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# For every (author, paper) row: cosine similarity of the paper's keyword-text
# vector to the keyword-text vectors of the same author's other papers.
#
# Results are keyed by row label and then laid out in `data`'s own row order.
# Appending to a flat list in author-first-appearance order would only line up
# with the rows if each author's rows were contiguous.
assert data.index.is_unique
sim_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    for row_label, f1 in zip(pid_series.index, pid_list):
        x1 = pid_keyword_text_dict.get(f1,mean_vec)
        x2 = [pid_keyword_text_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with a single paper has nothing to compare against.
        sims = cos_similarity(x1, x2) if x2 else []
        # `f > -10` is False for NaN, so this drops NaN similarities (zero-norm
        # vectors); real cosine values are never below -1.
        sim_by_row[row_label] = [f for f in sims if f >-10]
ans = [sim_by_row[row_label] for row_label in data.index]

data['pid_keyword_text_sim'] = ans


# Summary statistics; NaN when a row has no usable similarity.
for f in ['pid_keyword_text_sim']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
    data[f + '_median'] = data[f].apply(lambda x:np.median(x) if len(x) > 0 else np.nan)


# `data` still carries whatever columns earlier cells added to it.
data.to_feather('feats/keyword_text_tfidf_sim.feather')

100%|██████████| 15/15 [00:01<00:00,  8.99it/s]


In [17]:
#123456789

In [18]:

# Venue TF-IDF / count-vector SVD features, one row per paper.
piddf = piddf.reset_index(drop = True)
piddf['venue'] = piddf['venue'].fillna('')
piddf['venue'] = piddf['venue'].apply(lambda x:x.lower())
# 1-based row id handed to get_tfidf as the id column.
piddf['index'] = piddf.index + 1

for f in [
       'venue']:
    print(f)
    piddf[f] = piddf[f].fillna('')
    tmp = piddf[['index',f]]

    tfidf_df = get_tfidf(tmp, ['index'], f, 16)
    # NOTE: this inner loop rebinds the outer loop variable `f`; harmless only
    # because `f` is not read again in the outer body.
    for f in tfidf_df.columns[1:]:
        piddf[f] = tfidf_df[f]

    del tmp,tfidf_df
# Feature columns are selected by the substring 'venue_' in their name.
temp = piddf[['id']  +[f for f in piddf.columns if 'venue_' in f]]
temp.columns = ['PID'] +  list(temp.columns[1:])
temp.to_feather('feats/venue_venue2vec_feat.feather')

# PID -> venue feature vector (every column of `temp` after 'PID').
pid_venue_dict = dict(zip(temp['PID'],temp[temp.columns[1:]].values))
# Column-wise mean vector, used below as the fallback for PIDs missing from the dict.
mean_vec = temp[temp.columns[1:]].values.mean(axis = 0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and each row of `embedding`.

    NOTE: this function is defined identically in several cells of this
    notebook; the most recently executed definition is the one in effect.

    Parameters
    ----------
    target : 1-D array-like
        The reference vector.
    embedding : sequence of 1-D array-likes (or 2-D array)
        Vectors to compare against; may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per row of `embedding` (empty if `embedding` is empty).
        A zero-norm vector on either side yields NaN for that entry.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    # An empty list becomes a shape-(0,) array, which cannot broadcast against
    # `target`; there is simply nothing to compare.
    if embedding.size == 0:
        return np.empty(0, dtype=float)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# For every (author, paper) row: cosine similarity of the paper's venue vector
# to the venue vectors of the same author's other papers.
#
# Results are keyed by row label and then laid out in `data`'s own row order.
# Appending to a flat list in author-first-appearance order would only line up
# with the rows if each author's rows were contiguous.
assert data.index.is_unique
sim_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    for row_label, f1 in zip(pid_series.index, pid_list):
        x1 = pid_venue_dict.get(f1,mean_vec)
        x2 = [pid_venue_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with a single paper has nothing to compare against.
        sims = cos_similarity(x1, x2) if x2 else []
        # `f > -10` is False for NaN, so this drops NaN similarities (zero-norm
        # vectors); real cosine values are never below -1.
        sim_by_row[row_label] = [f for f in sims if f >-10]
ans = [sim_by_row[row_label] for row_label in data.index]

data['pid_venue_sim'] = ans


# Summary statistics; NaN when a row has no usable similarity.
for f in ['pid_venue_sim']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
    data[f + '_median'] = data[f].apply(lambda x:np.median(x) if len(x) > 0 else np.nan)


# `data` still carries whatever columns earlier cells added to it.
data.to_feather('feats/venue_tfidf_sim.feather')

venue


100%|██████████| 15/15 [00:01<00:00,  8.47it/s]


In [19]:
# Start again from clean inputs: rebuild `data` from the splits and reload the
# per-paper frame, discarding every column the cells above added to either.
data = pd.concat([train,valid,test]).reset_index(drop = True)
# NOTE(review): joblib.load unpickles the file -- only load a pid_df.pkl you produced yourself.
piddf = joblib.load('data/pid_df.pkl')


In [20]:


# --- PID -> list of lower-cased keywords -------------------------------------
ans = []
# The loop variable is not called `pid`, so the metadata dict of that name
# loaded earlier is not overwritten.
for paper_id,keyword in tqdm(piddf[['id','keywords']].values):
    # Type check first: len() on a missing (non-list) value would raise.
    if isinstance(keyword, list) and len(keyword)>0:
        for key in keyword:
            ans.append([paper_id,key])
df = pd.DataFrame(ans,columns = ['PID','keyword'])
df['keyword'] = df['keyword'].apply(lambda x:x.lower())
df = df.groupby('PID')['keyword'].agg(list).reset_index()
pid_keyword_dict = dict(df.values)

# --- keyword overlap between each paper and the author's other papers --------
# Each row gets a list of [intersection size, union size] pairs. Results are
# keyed by row label and then laid out in `data`'s own row order, so they stay
# aligned even if an author's rows are not contiguous.
assert data.index.is_unique
pairs_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    # Build each paper's keyword set once per author instead of once per pair.
    keyword_sets = [set(pid_keyword_dict.get(p,[])) for p in pid_list]
    for row_label, f1, x1 in zip(pid_series.index, pid_list, keyword_sets):
        pairs_by_row[row_label] = [
            [len(x1 & x2),len(x1 | x2)]
            for f2, x2 in zip(pid_list, keyword_sets) if f1 != f2
        ]
ans = [pairs_by_row[row_label] for row_label in data.index]

data['pid_sim'] = ans
data['pid_keyword_cnt_1'] = data['pid_sim'].apply(lambda x:[f[0] for f in x])
data['pid_keyword_cnt_2'] = data['pid_sim'].apply(lambda x:[f[1] for f in x])
# Smoothed Jaccard ratio: (|A & B| + 1) / (|A | B| + 1).
data['pid_keyword_cnt_3'] = data['pid_sim'].apply(lambda x:[(f[0]+1) / (f[1] + 1) for f in x])


# Summary statistics; an author with a single paper has no pairs, which gives
# NaN instead of raising in np.max / np.min.
for f in ['pid_keyword_cnt_1','pid_keyword_cnt_2','pid_keyword_cnt_3']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
    data[f + '_median'] = data[f].apply(lambda x:np.median(x) if len(x) > 0 else np.nan)

data = data.drop(['autherName','pid_keyword_cnt_1','pid_keyword_cnt_2','pid_keyword_cnt_3'],axis = 1)
data.to_feather('feats/keywords_jaccard_sim.feather')

100%|██████████| 2957/2957 [00:00<00:00, 3130.40it/s]
100%|██████████| 15/15 [00:02<00:00,  5.23it/s]


In [21]:
# NOTE(review): joblib.load unpickles the file -- only load a pid_df.pkl you produced yourself.
piddf = joblib.load('data/pid_df.pkl')

# One row per (paper, listed author) with lower-cased name and organisation.
ans = []
# The loop variable is not called `pid`, so the metadata dict of that name
# loaded earlier is not overwritten.
for paper_id,auther in tqdm(piddf[['id','authors']].values):
    # Type check first: len() on a missing (non-list) value would raise.
    if isinstance(auther, list) and len(auther)>0:
        for key in auther:
            ans.append([paper_id,key['name'],key['org']])
df = pd.DataFrame(ans,columns = ['PID','autherName','author_org'])
df['autherName'] = df['autherName'].apply(lambda x:x.lower())
df['author_org'] = df['author_org'].apply(lambda x:x.lower())

from pypinyin import pinyin, Style
def chinese2ping_name(x):
    """Romanise a name with pypinyin and move its first syllable to the end.

    The syllables after the first are concatenated without separators, followed
    by a space and the first syllable (presumably the family name, which comes
    first in Chinese name order). For example '高阳' gives 'yang gao'.
    """
    syllables = pinyin(x,Style.NORMAL)
    leading = syllables[0][0]
    remainder = ''.join(part[0] for part in syllables[1:])
    return remainder + ' ' + leading


def is_contain_chinese(text):
    """Return True if `text` contains at least one character in U+4E00..U+9FA5."""
    # The character class covers the code-point range used here to detect Chinese characters.
    return re.search(r'[\u4e00-\u9fa5]', text) is not None


# Hard-coded override: any name containing '高阳' becomes 'yang gao'.
df.loc[df['autherName'].apply(lambda x:'高阳' in x),'autherName'] = 'yang gao'
# Romanise the remaining names that contain Chinese characters (mask computed once).
has_chinese = df['autherName'].apply(is_contain_chinese)
df.loc[has_chinese,'autherName'] = df.loc[has_chinese,'autherName'].map(chinese2ping_name)
# Treat '.' and '-' as token separators.
df['autherName'] = df['autherName'].apply(lambda x:x.replace('.',' ').replace('-',' '))
# Order-insensitive form: non-empty tokens sorted alphabetically and re-joined,
# so 'gao yang' and 'yang gao' compare equal.
df['revision'] = df['autherName'].apply(lambda x:' '.join(sorted([f for f in x.split(' ') if len(f)>0])))
# PID -> list of normalised author names.
df = df.groupby('PID')['revision'].agg(list).reset_index()

100%|██████████| 2957/2957 [00:00<00:00, 224379.14it/s]


In [22]:
# PID -> list of normalised co-author names.
pid_auther_dict = dict(df.values)

# Rebuild `data` before the loop, so the loop iterates over the frame it writes
# to rather than over whatever an earlier cell left in `data`.
data = pd.concat([train,valid,test]).reset_index(drop = True)

# Each row gets a list of [intersection size, union size] pairs against the
# same author's other papers. Results are keyed by row label and then laid out
# in `data`'s own row order, so they stay aligned even if an author's rows are
# not contiguous.
pairs_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    # Build each paper's co-author set once per author instead of once per pair.
    author_sets = [set(pid_auther_dict.get(p,[])) for p in pid_list]
    for row_label, f1, x1 in zip(pid_series.index, pid_list, author_sets):
        pairs_by_row[row_label] = [
            [len(x1 & x2),len(x1 | x2)]
            for f2, x2 in zip(pid_list, author_sets) if f1 != f2
        ]
ans = [pairs_by_row[row_label] for row_label in data.index]

data['pid_sim'] = ans

100%|██████████| 15/15 [00:03<00:00,  4.51it/s]


In [23]:
# Split the [intersection, union] pairs into separate per-row lists.
data['pid_auther_cnt_1'] = data['pid_sim'].apply(lambda x:[f[0] for f in x])
data['pid_auther_cnt_2'] = data['pid_sim'].apply(lambda x:[f[1] for f in x])
# Smoothed Jaccard ratio: (|A & B| + 1) / (|A | B| + 1).
data['pid_auther_cnt_3'] = data['pid_sim'].apply(lambda x:[(f[0]+1) / (f[1] + 1) for f in x])

# Summary statistics (no median for this feature family). An author with a
# single paper has no pairs, which gives NaN instead of raising in np.max / np.min.
for f in ['pid_auther_cnt_1','pid_auther_cnt_2','pid_auther_cnt_3']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
data = data.drop(['autherName','pid_auther_cnt_1','pid_auther_cnt_2','pid_auther_cnt_3'],axis = 1)

In [24]:
# Persist the normalised-author-name overlap features; `data` still includes the raw 'pid_sim' pair lists.
data.to_feather('feats/autherName_jaccard_sim.feather')

In [25]:

# Same overlap features as the normalised-name version, but on the raw
# (lower-cased only) author names.
# NOTE(review): joblib.load unpickles the file -- only load a pid_df.pkl you produced yourself.
piddf = joblib.load('data/pid_df.pkl')
data = pd.concat([train,valid,test]).reset_index(drop = True)

# --- PID -> list of lower-cased author names ---------------------------------
ans = []
# The loop variable is not called `pid`, so the metadata dict of that name
# loaded earlier is not overwritten.
for paper_id,auther in tqdm(piddf[['id','authors']].values):
    # Type check first: len() on a missing (non-list) value would raise.
    if isinstance(auther, list) and len(auther)>0:
        for key in auther:
            ans.append([paper_id,key['name']])
df = pd.DataFrame(ans,columns = ['PID','autherName'])
df['autherName'] = df['autherName'].apply(lambda x:x.lower())
df = df.groupby('PID')['autherName'].agg(list).reset_index()
pid_auther_dict = dict(df.values)

# --- co-author overlap between each paper and the author's other papers ------
# Results are keyed by row label and then laid out in `data`'s own row order,
# so they stay aligned even if an author's rows are not contiguous.
pairs_by_row = {}
for autherID, pid_series in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = pid_series.to_list()
    # Build each paper's co-author set once per author instead of once per pair.
    author_sets = [set(pid_auther_dict.get(p,[])) for p in pid_list]
    for row_label, f1, x1 in zip(pid_series.index, pid_list, author_sets):
        pairs_by_row[row_label] = [
            [len(x1 & x2),len(x1 | x2)]
            for f2, x2 in zip(pid_list, author_sets) if f1 != f2
        ]
ans = [pairs_by_row[row_label] for row_label in data.index]

data['pid_sim'] = ans
data['pid_autherraw_cnt_1'] = data['pid_sim'].apply(lambda x:[f[0] for f in x])
data['pid_autherraw_cnt_2'] = data['pid_sim'].apply(lambda x:[f[1] for f in x])
# Smoothed Jaccard ratio: (|A & B| + 1) / (|A | B| + 1).
data['pid_autherraw_cnt_3'] = data['pid_sim'].apply(lambda x:[(f[0]+1) / (f[1] + 1) for f in x])

# Summary statistics (no median for this feature family). An author with a
# single paper has no pairs, which gives NaN instead of raising in np.max / np.min.
for f in ['pid_autherraw_cnt_1','pid_autherraw_cnt_2','pid_autherraw_cnt_3']:
    data[f + '_mean'] = data[f].apply(lambda x:np.mean(x) if len(x) > 0 else np.nan)
    data[f + '_max'] = data[f].apply(lambda x:np.max(x) if len(x) > 0 else np.nan)
    data[f + '_min'] = data[f].apply(lambda x:np.min(x) if len(x) > 0 else np.nan)
    data[f + '_std'] = data[f].apply(lambda x:np.std(x) if len(x) > 0 else np.nan)
data = data.drop(['autherName','pid_autherraw_cnt_1','pid_autherraw_cnt_2','pid_autherraw_cnt_3'],axis = 1)
data.to_feather('feats/autherName_raw_jaccard_sim.feather')


100%|██████████| 2957/2957 [00:00<00:00, 247368.40it/s]
100%|██████████| 15/15 [00:03<00:00,  4.21it/s]


In [26]:
123

123

In [27]:
def get_w2v_feature(data,col1,col2,emb_size,ext='',feature=None):
    """Train (or load) a Word2Vec model over `col2` tokens grouped by `col1`.

    Each `col1` group's `col2` values, cast to ``str``, form one sentence. The
    model is cached at ``w2v/{col1 joined by _}_{col2}_feature{ext}.model`` and
    is loaded instead of retrained whenever that file exists.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns in `col1` and the column `col2`. Not modified.
    col1 : list of str
        Grouping column(s).
    col2 : str
        Token column.
    emb_size : int
        Embedding dimension.
    ext : str
        Suffix used in the cache file name and in the feature column names.
    feature : list, optional
        List that the new feature column names are appended to. A fresh list is
        created when omitted, so names no longer pile up across calls in a
        shared default list.

    Returns
    -------
    tmp : pandas.DataFrame
        One row per group: `col1` plus the mean embedding of the group's tokens
        (all zeros if none of its tokens is in the model).
    feature : list of str
        The names of every feature column created (group-level, then token-level).
    data : pandas.DataFrame
        One row per distinct in-vocabulary token: `col2` plus its embedding.
    """
    if feature is None:
        feature = []
    print('begin train word2vec')
    # Work on a copy so the cast below neither touches the caller's frame nor
    # assigns into a slice of it.
    data = data[col1 +[col2]].copy()
    data[col2] = data[col2].astype(str)
    tmp = data.groupby(col1)[col2].apply(lambda x:list(x)).reset_index()
    sentences = tmp[col2].values.tolist()
    print(tmp.head())
    del tmp[col2]

    model_path = 'w2v/{}_{}_feature{}.model'.format('_'.join(col1),col2,ext)
    if os.path.exists(model_path):
        model = Word2Vec.load(model_path)
    else:
        model = Word2Vec(sentences, vector_size=emb_size, window=5, min_count=1, workers=8, epochs=10, sg=1, seed=42)
        # The cache directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        model.save(model_path)

    print('begin make feature')
    emb_matrix = []
    emb_dict = {}
    for seq in sentences:
        vec = []
        for w in seq:
            # A model loaded from the cache may not know every token.
            if w in model.wv:
                vec.append(model.wv[w])
                emb_dict[w] = model.wv[w]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)

    # Group-level features: mean embedding per `col1` group.
    for i in range(emb_size):
        group_col = '{}_{}_emb_{}{}'.format('_'.join(col1), col2, i, ext)
        tmp[group_col] = emb_matrix[:, i]
        feature.append(group_col)
    del model, emb_matrix, sentences

    # Token-level features: one row per distinct token seen above.
    data_index = list(emb_dict)
    new_emb_martix = np.array([np.array(emb_dict[token]) for token in data_index])
    data = pd.DataFrame()
    data[col2] = data_index
    for i in range(emb_size):
        token_col = '{}_emb_{}_{}'.format(col2, i, ext)
        data[token_col] = new_emb_martix[:,i]
        feature.append(token_col)
    return tmp,feature,data



In [28]:
# Raw paper metadata, indexed by paper id.
with open('../IND-WhoIsWho/pid_to_info_all.json', 'r') as file:
    pid = json.load(file)

data = pd.concat([train,valid,test]).reset_index(drop = True)
data['keywords'] = data['PID'].apply(lambda x:pid[x]['keywords'])

# One row per (author, keyword) occurrence.
ans = []
# The loop variable is not called `pid`, so the metadata dict loaded just above
# is not replaced by the last author id.
for auther_id,keyword in tqdm(data[['autherID','keywords']].values):
    # Type check first: len() on a missing (non-list) value would raise.
    if isinstance(keyword, list) and len(keyword)>0:
        for key in keyword:
            ans.append([auther_id,key])
df = pd.DataFrame(ans,columns = ['autherID','keyword'])
df['keyword'] = df['keyword'].apply(lambda x:x.lower())

100%|██████████| 2957/2957 [00:00<00:00, 4953.01it/s]


In [29]:
# (group column, token column) pairs to embed.
emb_cols = [
    ['autherID', 'keyword'],
]
# Embedding size, kept in one place: it must match both the get_w2v_feature
# call and the number of mean-embedding columns written below.
KEYWORD_EMB_SIZE = 16

for f1,f2 in emb_cols:
    total_feature_1,feature,total_feature_2 = get_w2v_feature(df,[f1],f2,KEYWORD_EMB_SIZE,ext=str(KEYWORD_EMB_SIZE),feature=[])
    data = pd.merge(data,total_feature_1,how='left',on=[f1],copy=False)

# Per-author mean keyword embedding.
total_feature_1.to_feather('feats/w2v_feats/autherID_keyword_w2v_emb.feather')

# keyword -> embedding vector.
t_dict = dict(zip(total_feature_2['keyword'].to_list(),total_feature_2[total_feature_2.columns[1:]].values))

# Per-paper mean of its keywords' embeddings. Work on an explicit copy so the
# .loc assignments below do not write into a slice of `data`.
t_df = data[['autherID','PID','keywords']].copy()
has_keywords = t_df['keywords'].apply(lambda x:len(x)) !=0
t_df.loc[has_keywords,'keywords_emb'] = t_df.loc[has_keywords,'keywords'].apply(lambda x:np.mean([t_dict[f.lower()] for f in x],axis = 0))
emb_mean_cols = [f'keywords_auterID_emb_mean_{i}' for i in range(KEYWORD_EMB_SIZE)]
# Rows without keywords keep NaN in every embedding column.
t_df.loc[has_keywords,emb_mean_cols] = np.array(t_df.loc[has_keywords,'keywords_emb'].to_list())
t_df[['autherID','PID'] + emb_mean_cols].to_feather('feats/w2v_feats/keyword_autherID_w2v_emb_mean.feather')

begin train word2vec
   autherID                                            keyword
0  9Gs8Wj3Y  [bulk density, infiltration characteristics of...
1  C97iQ0Fj  [adults, brain neoplasms, magnetic resonance i...
2  Fkb16wn7  [visual memory, spatial information, computer ...
3  Iki037dt  [vesicoureteral reflux, bladder hypertrophy, c...
4  KKiBE172  [traditional pid control, self-adjusting syste...
begin make feature


In [30]:
# Reload the raw metadata: earlier cells use `pid` as a loop variable, which may
# have rebound the name to something other than the metadata dict.
with open('../IND-WhoIsWho/pid_to_info_all.json', 'r') as file:
    pid = json.load(file)
    
# NOTE(review): only train and test are concatenated here, while the other
# cells also include `valid` -- confirm this is intended.
data = pd.concat([train,test]).reset_index(drop = True)
data['venue'] = data['PID'].apply(lambda x:pid[x]['venue'])
data['venue'] = data['venue'].fillna('')
# NOTE(review): text_clean runs with its default mode here, so digits are kept,
# whereas the earlier venue cleaning passed mode='venue' -- confirm.
data['venue'] = data['venue'].apply(lambda x:text_clean(x))
# text_clean already lower-cases its input, so this repeats that step.
data['venue'] = data['venue'].apply(lambda x:x.lower())

In [31]:
# (group column, token column) pairs to embed.
emb_cols = [
    ['autherID', 'venue'],
    
    # ...
]

# 8-dimensional venue embeddings; each author's venues form one sentence.
for f1,f2 in emb_cols:
    total_feature_1,feature,total_feature_2 = get_w2v_feature(data,[f1],f2,8,ext='8',feature=[])

# Only the per-author mean embedding is saved; the per-venue table
# (total_feature_2) is not written in this cell.
total_feature_1.to_feather('feats/w2v_feats/autherID_venue_w2v_emb.feather')


begin train word2vec
   autherID                                              venue
0  9Gs8Wj3Y  [journal of soil and water conservation, journ...
1  Fkb16wn7  [eccv 7, computer vision and pattern recogniti...
2  Iki037dt  [annals of oncology, the journal of urology, u...
3  KKiBE172  [proceedings of the national academy of scienc...
4  WXMYBk3c  [chinese journal of clinicianselectronic editi...
begin make feature


In [32]:
123

123

In [33]:
data = pd.concat([train,valid,test]).reset_index(drop = True)

# Per-paper lists of co-author organisations and names. Work on an explicit
# copy so the new columns are not assigned into a slice of `piddf`.
tmp_df = piddf[['id','authors']].copy()
tmp_df['authors_org'] = tmp_df['authors'].apply(lambda x:[f['org'] for f in x])
tmp_df['authors_name'] = tmp_df['authors'].apply(lambda x:[f['name'] for f in x])
tmp_df.columns = ['PID','authors','authors_org','authors_name']
tmp_df = data[['PID','autherID']].merge(tmp_df,on = 'PID',how = 'left')
# The left merge leaves NaN for any PID absent from `piddf`; treat that as "no
# authors" instead of crashing while iterating a float.
tmp_df['authors_org'] = tmp_df['authors_org'].apply(lambda x:[f.lower() for f in x ] if isinstance(x, list) else [])
tmp_df['authors_name'] = tmp_df['authors_name'].apply(lambda x:[f.lower() for f in x ] if isinstance(x, list) else [])

# One row per (target author, paper, listed co-author).
ans = []
for auther,names,orgs,PID in tqdm(tmp_df[['autherID','authors_name','authors_org','PID']].values):
    # Type check first: len() on a non-list value would raise.
    if isinstance(names, list) and len(names)>0:
        # names and orgs come from the same author list, so they pair up by position.
        for name, org in zip(names, orgs):
            ans.append([auther,name,org,PID])

df = pd.DataFrame(ans,columns = ['autherID','authors_name','authors_org','PID'])
# Number of rows in which this co-author name occurs for this target author.
df['auther_authors_name_count'] = df.groupby(['autherID','authors_name'])['PID'].transform('count')
df['autherName'] = df['autherID'].map(dict(data[['autherID','autherName']].values))
df['autherName'] = df['autherName'].apply(lambda x:x.lower())


from pypinyin import pinyin, Style
def chinese2ping_name(x):
    """Romanise a name with pypinyin and move its first syllable to the end.

    NOTE: an identical definition appears in an earlier cell of this notebook.

    The syllables after the first are concatenated without separators, followed
    by a space and the first syllable (presumably the family name, which comes
    first in Chinese name order). For example '高阳' gives 'yang gao'.
    """
    syllables = pinyin(x,Style.NORMAL)
    leading = syllables[0][0]
    remainder = ''.join(part[0] for part in syllables[1:])
    return remainder + ' ' + leading


def is_contain_chinese(text):
    """Return True if `text` contains at least one character in U+4E00..U+9FA5.

    NOTE: an identical definition appears in an earlier cell of this notebook.
    """
    # The character class covers the code-point range used here to detect Chinese characters.
    return re.search(r'[\u4e00-\u9fa5]', text) is not None


# Hard-coded override: any target-author name containing '高阳' becomes 'yang gao'.
df.loc[df['autherName'].apply(lambda x:'高阳' in x),'autherName'] = 'yang gao'
# (A disabled, syntactically broken attempt to romanise short Chinese co-author
# names used to sit here as a commented-out line.)



# Strip '-' and ',' from both name columns; co-author names additionally get
# non-breaking spaces (\xa0) turned into ordinary spaces.
df['authors_name'] = df['authors_name'].apply(lambda x:x.replace('-','').replace(',',''))
df['autherName'] = df['autherName'].apply(lambda x:x.replace('-','').replace(',',''))
df['authors_name'] = df['authors_name'].apply(lambda x:x.replace('\xa0', ' ') )
# Rule 1: exact match. This creates the 'flag' column; rows not matched by any
# rule keep NaN, or 0 where rule 2 or 3 tested and rejected them.
df.loc[df['autherName'] == df['authors_name'],'flag'] = 1
# Rule 2: names longer than 2 characters with a space at index 1 (shape "x rest").
# flag becomes 1 if everything from index 2 on is a substring of the target name, else 0.
df.loc[(df['flag']!= 1) &(df['authors_name'].apply(lambda x:x[1] == ' ' if len(x)>2 else False)),'flag'] = df.loc[(df['flag']!= 1) &(df['authors_name'].apply(lambda x:x[1] == ' ' if len(x)>2 else False))].apply(lambda x:int(x['authors_name'][2:] in x['autherName']),axis = 1)
# Rule 3: same test for names with a '.' at index 1 (shape "x.rest").
df.loc[(df['flag']!= 1) &(df['authors_name'].apply(lambda x:x[1] == '.' if len(x)>2 else False)),'flag'] = df.loc[(df['flag']!= 1) &(df['authors_name'].apply(lambda x:x[1] == '.' if len(x)>2 else False))].apply(lambda x:int(x['authors_name'][2:] in x['autherName']),axis = 1)
# Rule 4: moving the first space-separated token to the end reproduces the target name.
df.loc[(df['flag']!= 1) &(df['authors_name'].apply(lambda x:' '.join(x.split(' ')[1:]+ x.split(' ')[0:1])) == df['autherName']),'flag'] = 1


100%|██████████| 2957/2957 [00:00<00:00, 3881.77it/s]


In [34]:

from collections import Counter
def swap_name_judge(x1,x2):
    """Decide whether two names are near-identical as bags of characters.

    Dots are ignored. The names match when their combined length exceeds 5 and
    the number of characters not shared between them (counted with
    multiplicity, in both directions) is under 20% of that combined length.
    Character order does not matter, so 'gao yang' matches 'yang gao'.

    Returns
    -------
    bool
    """
    left = Counter(x1.replace('.',''))
    right = Counter(x2.replace('.',''))

    # Counter subtraction keeps only positive surpluses, so the two directions
    # together give the size of the multiset symmetric difference.
    unmatched = sum((left - right).values()) + sum((right - left).values())
    combined_len = sum(left.values()) + sum(right.values())

    return combined_len > 5 and unmatched / combined_len < 0.2

# Final fallback: rows not yet flagged 1 get flag = 1 when swap_name_judge accepts the co-author / target name pair.
df.loc[(df['flag']!=1)&(df.apply(lambda x:swap_name_judge(x['authors_name'],x['autherName']),axis = 1)),'flag'] = 1


In [35]:
# Persist the per-(author, paper, co-author) table including the 'flag' column.
df.to_feather('temp_data/autherID_authors.feather')
# Uncomment to resume from the saved table instead of recomputing it:
#df = pd.read_feather('temp_data/autherID_authors.feather')
