In [1]:
import pandas as pd
import numpy as np

import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,GroupKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import gc
import re
from sklearn.metrics import roc_auc_score
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
import polars as pl
from pathlib import Path
from glob import glob
import json
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_word2vec_feature(seq,emb,feat,ikx,ext='',prex = '',feature=[]):
    """Load the cached Word2Vec model for a feature, or train and cache one.

    Note: an identical function with the same name is defined again later in
    this notebook (cell In [8]); whichever cell ran last is the one in effect.

    Parameters
    ----------
    seq : iterable of iterables
        One token sequence per row; every token is converted with ``str``.
    emb : int
        Embedding size, passed to Word2Vec as ``vector_size``.
    feat : list of str
        Name parts joined with ``_`` into the cache file name.
    ikx : unused
        Kept so existing call sites keep working.
    ext, prex : str
        Extra suffix / prefix used in the cache file name.
    feature : unused
        Kept so existing call sites keep working (never read or mutated).

    Returns
    -------
    gensim.models.Word2Vec
        The loaded or freshly trained model.
    """
    model_path = 'w2v/w2v_model_{}_{}_{}.model'.format(prex,'_'.join(feat),ext)
    if os.path.exists(model_path):
        # The cache key is only the file name: if the input text changes, delete
        # the cached file, otherwise later vocabulary lookups can fail on unseen tokens.
        return Word2Vec.load(model_path)

    sentence = [[str(token) for token in tokens] for tokens in seq]
    # NOTE(review): gensim documents that `seed` alone is not enough for a fully
    # reproducible run when workers > 1 - confirm whether that matters here.
    model = Word2Vec(sentence, vector_size=emb, window=5, min_count=1, workers=8, epochs=10, sg=1, seed=42)
    # Create the cache directory on first use so saving does not fail.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    return model

def generate_w2v_feat(df,prex,col_name,dim):
    """Group ``col_name`` values per ``prex`` key and dump mean Word2Vec vectors.

    Note: this definition is replaced by a later ``generate_w2v_feat`` in this
    notebook (cell In [8]) before any call is made.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``prex`` and ``col_name``.
    prex : str
        Grouping column; also used in output column and file names.
    col_name : str
        Column whose values form the "sentence" of each group.
    dim : int
        Embedding size.

    Returns
    -------
    None
        The result is written to ``feats/w2v_{prex}_{col_name}_emb.pkl``.
    """
    # One row per key, holding the list of values observed for that key.
    grouped = df.groupby(prex)[col_name].apply(lambda values: list(values)).reset_index()
    w2v = get_word2vec_feature(
        grouped[col_name].values, dim, [prex, col_name], col_name,
        ext='{}'.format(dim), prex=prex, feature=[],
    )

    # Mean of the token vectors of every group.
    mean_vectors = []
    for tokens in tqdm(grouped[col_name].values):
        mean_vectors.append(np.mean([w2v.wv[str(tok)] for tok in tokens], axis=0))
    mean_vectors = np.array(mean_vectors)

    emb_columns = []
    for k in range(dim):
        column = '{}_{}_{}'.format(prex, col_name + '_emb_mean', k)
        grouped[column] = mean_vectors[:, k]
        emb_columns.append(column)

    joblib.dump(grouped[[prex, col_name] + emb_columns], 'feats/w2v_{}_{}_emb.pkl'.format(prex, col_name))

In [3]:
def tfidf_char(input_values, output_num, output_prefix, seed=1024):
    """Character n-gram TF-IDF reduced to ``output_num`` SVD components.

    Parameters
    ----------
    input_values : iterable of str
        Documents to vectorise.
    output_num : int
        Number of SVD components (and output columns).
    output_prefix : str
        Prefix of the generated column names.
    seed : int
        ``random_state`` of the SVD.

    Returns
    -------
    pandas.DataFrame
        Columns ``{output_prefix}_tfidf_char_{i}`` with a default integer index.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 4), analyzer="char_wb")
    sparse_matrix = vectorizer.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    components = pd.DataFrame(reducer.fit_transform(sparse_matrix))
    components.columns = [
        '{}_tfidf_char_{}'.format(output_prefix, idx) for idx in range(output_num)
    ]
    return components

def tfidf_word(input_values, output_num, output_prefix, seed=1024):
    """Word n-gram (1-4) sublinear TF-IDF reduced to ``output_num`` SVD components.

    Parameters
    ----------
    input_values : iterable of str
        Documents to vectorise.
    output_num : int
        Number of SVD components (and output columns).
    output_prefix : str
        Prefix of the generated column names.
    seed : int
        ``random_state`` of the SVD.

    Returns
    -------
    pandas.DataFrame
        Columns ``{output_prefix}_tfidf_word_{i}`` with a default integer index.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 4),sublinear_tf = True)
    sparse_matrix = vectorizer.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    components = pd.DataFrame(reducer.fit_transform(sparse_matrix))
    components.columns = [
        '{}_tfidf_word_{}'.format(output_prefix, idx) for idx in range(output_num)
    ]
    return components

def count2vec(input_values, output_num, output_prefix, seed=1024):
    """Word n-gram (1-4) counts reduced to ``output_num`` SVD components.

    Parameters
    ----------
    input_values : iterable of str
        Documents to vectorise.
    output_num : int
        Number of SVD components (and output columns).
    output_prefix : str
        Prefix of the generated column names.
    seed : int
        ``random_state`` of the SVD.

    Returns
    -------
    pandas.DataFrame
        Columns ``{output_prefix}_countvec_{i}`` with a default integer index.
    """
    counter = CountVectorizer(ngram_range=(1, 4))
    count_matrix = counter.fit_transform(input_values)

    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    components = pd.DataFrame(reducer.fit_transform(count_matrix))
    components.columns = [
        '{}_countvec_{}'.format(output_prefix, idx) for idx in range(output_num)
    ]
    return components


def get_tfidf(tmp,group_id, group_target, num):
    """Build word TF-IDF and count-vector SVD features next to the id column(s).

    Parameters
    ----------
    tmp : pandas.DataFrame
        Source frame; ``tmp[group_target]`` is passed to the vectorisers as the
        documents.
    group_id : str or list of str
        Identifier column(s) to carry over into the result.
    group_target : str
        Text column; also used as the prefix of the generated column names.
    num : int
        Number of SVD components for each vectoriser.

    Returns
    -------
    pandas.DataFrame
        ``group_id`` column(s) followed by the TF-IDF and count features, one
        row per row of ``tmp``, with a fresh 0..n-1 index.
    """
    tfidf_tmp1 = tfidf_word(tmp[group_target], num, group_target)
    count_tmp = count2vec(tmp[group_target], num, group_target)

    # The feature frames carry a default 0..n-1 index. concat(axis=1) aligns on
    # index labels, so the ids must use the same positional index or rows of a
    # filtered / re-indexed `tmp` would be mismatched and padded with NaN.
    ids = tmp[group_id].reset_index(drop=True)
    return pd.concat([ids, tfidf_tmp1,count_tmp], axis=1)

In [4]:
# Raw paper-info JSON. Read it explicitly as UTF-8 so the result does not depend
# on the platform's default text encoding.
# NOTE(review): `pid` is not used again in the visible part of the notebook.
with open('../IND-WhoIsWho/pid_to_info_all.json', 'r', encoding='utf-8') as file:
    pid = json.load(file)
train = pd.read_feather('data/train.feather')
valid = pd.read_feather('data/valid.feather')
test = pd.read_feather('data/test.feather')

# NOTE(review): joblib.load unpickles the file - only load trusted artefacts.
piddf = joblib.load('data/pid_df.pkl')

# All three splits stacked with a fresh 0..n-1 index.
data = pd.concat([train,valid,test]).reset_index(drop = True)

In [5]:
# Characters removed by text_clean (ASCII and full-width punctuation).
_TEXT_CLEAN_DROP_TABLE = str.maketrans(
    '', '', '-?=～—/_？）￥:#\\\'.》”^>$]}|+)、（&{`《,(%!“<’"】；【‘~*@…：，。[;'
)


def text_clean(x,mode = 1):
    """Lower-case a string, strip punctuation and collapse repeated spaces.

    Parameters
    ----------
    x : str
        Text to clean.
    mode : object
        When equal to ``'venue'`` all digit runs are removed as well; any other
        value leaves digits untouched.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are kept (only runs of two or
        more spaces are reduced to one).
    """
    # Removing single characters is order-independent, so one translate call
    # gives the same result as a chain of replace() calls.
    x = x.lower().translate(_TEXT_CLEAN_DROP_TABLE)

    if mode == 'venue':
        x = re.sub(r'\d+', '', x)

    # Collapse every run of spaces; a fixed number of replace('  ', ' ') passes
    # leaves double spaces behind for long runs (9+ spaces with three passes).
    return re.sub(r' {2,}', ' ', x)
# Titles and abstracts use the default cleaning.
for text_col in ['title', 'abstract']:
    piddf[text_col] = piddf[text_col].apply(text_clean)

# Venues: missing values become '' first, then digits are stripped as well.
piddf['venue'] = piddf['venue'].fillna('').apply(lambda x:text_clean(x,mode = 'venue'))

In [6]:
# Show the paper table after cleaning (pandas truncates the display).
piddf

Unnamed: 0,id,title,authors,abstract,keywords,venue,year
0,EgcXuw3e,fuzzy adaptive pid control of large erecting s...,"[{'name': 'Liang Li', 'org': 'Xi'an Research I...",in considering nonlinearity and uncertainty in...,"[Adaptive Control, Electro-Hydraulics, Erectin...",journal of theoretical and applied information...,2013
1,9NFbioNk,when is scene identification just texture reco...,"[{'name': 'Laura Walker Renninger', 'org': 'Ey...",subjects were asked to identify scenes after v...,"[categorization, computer vision]",vision research,2004
2,fai4LpQ3,a computational model for shape from texture,"[{'name': 'J Malik', 'org': 'Department of Ele...",shape from texture is best analysed in a twost...,"[image plane, shape]",ciba foundation symposium,1994
3,H5smGgSX,on the implicit assumptions of gans,"[{'name': 'Ke Li', 'org': ''}, {'name': 'Jiten...",generative adversarial nets gans have generate...,,arxiv learning,2018
4,8gAJkJa7,coupling visualization and data analysis for k...,"[{'name': 'Oliver Rübel', 'org': ''}, {'name':...",knowledge discovery from large and complex sci...,"[scientific data, scientific visualization, de...",iccs,2010
...,...,...,...,...,...,...,...
2952,40OWywB0,temporal variation of total gaseous mercury in...,"[{'name': 'Xinbin Feng', 'org': 'State Key Lab...",,,journal of geophysical research,2004
2953,HWscmrvG,total gaseous mercury emissions from mercuryen...,"[{'name': 'xinbin', 'org': 'chinese academy of...",,"[flux, natural source, emission, soil, mercury...",chinese journal of geochemistry,2006
2954,fHochnD0,total gaseous mercury in the atmosphere of gui...,"[{'name': 'Xinbin Feng', 'org': ''}, {'name': ...",four measurement campaigns were carried out to...,"[source, atmosphere, guiyang, mercury measurem...",the science of the total environment,2003
2955,zELKcnoE,total gaseous mercury emissions from soil in g...,"[{'name': 'Xinbin Feng', 'org': 'State Key Lab...",,,journal of geophysical research,2005


In [7]:
# Fresh positional index, lower-cased titles, and a 1-based row number column.
piddf = piddf.reset_index(drop=True)
piddf['title'] = piddf['title'].map(lambda title: title.lower())
piddf['index'] = piddf.index + 1


In [8]:
def get_word2vec_feature(seq,emb,feat,ikx,ext='',prex = '',feature=[]):
    """Load the cached Word2Vec model for a feature, or train and cache one.

    Note: this replaces the identical definition from cell In [2].

    Parameters
    ----------
    seq : iterable of iterables
        One token sequence per row; every token is converted with ``str``.
    emb : int
        Embedding size, passed to Word2Vec as ``vector_size``.
    feat : list of str
        Name parts joined with ``_`` into the cache file name.
    ikx : unused
        Kept so existing call sites keep working.
    ext, prex : str
        Extra suffix / prefix used in the cache file name.
    feature : unused
        Kept so existing call sites keep working (never read or mutated).

    Returns
    -------
    gensim.models.Word2Vec
        The loaded or freshly trained model.
    """
    model_path = 'w2v/w2v_model_{}_{}_{}.model'.format(prex,'_'.join(feat),ext)
    if os.path.exists(model_path):
        # The cache key is only the file name: if the input text changes, delete
        # the cached file, otherwise later vocabulary lookups can fail on unseen tokens.
        return Word2Vec.load(model_path)

    sentence = [[str(token) for token in tokens] for tokens in seq]
    # NOTE(review): gensim documents that `seed` alone is not enough for a fully
    # reproducible run when workers > 1 - confirm whether that matters here.
    model = Word2Vec(sentence, vector_size=emb, window=5, min_count=1, workers=8, epochs=10, sg=1, seed=42)
    # Create the cache directory on first use so saving does not fail.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    return model

def generate_w2v_feat(df,prex,col_name,dim):
    """Average the Word2Vec token vectors of every row and dump them to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prex`` and ``col_name``; each ``col_name`` value must
        already be an iterable of tokens. ``df`` itself is not modified.
    prex : str
        Key column kept in the output; also used in column and file names.
    col_name : str
        Column holding the token sequences.
    dim : int
        Embedding size.

    Returns
    -------
    None
        A frame with ``prex`` plus ``{prex}_{col_name}_emb_mean_{i}`` columns is
        written to ``w2v_feat/{prex}_{col_name}_emb.pkl``.
    """
    res = df.copy()
    model = get_word2vec_feature(res[col_name].values,dim,[prex,col_name],col_name,ext='{}'.format(dim),prex = prex,feature=[])

    emb_matrix = []
    for col in tqdm(res[col_name].values):
        tmp = [model.wv[str(seq)] for seq in col]
        # An empty token list has no mean; use a zero vector so every row has
        # `dim` values and the matrix stays rectangular.
        emb_matrix.append(np.mean(tmp,axis = 0) if tmp else np.zeros(dim, dtype=np.float32))
    # Explicit shape keeps an empty input frame two-dimensional.
    emb_matrix = np.array(emb_matrix).reshape(len(emb_matrix), dim)

    feature = ['{}_{}_{}'.format(prex,col_name + '_emb_mean',i) for i in range(dim)]
    for i, name in enumerate(feature):
        res[name] = emb_matrix[:,i]

    # Create the output directory on first use so the dump does not fail.
    os.makedirs('w2v_feat', exist_ok=True)
    joblib.dump(res[[prex] +feature ],'w2v_feat/{}_{}_emb.pkl'.format(prex,col_name))

In [9]:
# Split titles on single spaces. Values that are already lists are left alone so
# re-running this cell does not raise.
piddf['title'] = piddf['title'].apply(lambda x: x.split(' ') if isinstance(x, str) else x)


In [10]:
# 32-dimensional mean title embedding per paper id; the result is written to disk by generate_w2v_feat.
generate_w2v_feat(piddf[['id','title']],'id','title',32)

100%|██████████| 2957/2957 [00:00<00:00, 26731.38it/s]


In [11]:
# Lower-case and split abstracts on single spaces. Values that are already lists
# are left alone so re-running this cell does not raise.
piddf['abstract'] = piddf['abstract'].apply(
    lambda x: x.lower().split(' ') if isinstance(x, str) else x
)
# Only the key and text columns are needed (same pattern as the title cell).
generate_w2v_feat(piddf[['id','abstract']],'id','abstract',32)

100%|██████████| 2957/2957 [00:00<00:00, 3751.03it/s]


In [12]:
# Join each keyword list into one lower-cased string, then split it on single
# spaces, so multi-word keywords end up as separate tokens.
piddf['keywords'] = piddf['keywords'].apply(lambda kws: ' '.join(kws).lower().split(' '))
generate_w2v_feat(piddf,'id','keywords',32)

100%|██████████| 2957/2957 [00:00<00:00, 34748.16it/s]


In [13]:
# Split venues on single spaces. Values that are already lists are left alone so
# re-running this cell does not raise.
piddf['venue'] = piddf['venue'].apply(lambda x: x.split(' ') if isinstance(x, str) else x)
# Only the key and text columns are needed (same pattern as the title cell).
generate_w2v_feat(piddf[['id','venue']],'id','venue',32)

100%|██████████| 2957/2957 [00:00<00:00, 44515.20it/s]


In [14]:
# Rebuild the combined frame, as the other similarity cells do. This cell later
# adds columns to `data` and drops `autherName`, so reusing the global frame
# makes a second run fail.
data = pd.concat([train,valid,test]).reset_index(drop = True)
# NOTE(review): joblib.load unpickles the file - only load artefacts produced by this notebook.
temp = joblib.load('w2v_feat/id_title_emb.pkl')

# Column 0 is the paper id; the remaining columns are the embedding dimensions
# (the layout written by generate_w2v_feat).
pid_title_dict = dict(zip(temp['id'],temp[temp.columns[1:]].values))
# Fallback vector for papers that have no embedding row.
mean_vec = temp[temp.columns[1:]].values.mean(axis = 0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and every vector in a collection.

    Parameters
    ----------
    target : 1-D array-like
        Reference vector.
    embedding : sequence of 1-D array-likes
        Vectors to compare against ``target`` (same length); may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per entry of ``embedding``; an empty array when
        ``embedding`` is empty. A zero-norm vector gives ``nan`` because the
        denominator is 0.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    if embedding.size == 0:
        # Nothing to compare against; broadcasting `target` with an empty
        # array would raise instead of returning "no similarities".
        return np.empty(0)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# Similarity of every paper to the other papers listed under the same author.
# Results are keyed by row label so the final column is aligned with `data`
# even when an author's rows are not contiguous (appending per author and
# assigning positionally is only correct for contiguous author blocks).
sims_by_row = {}
for autherID, author_pids in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = author_pids.to_list()
    for row_label, f1 in zip(author_pids.index, pid_list):
        # Papers without an embedding row fall back to the global mean vector.
        x1 = pid_title_dict.get(f1,mean_vec)
        x2 = [pid_title_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with no other paper has nothing to compare against.
        sims_by_row[row_label] = list(cos_similarity(x1, x2)) if x2 else []
ans = [sims_by_row.get(row_label, []) for row_label in data.index]
data['title_w2v_sim'] = ans


# Summary statistics of each similarity list; an empty list gives NaN for every
# statistic (np.max / np.min raise on empty input).
summary_funcs = {'mean': np.mean, 'max': np.max, 'min': np.min, 'std': np.std, 'median': np.median}
for f in ['title_w2v_sim']:
    for suffix, func in summary_funcs.items():
        data[f + '_' + suffix] = data[f].apply(lambda x, func=func: func(x) if len(x) else np.nan)
data = data.drop(['autherName'],axis = 1)

# Create the output directory on first use so the write does not fail.
os.makedirs('feats', exist_ok=True)
data.to_feather('feats/title_w2v_sim.feather')

100%|██████████| 15/15 [00:01<00:00, 10.98it/s]


In [15]:
# Fresh combined frame for this feature family.
data = pd.concat([train, valid, test]).reset_index(drop=True)
temp = joblib.load('w2v_feat/id_abstract_emb.pkl')

# Everything after the first (id) column is an embedding dimension.
embedding_values = temp[temp.columns[1:]].values
pid_title_dict = dict(zip(temp['id'], embedding_values))
# Fallback vector for papers that have no embedding row.
mean_vec = embedding_values.mean(axis=0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and every vector in a collection.

    Parameters
    ----------
    target : 1-D array-like
        Reference vector.
    embedding : sequence of 1-D array-likes
        Vectors to compare against ``target`` (same length); may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per entry of ``embedding``; an empty array when
        ``embedding`` is empty. A zero-norm vector gives ``nan`` because the
        denominator is 0.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    if embedding.size == 0:
        # Nothing to compare against; broadcasting `target` with an empty
        # array would raise instead of returning "no similarities".
        return np.empty(0)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# Similarity of every paper to the other papers listed under the same author.
# Results are keyed by row label so the final column is aligned with `data`
# even when an author's rows are not contiguous (appending per author and
# assigning positionally is only correct for contiguous author blocks).
sims_by_row = {}
for autherID, author_pids in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = author_pids.to_list()
    for row_label, f1 in zip(author_pids.index, pid_list):
        # Papers without an embedding row fall back to the global mean vector.
        x1 = pid_title_dict.get(f1,mean_vec)
        x2 = [pid_title_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with no other paper has nothing to compare against.
        sims_by_row[row_label] = list(cos_similarity(x1, x2)) if x2 else []
ans = [sims_by_row.get(row_label, []) for row_label in data.index]
data['abstract_w2v_sim'] = ans


# Summary statistics of each similarity list; an empty list gives NaN for every
# statistic (np.max / np.min raise on empty input).
summary_funcs = {'mean': np.mean, 'max': np.max, 'min': np.min, 'std': np.std, 'median': np.median}
for f in ['abstract_w2v_sim']:
    for suffix, func in summary_funcs.items():
        data[f + '_' + suffix] = data[f].apply(lambda x, func=func: func(x) if len(x) else np.nan)
data = data.drop(['autherName'],axis = 1)

# Create the output directory on first use so the write does not fail.
os.makedirs('feats', exist_ok=True)
data.to_feather('feats/abstract_w2v_sim.feather')

100%|██████████| 15/15 [00:01<00:00, 10.84it/s]


In [16]:
# Fresh combined frame for this feature family.
data = pd.concat([train, valid, test]).reset_index(drop=True)
temp = joblib.load('w2v_feat/id_keywords_emb.pkl')

# Everything after the first (id) column is an embedding dimension.
embedding_values = temp[temp.columns[1:]].values
pid_title_dict = dict(zip(temp['id'], embedding_values))
# Fallback vector for papers that have no embedding row.
mean_vec = embedding_values.mean(axis=0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and every vector in a collection.

    Parameters
    ----------
    target : 1-D array-like
        Reference vector.
    embedding : sequence of 1-D array-likes
        Vectors to compare against ``target`` (same length); may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per entry of ``embedding``; an empty array when
        ``embedding`` is empty. A zero-norm vector gives ``nan`` because the
        denominator is 0.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    if embedding.size == 0:
        # Nothing to compare against; broadcasting `target` with an empty
        # array would raise instead of returning "no similarities".
        return np.empty(0)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# Similarity of every paper to the other papers listed under the same author.
# Results are keyed by row label so the final column is aligned with `data`
# even when an author's rows are not contiguous (appending per author and
# assigning positionally is only correct for contiguous author blocks).
sims_by_row = {}
for autherID, author_pids in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = author_pids.to_list()
    for row_label, f1 in zip(author_pids.index, pid_list):
        # Papers without an embedding row fall back to the global mean vector.
        x1 = pid_title_dict.get(f1,mean_vec)
        x2 = [pid_title_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with no other paper has nothing to compare against.
        sims_by_row[row_label] = list(cos_similarity(x1, x2)) if x2 else []
ans = [sims_by_row.get(row_label, []) for row_label in data.index]
data['keywords_w2v_sim'] = ans


# Summary statistics of each similarity list; an empty list gives NaN for every
# statistic (np.max / np.min raise on empty input).
summary_funcs = {'mean': np.mean, 'max': np.max, 'min': np.min, 'std': np.std, 'median': np.median}
for f in ['keywords_w2v_sim']:
    for suffix, func in summary_funcs.items():
        data[f + '_' + suffix] = data[f].apply(lambda x, func=func: func(x) if len(x) else np.nan)
data = data.drop(['autherName'],axis = 1)

# Create the output directory on first use so the write does not fail.
os.makedirs('feats', exist_ok=True)
data.to_feather('feats/keywords_w2v_sim.feather')

100%|██████████| 15/15 [00:01<00:00, 10.84it/s]


In [17]:
# Fresh combined frame for this feature family.
data = pd.concat([train, valid, test]).reset_index(drop=True)
temp = joblib.load('w2v_feat/id_venue_emb.pkl')

# Everything after the first (id) column is an embedding dimension.
embedding_values = temp[temp.columns[1:]].values
pid_title_dict = dict(zip(temp['id'], embedding_values))
# Fallback vector for papers that have no embedding row.
mean_vec = embedding_values.mean(axis=0)

def cos_similarity(target, embedding):
    """Cosine similarity between one vector and every vector in a collection.

    Parameters
    ----------
    target : 1-D array-like
        Reference vector.
    embedding : sequence of 1-D array-likes
        Vectors to compare against ``target`` (same length); may be empty.

    Returns
    -------
    numpy.ndarray
        One similarity per entry of ``embedding``; an empty array when
        ``embedding`` is empty. A zero-norm vector gives ``nan`` because the
        denominator is 0.
    """
    target = np.asarray(target)
    embedding = np.asarray(embedding)
    if embedding.size == 0:
        # Nothing to compare against; broadcasting `target` with an empty
        # array would raise instead of returning "no similarities".
        return np.empty(0)
    numerator = np.sum(target * embedding, axis=1)
    denominator = np.sqrt(np.sum(np.square(target)) * np.sum(np.square(embedding),axis=1))
    return numerator / denominator
# Similarity of every paper to the other papers listed under the same author.
# Results are keyed by row label so the final column is aligned with `data`
# even when an author's rows are not contiguous (appending per author and
# assigning positionally is only correct for contiguous author blocks).
sims_by_row = {}
for autherID, author_pids in tqdm(data.groupby('autherID', sort=False)['PID']):
    pid_list = author_pids.to_list()
    for row_label, f1 in zip(author_pids.index, pid_list):
        # Papers without an embedding row fall back to the global mean vector.
        x1 = pid_title_dict.get(f1,mean_vec)
        x2 = [pid_title_dict.get(f,mean_vec) for f in pid_list if f != f1]
        # An author with no other paper has nothing to compare against.
        sims_by_row[row_label] = list(cos_similarity(x1, x2)) if x2 else []
ans = [sims_by_row.get(row_label, []) for row_label in data.index]
data['venue_w2v_sim'] = ans


# Summary statistics of each similarity list; an empty list gives NaN for every
# statistic (np.max / np.min raise on empty input).
summary_funcs = {'mean': np.mean, 'max': np.max, 'min': np.min, 'std': np.std, 'median': np.median}
for f in ['venue_w2v_sim']:
    for suffix, func in summary_funcs.items():
        data[f + '_' + suffix] = data[f].apply(lambda x, func=func: func(x) if len(x) else np.nan)
data = data.drop(['autherName'],axis = 1)

# Create the output directory on first use so the write does not fail.
os.makedirs('feats', exist_ok=True)
data.to_feather('feats/venue_w2v_sim.feather')

100%|██████████| 15/15 [00:01<00:00, 10.85it/s]
