In [1]:
import os
import io
import json
import pandas as pd
import numpy as np
import torch
import joblib

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from collections import Counter

# Main text for embedding

In [4]:
# Load the cleaned paper dataset from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../dataset/SS/clean/filtered_data.pkl')

In [5]:
# Preview the first rows of the loaded dataframe.
df.head()

Unnamed: 0_level_0,title,abstract,relatedwork_text,main_text,rw_citations,full_citations
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,We first look at how text excerpts are extract...,Abstract. We present a novel summarization fra...,"[5965756, 1599046, 3177797, 444032, 6884774]","[1599046, 11055565, 16393334, 2481864, 5965756..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Self-training has been applied to several natu...,Annotating training data for event extraction ...,"[1398439, 7419156, 11187670, 8336242, 15894892...","[1398439, 7419156, 7579604, 11187670, 8336242,..."
1345235,Optimizing Sparse Matrix–Vector Product Comput...,Large-scale scientific applications frequently...,A variety of different data and computation re...,Large-scale scientific applications frequently...,"[1794629, 9580801, 120335238, 15090599, 208582...","[1794629, 9580801, 120335238, 15090599, 208582..."
2624639,Enhanced Chosen-Ciphertext Security and Applic...,We introduce and study a new notion of enhance...,ECCA is similar in spirit to coin-revealing se...,We introduce and study a new notion of enhance...,"[41204165, 19655, 235427, 226828, 3148885]","[443317, 41204165, 7113862, 10098664, 226828, ..."
146120525,ARSM: Augment-REINFORCE-Swap-Merge Estimator f...,To address the challenge of backpropagating th...,"For optimizing (1) for categorical z, the diff...",To address the challenge of backpropagating th...,"[3535369, 5859948, 1758804, 10756562, 19115634...","[121929631, 4043645, 5859948, 7195970, 1075656..."


In [8]:
# (rows, columns) of the loaded dataframe.
df.shape

(27417, 6)

# Document embedding using a pre-trained BERT model

In [9]:
# Load the pre-trained SentenceTransformer checkpoint used to embed the papers.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Override the model's maximum sequence length with 250.
# NOTE(review): presumably inputs longer than this are truncated, so only the
# beginning of each text would be embedded — confirm against the
# sentence-transformers documentation.
sbert_model.max_seq_length = 250

In [10]:
# Generate one embedding per paper from the 'main_text' column.
# NOTE(review): the variable name ends in '_abs', but the input here is
# 'main_text', not 'abstract'.
document_embeddings_abs = sbert_model.encode(df['main_text'].values)

In [11]:
# Shape of the array returned by encode().
document_embeddings_abs.shape

(27417, 768)

In [12]:
# Store the embeddings in a dataframe column, one vector per row.
#
# The column is assigned in a single step instead of writing through
# `df['BERT_embeddings'].iloc[i] = ...`: that chained assignment raises
# SettingWithCopyWarning and silently does nothing once pandas Copy-on-Write
# is enabled, because it writes into a temporary Series.
if len(document_embeddings_abs) != len(df):
    raise ValueError(
        f'Got {len(document_embeddings_abs)} embeddings for {len(df)} rows'
    )

# list(<2-D array>) yields one 1-D array per row; wrapping it in a Series that
# carries df's index makes the row alignment explicit.
df['BERT_embeddings'] = pd.Series(list(document_embeddings_abs), index=df.index)

In [13]:
# Preview the dataframe with the new 'BERT_embeddings' column.
df.head()

Unnamed: 0_level_0,title,abstract,relatedwork_text,main_text,rw_citations,full_citations,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,We first look at how text excerpts are extract...,Abstract. We present a novel summarization fra...,"[5965756, 1599046, 3177797, 444032, 6884774]","[1599046, 11055565, 16393334, 2481864, 5965756...","[-0.5292661, 0.96171594, 0.7239495, 0.09007428..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Self-training has been applied to several natu...,Annotating training data for event extraction ...,"[1398439, 7419156, 11187670, 8336242, 15894892...","[1398439, 7419156, 7579604, 11187670, 8336242,...","[-0.7070601, 0.5793047, 0.30019873, 0.451895, ..."
1345235,Optimizing Sparse Matrix–Vector Product Comput...,Large-scale scientific applications frequently...,A variety of different data and computation re...,Large-scale scientific applications frequently...,"[1794629, 9580801, 120335238, 15090599, 208582...","[1794629, 9580801, 120335238, 15090599, 208582...","[-0.72422814, 0.38941112, 0.50098884, -0.02656..."
2624639,Enhanced Chosen-Ciphertext Security and Applic...,We introduce and study a new notion of enhance...,ECCA is similar in spirit to coin-revealing se...,We introduce and study a new notion of enhance...,"[41204165, 19655, 235427, 226828, 3148885]","[443317, 41204165, 7113862, 10098664, 226828, ...","[-0.59993136, 0.84347814, 0.1958661, 0.2972666..."
146120525,ARSM: Augment-REINFORCE-Swap-Merge Estimator f...,To address the challenge of backpropagating th...,"For optimizing (1) for categorical z, the diff...",To address the challenge of backpropagating th...,"[3535369, 5859948, 1758804, 10756562, 19115634...","[121929631, 4043645, 5859948, 7195970, 1075656...","[-0.78408253, 0.022130227, 0.31313884, 0.46616..."


In [14]:
# (rows, columns) after adding the embedding column.
df.shape

(27417, 7)

In [15]:
# Cache the dataframe (including the embeddings) on disk. The train/test split
# section reloads this file, so it has to exist when the notebook is run from a
# fresh kernel. An existing cache is left untouched; delete the file to force a
# rewrite after the embeddings change.
embeddings_cache_path = '../dataset/SS/clean/bert_embeddings.pkl'
if not os.path.exists(embeddings_cache_path):
    df.to_pickle(embeddings_cache_path)

# Train/test split

In [41]:
# Reload the dataframe with the stored embeddings from the pickle cache and preview it.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../dataset/SS/clean/bert_embeddings.pkl')
df.head()

Unnamed: 0_level_0,title,abstract,relatedwork_text,main_text,rw_citations,full_citations,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,We first look at how text excerpts are extract...,Abstract. We present a novel summarization fra...,"[5965756, 1599046, 3177797, 444032, 6884774]","[1599046, 11055565, 16393334, 2481864, 5965756...","[-0.5292661, 0.96171594, 0.7239495, 0.09007428..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Self-training has been applied to several natu...,Annotating training data for event extraction ...,"[1398439, 7419156, 11187670, 8336242, 15894892...","[1398439, 7419156, 7579604, 11187670, 8336242,...","[-0.7070601, 0.5793047, 0.30019873, 0.451895, ..."
1345235,Optimizing Sparse Matrix–Vector Product Comput...,Large-scale scientific applications frequently...,A variety of different data and computation re...,Large-scale scientific applications frequently...,"[1794629, 9580801, 120335238, 15090599, 208582...","[1794629, 9580801, 120335238, 15090599, 208582...","[-0.72422814, 0.38941112, 0.50098884, -0.02656..."
2624639,Enhanced Chosen-Ciphertext Security and Applic...,We introduce and study a new notion of enhance...,ECCA is similar in spirit to coin-revealing se...,We introduce and study a new notion of enhance...,"[41204165, 19655, 235427, 226828, 3148885]","[443317, 41204165, 7113862, 10098664, 226828, ...","[-0.59993136, 0.84347814, 0.1958661, 0.2972666..."
146120525,ARSM: Augment-REINFORCE-Swap-Merge Estimator f...,To address the challenge of backpropagating th...,"For optimizing (1) for categorical z, the diff...",To address the challenge of backpropagating th...,"[3535369, 5859948, 1758804, 10756562, 19115634...","[121929631, 4043645, 5859948, 7195970, 1075656...","[-0.78408253, 0.022130227, 0.31313884, 0.46616..."


In [42]:
# Split the data into an 80% train set and a 20% test set.
# random_state is fixed so the split is the same on every run.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Manual labels

In [39]:
# Sample random paper pairs from the train set and the test set separately.
# The pairs are exported to CSV afterwards and labelled by hand.

PAIR_COLUMNS = ['paper1_id', 'paper2_id', 'title1', 'title2', 'abstract1', 'abstract2']


def sample_paper_pairs(papers, n_pairs, random_state=None):
    """Draw ``n_pairs`` random pairs of papers for manual labelling.

    Parameters
    ----------
    papers : pandas.DataFrame
        Frame indexed by paper id with 'title' and 'abstract' columns.
    n_pairs : int
        Number of pairs to draw.
    random_state : int or None
        Seed for the sampler. Pass an int to get the same pairs on every run.

    Returns
    -------
    pandas.DataFrame
        One row per pair, with the columns listed in ``PAIR_COLUMNS``.

    Notes
    -----
    The two rows of a pair are always different rows (each pair is drawn
    without replacement), but the same pair may be drawn more than once.
    """
    # A single generator is shared by all draws; seeding every draw with the
    # same integer would return the same pair each time.
    rng = np.random.RandomState(random_state)

    records = []
    for _ in range(n_pairs):
        pair = papers.sample(n=2, random_state=rng)
        # Use .iloc for positional access: `pair.title[0]` is a label lookup on
        # an integer index and a removed positional fallback on other indexes.
        records.append({
            'paper1_id': pair.index[0],
            'title1': pair['title'].iloc[0],
            'abstract1': pair['abstract'].iloc[0],
            'paper2_id': pair.index[1],
            'title2': pair['title'].iloc[1],
            'abstract2': pair['abstract'].iloc[1],
        })

    # Build the frame once from the collected records; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    return pd.DataFrame(records, columns=PAIR_COLUMNS)


N_train = 400
N_test = 100

# Fixed seed so the sampled pairs (and therefore any hand-made labels attached
# to them) can be regenerated.
# NOTE: pairs sampled by earlier, unseeded runs will not match these.
PAIR_SAMPLING_SEED = 1

df_label_train = sample_paper_pairs(df_train, N_train, random_state=PAIR_SAMPLING_SEED)
df_label_test = sample_paper_pairs(df_test, N_test, random_state=PAIR_SAMPLING_SEED + 1)

In [43]:
# Export the sampled train pairs for manual labelling and preview them.
# 'utf_8_sig' writes UTF-8 with a leading byte-order mark
# (presumably so spreadsheet tools detect the encoding — confirm).
df_label_train.to_csv('label_train.csv', encoding='utf_8_sig')
df_label_train.head()

Unnamed: 0,paper1_id,paper2_id,title1,title2,abstract1,abstract2
0,7828885,208310034,Predicting Continued Participation in Online H...,Decision Propagation Networks for Image Classi...,Online health forums provide advice and emotio...,"High-level (e.g., semantic) features encoded i..."
1,4837028,16155532,Where the Truth Lies: Explaining the Credibili...,Compadres: Lightweight support for distributed...,The web is a huge source of valuable informati...,Traditional design education relies heavily on...
2,18475456,17445278,Cross-VM Cache Attacks on AES,Decision and approximation complexity for iden...,Abstract-Cache based attacks can overcome soft...,An identifying code is a subset of vertices of...
3,202775562,70299115,Reading Like HER: Human Reading Inspired Extra...,Attacking Data Transforming Learners at Traini...,"In this work, we re-examine the problem of ext...",While machine learning systems are known to be...
4,4899384,182952605,Aladdin: Automating Release of Deep-Link APIs ...,Joint Semantic Domain Alignment and Target Cla...,Compared to the Web where each web page has a ...,Unsupervised domain adaptation aims to transfe...


In [42]:
# Export the sampled test pairs for manual labelling and preview them.
# 'utf_8_sig' writes UTF-8 with a leading byte-order mark
# (presumably so spreadsheet tools detect the encoding — confirm).
df_label_test.to_csv('label_test.csv', encoding='utf_8_sig')
df_label_test.head()

Unnamed: 0,paper1_id,paper2_id,title1,title2,abstract1,abstract2
0,53829365,5392739,Intra-class Variation Isolation in Conditional...,Grex: An Efficient MapReduce Framework for Gra...,Current state-of-the-art conditional generativ...,"In this paper, we present a new MapReduce fram..."
1,3743029,9713252,AspEm: Embedding Learning by Aspects in Hetero...,Self-updatable encryption with short public pa...,Heterogeneous information networks (HINs) are ...,Cloud storage is very popular since it has man...
2,8459419,30644086,Gazpacho and summer rash: lexical relationship...,A Co-Design Framework with OpenCL Support for ...,In this paper we investigate temporal patterns...,Energy efficiency is one of the most important...
3,11970283,6317007,Exploring Cities in Crime: Significant Concord...,Efficiency-Revenue Trade-offs in Auctions,"We present CoocViewer, a graphical analysis to...",Abstract. When agents with independent priors ...
4,2859455,16124390,Addressing GPU On-Chip Shared Memory Bank Conf...,Security and safety of assets in business proc...,Abstract One of the major problems with the GP...,Business processes and service compositions ar...


# Failed experiment (should be ignored)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
def cos_check(target, df):
    """Cosine similarity between one target vector and every paper in ``df``.

    Parameters
    ----------
    target : array-like
        Document vector of the target paper; it is flattened to a single row.
    df : pandas.DataFrame
        Frame with a 'BERT_embeddings' column holding one vector per row.

    Returns
    -------
    dict
        Maps each index label of ``df`` to the cosine similarity between
        ``target`` and that row's embedding. Empty when ``df`` has no rows.
    """
    if len(df) == 0:
        return {}

    # Reshape once, outside any loop: the target does not change per row.
    target = np.asarray(target).reshape(1, -1)

    # Stack all embeddings into one (n_papers, dim) matrix so the similarities
    # come from a single call instead of one call per row.
    related = np.vstack(
        [np.asarray(vec).reshape(1, -1) for vec in df['BERT_embeddings'].tolist()]
    )
    similarities = cosine_similarity(target, related)[0]

    return dict(zip(df.index, similarities))

def citation_choose(similarity_list, n, papers=None):
    """Collect the related-work citations of the ``n`` most similar papers.

    Parameters
    ----------
    similarity_list : dict
        Maps paper id to its cosine similarity with the target paper.
    n : int
        Number of most-similar papers to take citations from. If fewer
        candidates are available, all of them are used.
    papers : pandas.DataFrame, optional
        Frame indexed by paper id with an 'rw_citations' column of iterables.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    set
        Union of the 'rw_citations' of the selected papers.
    """
    if papers is None:
        papers = df  # fall back to the notebook-level dataframe

    ranked = sorted(similarity_list.items(), key=lambda x: x[1], reverse=True)

    # A top score above 0.95 is treated as the target paper itself and dropped.
    if ranked and ranked[0][1] > 0.95:
        ranked = ranked[1:]

    # Slicing (instead of indexing range(n)) tolerates n > len(ranked);
    # max() keeps a negative n equivalent to "take nothing".
    chosen = [paper_id for paper_id, _ in ranked[:max(n, 0)]]

    # Accumulate straight into a set rather than concatenating lists.
    result = set()
    for paper_id in chosen:
        result.update(papers.loc[paper_id, 'rw_citations'])

    return result

def check_recall(rw_list, db_list):
    """Fraction of the items in ``rw_list`` that also appear in ``db_list``.

    Parameters
    ----------
    rw_list : sequence
        Reference items (the paper's actual related-work citations).
    db_list : container
        Retrieved items; anything supporting ``in``.

    Returns
    -------
    float
        Recall in [0, 1]. An empty ``rw_list`` yields 0.0 instead of raising
        ZeroDivisionError.
    """
    if len(rw_list) == 0:
        return 0.0

    hits = sum(1 for x in rw_list if x in db_list)
    return hits / len(rw_list)

In [12]:
# Recall of the retrieved citations for each sampled paper.
score_list = []

# Evaluate on a fixed 5% sample of the papers.
data_sample = df.sample(frac=0.05, random_state=1)

for k, paper_id in enumerate(data_sample.index, start=1):
    # Similarity of this paper against every paper in the full dataframe.
    similarities = cos_check(data_sample.loc[paper_id, 'BERT_embeddings'], df)
    true_citations = data_sample.loc[paper_id, 'rw_citations']

    # Citations pooled from the 50 most similar papers.
    candidate_citations = citation_choose(similarities, 50)
    score_list.append(check_recall(true_citations, candidate_citations))

    # Progress message every 50 papers.
    if k % 50 == 0:
        print(f'{k} papers done!')

50 papers done!
100 papers done!
150 papers done!
200 papers done!
250 papers done!
300 papers done!
350 papers done!
400 papers done!
450 papers done!
500 papers done!
550 papers done!
600 papers done!
650 papers done!
700 papers done!
750 papers done!
800 papers done!
850 papers done!
900 papers done!
950 papers done!
1000 papers done!
1050 papers done!
1100 papers done!
1150 papers done!
1200 papers done!
1250 papers done!
1300 papers done!
1350 papers done!


In [13]:
# Average recall over all evaluated papers.
np.mean(score_list)

0.14066018187423074