In [1]:
import os
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
print(torch.__version__)

1.10.0


In [4]:
# Pick the compute device. `device` is always bound, so the notebook also
# runs on a machine without a GPU instead of leaving the name undefined.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Cuda Device Available")
    print("Name of the Cuda Device: ", torch.cuda.get_device_name())
    print("GPU Computational Capablity: ", torch.cuda.get_device_capability())
else:
    device = torch.device("cpu")
    print("Cuda Device Not Available, using CPU")

Cuda Device Available
Name of the Cuda Device:  GeForce GTX 1660 Ti
GPU Computational Capablity:  (7, 5)


In [26]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook stays runnable on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
df = pd.read_pickle('../dataset/SS/clean/filtered_data.pkl')

In [7]:
df.head()

Unnamed: 0_level_0,title,abstract,full_text,body_text,related_work_text,related_work_pairs,body_pairs
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
214802675,Machine Translation Pre-training for Data-to-T...,While there is a large body of research studyi...,While there is a large body of research studyi...,Data-to-Text refers to the process of generati...,Earlier work on NLG was mainly studied rulebas...,{'739696': 'Earlier work on NLG was mainly stu...,{'16946362': 'Data-to-Text refers to the proce...
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,Abstract. We present a novel summarization fra...,Online reviews of products and services are an...,We first look at how text excerpts are extract...,{'2481864': 'Different forms of the excerpts i...,{'13908471': 'Many automatic systems were buil...
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Annotating training data for event extraction ...,The goal of event extraction is to identify in...,Self-training has been applied to several natu...,"{'15894892': 'For event extraction, there are ...",{'15134572': 'We use a state-of-the-art Englis...
16102392,Understanding Diversity – The Impact of Person...,"Technology is becoming increasingly automated,...","Technology is becoming increasingly automated,...","On the one hand, many people publish and discu...",The effect of personality on technology accept...,"{'6742907': 'Especially, the use of social net...","{'14580473': 'In the beginning, research on te..."
7205083,Overcoming bias to learn about controversial t...,Deciding whether a claim is true or false ofte...,Deciding whether a claim is true or false ofte...,The World Wide Web has become one of the prima...,Understanding which documents people read is r...,{'154693902': 'Researchers have studied variou...,{'143523312': 'Cognitive biases and their effe...


In [8]:
df.shape

(22866, 7)

# Document embedding using a pre-trained BERT model

In [10]:
# Load the pre-trained Sentence-BERT checkpoint (its files are downloaded on first use).
# NOTE(review): the model card for 'bert-base-nli-mean-tokens' reportedly marks
# this checkpoint as deprecated — confirm and consider a newer one.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Set the model's maximum input sequence length to 250.
# Presumably longer abstracts are truncated to this many tokens — verify
# against the sentence-transformers documentation.
sbert_model.max_seq_length = 250

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Encode every abstract in `df` with the SBERT model; the result holds one
# embedding row per abstract, in the same order as the dataframe rows.
document_embeddings_abs = sbert_model.encode(df['abstract'].values)

In [15]:
document_embeddings_abs.shape

(22866, 768)

In [16]:
# Store one embedding vector per paper in a new object column.
# A single whole-column assignment replaces the former per-row chained write
# (`df[col].iloc[i] = ...`), which triggers SettingWithCopyWarning and does
# not modify `df` at all when pandas copy-on-write is enabled.
# `list(...)` splits the 2-D array into one row vector per paper, and the
# explicit index keeps row i of the embeddings aligned with row i of `df`.
df['BERT_embeddings'] = pd.Series(list(document_embeddings_abs), index=df.index)

In [21]:
df = df.loc[:, ["title", "abstract", "related_work_pairs", "body_pairs", "BERT_embeddings"]]

In [49]:
df.head()

Unnamed: 0_level_0,title,abstract,related_work_pairs,body_pairs,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
214802675,Machine Translation Pre-training for Data-to-T...,While there is a large body of research studyi...,{'739696': 'Earlier work on NLG was mainly stu...,{'16946362': 'Data-to-Text refers to the proce...,"[-0.89564645, 0.8250809, 0.20701012, 0.2086090..."
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,{'2481864': 'Different forms of the excerpts i...,{'13908471': 'Many automatic systems were buil...,"[-0.5633038, 0.9584747, 0.4700373, 0.031486005..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,"{'15894892': 'For event extraction, there are ...",{'15134572': 'We use a state-of-the-art Englis...,"[-0.61391443, 0.5914316, 0.24405357, 0.4846297..."
16102392,Understanding Diversity – The Impact of Person...,"Technology is becoming increasingly automated,...","{'6742907': 'Especially, the use of social net...","{'14580473': 'In the beginning, research on te...","[-0.4477767, 0.46768314, 1.118528, 0.33247223,..."
7205083,Overcoming bias to learn about controversial t...,Deciding whether a claim is true or false ofte...,{'154693902': 'Researchers have studied variou...,{'143523312': 'Cognitive biases and their effe...,"[-0.110704, 0.6546117, 0.49069276, 0.3391293, ..."


In [56]:
df.shape

(22866, 5)

In [23]:
# df.to_pickle('../dataset/SS/clean/bert_embeddings.pkl')

# Cosine similarity

In [2]:
df = pd.read_pickle('../dataset/SS/clean/bert_embeddings.pkl')

In [3]:
df.head()

Unnamed: 0_level_0,title,abstract,related_work_pairs,body_pairs,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
214802675,Machine Translation Pre-training for Data-to-T...,While there is a large body of research studyi...,{'739696': 'Earlier work on NLG was mainly stu...,{'16946362': 'Data-to-Text refers to the proce...,"[-0.89564645, 0.8250809, 0.20701012, 0.2086090..."
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,{'2481864': 'Different forms of the excerpts i...,{'13908471': 'Many automatic systems were buil...,"[-0.5633038, 0.9584747, 0.4700373, 0.031486005..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,"{'15894892': 'For event extraction, there are ...",{'15134572': 'We use a state-of-the-art Englis...,"[-0.61391443, 0.5914316, 0.24405357, 0.4846297..."
16102392,Understanding Diversity – The Impact of Person...,"Technology is becoming increasingly automated,...","{'6742907': 'Especially, the use of social net...","{'14580473': 'In the beginning, research on te...","[-0.4477767, 0.46768314, 1.118528, 0.33247223,..."
7205083,Overcoming bias to learn about controversial t...,Deciding whether a claim is true or false ofte...,{'154693902': 'Researchers have studied variou...,{'143523312': 'Cognitive biases and their effe...,"[-0.110704, 0.6546117, 0.49069276, 0.3391293, ..."


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Quick sanity check of cosine_similarity on two hand-made row vectors;
# each input is built directly as a 2-D array with a single row.
x = np.array([[1, 2, 3]])
y = np.array([[4, 5, 6]])
similarity_matrix = cosine_similarity(x, y)
print(similarity_matrix)

[[0.97463185]]


In [5]:
def cos_check(target, df):
    """Compute the cosine similarity between one paper and every paper in ``df``.

    Parameters
    ----------
    target : array-like
        Document vector of the target paper; it is flattened to a single row.
    df : pandas.DataFrame
        Frame whose ``'BERT_embeddings'`` column holds one document vector
        per row.

    Returns
    -------
    dict
        Maps each index label of ``df`` to the cosine similarity between
        ``target`` and that row's embedding. Empty when ``df`` has no rows.
    """
    embeddings = df['BERT_embeddings'].tolist()
    if not embeddings:
        # Nothing to compare against; np.vstack would reject an empty list.
        return {}

    # The target row is loop-invariant, so it is reshaped exactly once.
    target_row = np.asarray(target).reshape(1, -1)

    # Stack all stored vectors into one (n_papers, dim) matrix so that a
    # single batched call replaces one cosine_similarity call per row.
    matrix = np.vstack([np.asarray(vec).reshape(1, -1) for vec in embeddings])
    similarities = cosine_similarity(target_row, matrix)[0]

    return dict(zip(df.index, similarities))

In [6]:
output = cos_check(df.loc['859921', 'BERT_embeddings'], df)

In [7]:
def citation_choose(similarity_list, n):
    """Pick the ``n`` papers most similar to the target paper.

    Parameters
    ----------
    similarity_list : dict
        Maps paper id to its cosine similarity with the target paper.
    n : int
        Maximum number of papers to return.

    Returns
    -------
    list
        Paper ids ordered from most to least similar. When the top-ranked
        entry has a similarity of (approximately) 1 it is treated as the
        target paper itself and dropped. Fewer than ``n`` ids are returned
        when not enough candidates are available.
    """
    # Floating-point cosine similarity of a vector with itself is frequently
    # 0.99999994 or 1.0000001 rather than exactly 1, so compare with a tolerance.
    self_match_tol = 1e-6

    ranked = sorted(similarity_list.items(), key=lambda item: item[1], reverse=True)
    if ranked and abs(ranked[0][1] - 1) <= self_match_tol:
        ranked = ranked[1:]

    # Slicing never raises when n exceeds the number of candidates;
    # max(n, 0) keeps a negative n returning an empty list, as range(n) did.
    return [paper_id for paper_id, _ in ranked[:max(n, 0)]]

In [8]:
mylist = citation_choose(output, 50)

In [9]:
df.loc[df.index.isin(mylist)].head()

Unnamed: 0_level_0,title,abstract,related_work_pairs,body_pairs,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17206181,Transliteration Alignment,"This paper studies transliteration alignment, ...",{'1693404': 'Another technique is to directly ...,{'219895': 'It is also equally applicable for ...,"[-0.3754018, 1.2045099, 0.19640552, 0.23203138..."
32035477,A machine learning approach for result caching...,A commonly used technique for improving search...,"{'34943': 'Baeza-Yates, Saint-Jean, and de Mou...",{'5409120': 'A commonly used technique for imp...,"[-0.7328437, 0.6732816, 0.9505062, 0.24854696,..."
52139212,Maximizing Synchronization for Aligning Observ...,Abstract. Conformance checking is a branch of ...,{'14197721': 'One of the earliest works in con...,{'114711181': 'An optimal alignment using the ...,"[-0.5873668, 0.4165166, 0.6609021, 0.21944438,..."
196177814,Optimal Transport-based Alignment of Learned C...,String similarity models are vital for record ...,{'5245012': 'We note that the most relevant pr...,{'5200733': 'We evaluate the impact of using S...,"[-0.6364662, 0.7926643, 0.6266486, 0.03339096,..."
34772098,Compressing Inverted Files using Modified LZW,"In the paper, we present a compression algorit...",{'14763522': 'The most well-known methods for ...,{'10680358': 'Inverted files are considered to...,"[-0.66968054, 0.90823436, 0.73623395, 0.333349..."


In [45]:
list(df.loc[df.index.isin(mylist)].iloc[0]['related_work_pairs'].keys())

['411571',
 '1479843',
 '14295281',
 '8681118',
 '1118305',
 '207184945',
 '19258679',
 '11861526']

In [10]:
from collections import Counter

In [11]:
def get_citations(papers, top_k=20, data=None):
    """Count how often each cited paper appears across the given papers.

    Parameters
    ----------
    papers : iterable
        Index labels of the papers whose citations are collected.
    top_k : int or None, optional
        Number of most frequent citations to return (default 20, the
        previously hard-coded value). ``None`` returns all of them.
    data : pandas.DataFrame, optional
        Frame whose ``'related_work_pairs'`` column holds a dict keyed by
        cited paper id. Defaults to the notebook-level ``df``.

    Returns
    -------
    list of tuple
        ``(cited_paper_id, count)`` pairs, most frequent first.
    """
    # Fall back to the notebook-level dataframe so existing calls keep working.
    frame = df if data is None else data

    counts = Counter()
    for paper in papers:
        # Only the dict keys (the cited paper ids) are counted.
        counts.update(frame.loc[paper, 'related_work_pairs'].keys())

    return counts.most_common(top_k)

In [12]:
get_citations(mylist)

[('10743717', 2),
 ('799275', 2),
 ('1001779', 2),
 ('14209216', 2),
 ('6132980', 2),
 ('5724860', 2),
 ('631855', 2),
 ('14838925', 2),
 ('6334682', 1),
 ('2710961', 1),
 ('10397964', 1),
 ('10617892', 1),
 ('228061', 1),
 ('207168261', 1),
 ('2420504', 1),
 ('15363885', 1),
 ('209396626', 1),
 ('1658773', 1),
 ('8796014', 1),
 ('15917396', 1)]