In [1]:
import pandas as pd

import utils.text_processing as text_processing
import utils.subject_processing as subject_processing
import utils.summarizing as summarizing
import utils.ranking as ranking
import utils.files as files
import utils.constants as constants

In [2]:
# Load the paper collection from the location configured in constants.PAPERS.
# The cells below iterate it as a mapping of paper hash -> record and read each
# record's "title" field.
# Note: the subject-processing cell further down saves `all_papers` back to
# this same path, so the file on disk changes once that cell has run.
all_papers = files.load_json(constants.PAPERS)

In [None]:
# Paper summarization
#
# Summarize one hand-picked paper, identified by its hash in `all_papers`.
# NOTE(review): "BART" presumably selects the summarization model and
# "abstract" the text field to summarize — confirm in utils.summarizing.

candidate_hash = "d086e479e6c4f7b7911a1a8e8e61efe191f423b4"
summary = summarizing.summarize(candidate_hash, all_papers, "BART", "abstract")

# Leave the result as the cell's last expression so the notebook displays it.
summary

In [2]:
# Load the embeddings referenced by constants.STARSPACE_EMBEDDINGS.
# Both returned values are handed unchanged to the ranking and
# subject-processing calls in the cells below.
# NOTE(review): `dim` is presumably the embedding dimensionality — confirm in
# utils.text_processing.
embeddings, dim = text_processing.load_embeddings(constants.STARSPACE_EMBEDDINGS)

In [5]:
# Ranking by similarity
#
# For every paper in the collection, ask the ranking module for its best
# candidate and print the pair. The result is indexed positionally:
# element 0 is the candidate's hash, element 1 the reported similarity.

for paper_hash in all_papers:
    best_match = ranking.best_candidate_cosine_similarity(paper_hash, all_papers, embeddings, dim)
    candidate_hash, match_score = best_match[0], best_match[1]

    print("[paper]:\t", paper_hash)
    print("[title]:\t", all_papers[paper_hash]["title"])
    print("[similarity]:\t", match_score)
    print("[candidate]:\t", candidate_hash)
    print("[title]:\t", all_papers[candidate_hash]["title"])
    # Optional (disabled): also print a summary of the candidate's content.
    # print("[summary]:\t", summarizing.summarize(candidate_hash, all_papers, "BART", "content"), "\n")

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.8193671176405405
[candidate]:	 c168f7fbd0694ae7ada2431d36490e57384254d5
[title]:	 Fossil birds of old Gondwanaland: a comment on drifting continents and their passengers.
[paper]:	 b799530f5b8101ad48c7019da51075413de42bc0
[title]:	 Linguistically Motivated Large-Scale NLP with C&C and Boxer
[similarity]:	 0.8337309469994829
[candidate]:	 b79a2b87d3a844c96d296783debca2d90232c1a9
[title]:	 Integrating NLP Using Linked Data
[paper]:	 2178665d086bedfc047c3b28c9aa3ec44c9df264
[title]:	 The Weights of Dinosaurs
[similarity]:	 0.8196270514576705
[candidate]:	 6bb81d84f129d3293e7c8458a10f3fcfbe499bd0
[title]:	 The Evolution of Dinosaurs
[paper]:	 e6d5b3793e7b01f8f047f9e7a9e7022f33a27524
[title]:	 Dinosaur physiology. Evidence for mesothermy in dinosaurs
[similarity]:	 0.7943086225074854
[candidate]:	 6bb81d84f129d3293e7c8458a10f3fcfbe499bd0
[title]:	 The E

In [6]:
# Ranked candidates per paper, by cosine similarity.
#
# The ranking call yields (candidate hash, distance) pairs; the similarity
# printed below is derived as 1 - distance.
# NOTE(review): the literal 3 is presumably the number of candidates returned
# per paper and "content" the text field compared — confirm in utils.ranking.
for paper_hash in all_papers:
    ranked = ranking.rank_candidates_cosine_similarity(paper_hash, all_papers, 3, embeddings, dim, "content")

    for candidate_hash, cosine_distance in ranked:
        print("[paper]:\t", paper_hash)
        print("[title]:\t", all_papers[paper_hash]["title"])
        print("[similarity]:\t", 1 - cosine_distance)
        print("[candidate]:\t", candidate_hash)
        print("[title]:\t", all_papers[candidate_hash]["title"], "\n")

    # Visual separator between one paper's candidates and the next paper's.
    print('==================================================')

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.8193671176405405
[candidate]:	 c168f7fbd0694ae7ada2431d36490e57384254d5
[title]:	 Fossil birds of old Gondwanaland: a comment on drifting continents and their passengers. 

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.755927719986424
[candidate]:	 0193a1f82481f93662aa58bf198672eecd32c377
[title]:	 Four-winged dinosaurs from China 

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.723765031109274
[candidate]:	 6bb81d84f129d3293e7c8458a10f3fcfbe499bd0
[title]:	 The Evolution of Dinosaurs 

[paper]:	 b799530f5b8101ad48c7019da51075413de42bc0
[title]:	 Linguistically Motivated Large-Scale NLP with C&C and Boxer
[similarity]:	 0.8337309469994829
[candidate]:	 b79a2b87d3a8

In [7]:
# Ranking by subject-adjusted similarity
#
# Preparation step: both calls below are made for their side effects only
# (their return values are ignored).
# NOTE(review): update_paper_subjects presumably annotates the entries of
# `all_papers` in place and update_subject_embeddings presumably extends
# `embeddings` — confirm in utils.subject_processing.

subject_processing.update_paper_subjects(all_papers, "content")
subject_processing.update_subject_embeddings(embeddings, dim, "content")

# Persist the papers to constants.PAPERS — the same path they were loaded from,
# so the original file on disk is overwritten.
files.save_json(constants.PAPERS, all_papers)

In [8]:
# Ranked candidates per paper, using the subject-adjusted ranking function.
#
# The ranking call yields (candidate hash, distance) pairs; the similarity
# printed below is derived as 1 - distance.
# NOTE(review): the literal 3 is presumably the number of candidates returned
# per paper and "content" the text field compared — confirm in utils.ranking.
for paper_hash in all_papers:
    ranked = ranking.rank_candidates_cosine_similarity_adjusted(paper_hash, all_papers, 3, embeddings, dim, "content")

    for candidate_hash, adjusted_distance in ranked:
        print("[paper]:\t", paper_hash)
        print("[title]:\t", all_papers[paper_hash]["title"])
        print("[similarity]:\t", 1 - adjusted_distance)
        print("[candidate]:\t", candidate_hash)
        print("[title]:\t", all_papers[candidate_hash]["title"], "\n")

    # Visual separator between one paper's candidates and the next paper's.
    print('==================================================')

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.2551334073567928
[candidate]:	 0193a1f82481f93662aa58bf198672eecd32c377
[title]:	 Four-winged dinosaurs from China 

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.24902955593496845
[candidate]:	 6bb81d84f129d3293e7c8458a10f3fcfbe499bd0
[title]:	 The Evolution of Dinosaurs 

[paper]:	 34f28bf0dfe6880a7977202160149a9b178c279b
[title]:	 PACHYCEPHALOSAURlA, A NEW SUBORDER DINOSAURS OF ORNITHISCHIAN
[similarity]:	 0.24814228637121505
[candidate]:	 1183cc30f529331c1586398a7d1dc1c6ed9a39e5
[title]:	 The origin and early evolution of dinosaurs 

[paper]:	 b799530f5b8101ad48c7019da51075413de42bc0
[title]:	 Linguistically Motivated Large-Scale NLP with C&C and Boxer
[similarity]:	 0.25833920660041887
[candidate]:	 a97c7876ebf9ae6c468db92c3c6dc1c0be832192
[title]:	 B