# Load the ML model from file and create embeddings for the entire WoS (Web of Science) data set for use at search time

In [1]:
import os
from sentence_transformers import SentenceTransformer, util
import pandas as pd
# The sentence-transformer model is loaded from the current working directory.
model = SentenceTransformer(os.getcwd())

# The abstracts live in "<excel_filename>.xls"; later cells read its Abstract column.
excel_filename = "Abstract"
abstract_df = pd.read_excel(f"{excel_filename}.xls")

In [2]:
# Write embeddings to file.
# Every abstract is passed through str() before encoding. The file holds one
# embedding per line, and each value is followed by a comma (so every line
# ends with a trailing comma before the newline).
embeddings = model.encode(list(map(str, abstract_df.Abstract.to_list())))
with open("Abstract_embeddings.txt", 'w') as embeddings_file:
    embeddings_file.writelines(
        "".join(str(item) + "," for item in embedding) + "\n"
        for embedding in embeddings
    )

### Testing the search functionality based on embeddings loaded from file

In [3]:
# Load embeddings from file.
# Each line is split on commas; the last piece (whatever follows the final
# comma, i.e. the line ending) is dropped and the rest are parsed as floats.
embeddings = []
with open("Abstract_embeddings.txt", 'r') as embeddings_file:
    for line in embeddings_file:
        values = line.split(",")[:-1]
        embeddings.append(list(map(float, values)))

In [4]:
# Query text for the similarity search; find_n_most_similar also prints this
# module-level variable as "The abstract given".
abstract_to_search = """This bachelor’s thesis compiles EU regulations affecting product development of artificial intelligence solutions for healthcare. A literature review is performed on the regulations, and the relevant technologies and their development. The effects of these regulations are then considered in the context of a generic new product development process model.
AI is expected to bring many changes and new possibilities for the healthcare industry. The EU has recognized the potential of emerging AI technologies and their applications, as well as the risks associated with them. The general data protection regulation recently put in place by the EU to protect citizens’ privacy rights and data safety largely affects AI as a data driven technology. Regulations on medical devices are currently undergoing change as well, and affect most products placed on the market for healthcare purposes.
Dealing with regulations during development efficiently can help direct development efforts towards ideas with higher chances of success, reduce the time and costs required to demonstrate conformity, and prevent failure to comply and its legal consequences."""

# The query is encoded as a one-element list, with the same model that was
# loaded in the first cell.
abstract_to_search_embedded = model.encode([abstract_to_search])


def find_n_most_similar(abstract_to_search_embedded, embeddings_to_search, n = 5):
    """Print and return the positions of the n abstracts most similar to the query.

    Cosine similarity is computed between the query embedding and every
    embedding in ``embeddings_to_search``. Rows are then ranked from most to
    least similar; rows with equal similarity keep their original order.

    The function reads two module-level names: ``abstract_to_search`` (printed
    as the query text) and ``abstract_df`` (its ``Abstract`` column supplies
    the text printed for each match). Row ``i`` of ``embeddings_to_search`` is
    taken to correspond to row ``i`` of ``abstract_df.Abstract``.

    Args:
        abstract_to_search_embedded: Embedding of the query abstract, as
            returned by ``model.encode([text])`` (a single embedding).
        embeddings_to_search: Sequence of embeddings, one per abstract.
        n: Number of matches to report. ``0`` or any value larger than the
            number of embeddings means "all of them".

    Returns:
        list[int]: Row positions of the reported abstracts, most similar first.
    """
    # One similarity score per row. Converting each single-element result to a
    # float lets the rows be ranked with one sort instead of repeated
    # max()/index()/del passes over the lists.
    similarities = [
        float(util.cos_sim(abstract_to_search_embedded, embedding))
        for embedding in embeddings_to_search
    ]

    if n > len(embeddings_to_search) or n == 0:
        n = len(embeddings_to_search)

    abstracts_list = abstract_df.Abstract.to_list()

    # Rank row positions rather than abstract texts: looking a text up again
    # with list.index() returns the first occurrence, which reports the wrong
    # row whenever two abstracts are identical. sorted() is stable even with
    # reverse=True, so ties stay in original row order.
    ranked_rows = sorted(
        range(len(similarities)),
        key=lambda row: similarities[row],
        reverse=True,
    )

    most_similar_indexes = []
    print("The abstract given:")
    print(abstract_to_search)
    # max(n, 0) keeps a negative n returning no results instead of slicing
    # from the end of the ranking.
    for rank, row in enumerate(ranked_rows[:max(n, 0)], start=1):
        print("\nThe {}. most similar abstract:".format(rank))
        most_similar_indexes.append(row)
        print(abstracts_list[row])
    return most_similar_indexes

In [5]:
# Run the search with the default n; the function prints the matches and
# returns their positions, which are used below to look up paper metadata.
indexes = find_n_most_similar(abstract_to_search_embedded, embeddings)

The abstract given:
This bachelor’s thesis compiles EU regulations affecting product development of artificial intelligence solutions for healthcare. A literature review is performed on the regulations, and the relevant technologies and their development. The effects of these regulations are then considered in the context of a generic new product development process model.
AI is expected to bring many changes and new possibilities for the healthcare industry. The EU has recognized the potential of emerging AI technologies and their applications, as well as the risks associated with them. The general data protection regulation recently put in place by the EU to protect citizens’ privacy rights and data safety largely affects AI as a data driven technology. Regulations on medical devices are currently undergoing change as well, and affect most products placed on the market for healthcare purposes.
Dealing with regulations during development efficiently can help direct development efforts

In [6]:
# Full paper records are read from "<excel_filename>.xls".
# NOTE(review): the lookup below assumes these rows are in the same order as
# the abstracts file - confirm.
excel_filename = "WoS_All_Most_cited"
papers_df = pd.read_excel(f"{excel_filename}.xls")

def get_paper_by_index(df, indexes: list):
    """Return the rows of ``df`` at the given integer positions.

    Args:
        df: DataFrame holding one paper per row.
        indexes: Integer row positions to select, in the order they should
            appear in the result.

    Returns:
        A DataFrame with the selected rows, in the order given by ``indexes``.
    """
    # The positions come from the argument, not from a module-level variable,
    # so the function works for any list the caller passes in.
    return df.iloc[indexes]

In [7]:
get_paper_by_index(papers_df, indexes)

Unnamed: 0.1,Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,...,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),Web of Science Record
988,988,J,"Minssen, T; Gerke, S; Aboy, M; Price, N; Cohen, G",,,,"Minssen, Timo; Gerke, Sara; Aboy, Mateo; Price...",,,Regulatory responses to medical machine learning,...,Science Citation Index Expanded (SCI-EXPANDED)...,Social Sciences - Other Topics; Government & L...,XV0OD,34221415.0,"gold, Green Published",Y,N,2022-10-19,WOS:000734651600005,0
111,111,J,"Yu, KH; Beam, AL; Kohane, IS",,,,"Yu, Kun-Hsing; Beam, Andrew L.; Kohane, Isaac S.",,,Artificial intelligence in healthcare,...,Science Citation Index Expanded (SCI-EXPANDED)...,Engineering,GW4SN,31015651.0,,Y,N,2022-10-19,WOS:000446910800006,0
219,219,J,"Shamshirband, S; Fathi, M; Dehzangi, A; Chrono...",,,,"Shamshirband, Shahab; Fathi, Mahdis; Dehzangi,...",,,A review on deep learning approaches in health...,...,Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Medical Informatics,QE0TL,33259944.0,Bronze,Y,N,2022-10-19,WOS:000615920400008,0
1127,127,J,"Barricelli, BR; Casiraghi, E; Fogli, D",,,,"Barricelli, Barbara Rita; Casiraghi, Elena; Fo...",,,"A Survey on Digital Twin: Definitions, Charact...",...,Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Engineering; Telecommunications,KF9TR,,"gold, Green Submitted",Y,N,2022-10-19,WOS:000509585900122,0
631,631,J,"Kompa, B; Snoek, J; Beam, AL",,,,"Kompa, Benjamin; Snoek, Jasper; Beam, Andrew L.",,,Second opinion needed: communicating uncertain...,...,Science Citation Index Expanded (SCI-EXPANDED),Health Care Sciences & Services; Medical Infor...,PO5II,33402680.0,"gold, Green Published",Y,N,2022-10-19,WOS:000605202300001,0
