# Imports 

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 0.0/86.0 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/86.0 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/86.0 kB ? eta -:--:--
     ------------- ------------------------ 30.7/86.0 kB 217.9 kB/s eta 0:00:01
     ------------------ ------------------- 41.0/86.0 kB 217.9 kB/s eta 0:00:01
     ------------------------------------ - 81.9/86.0 kB 353.1 kB/s eta 0:00:01
     ------------------------------------ - 81.9/86.0 kB 353.1 kB/s eta 0:00:01
     ------------------------------------ - 81.9/86.0 kB 353.1 kB/s eta 0:00:01
     -------------------------------------- 86.0/86.0 kB 254.9 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision (from sentence-transformers)
  Downloading torchvision-0.16.2-cp311-cp311-win_amd64.whl

# Reading the DataSet

In [1]:
import pandas as pd


df = pd.read_csv('article_titles.csv')
paper_texts = df['Titles'].to_list()

In [2]:
paper_texts[:5]

[nan,
 'Clinicopathological Features of Invasive Breast Cancer: A Five-Year Retrospective Study in Southern and South-Western Ethiopia.',
 'Exploration of T cell immune responses by expression of a dominant-negative SHP1 and SHP2.',
 'First insights into region-specific lipidome alterations of prefrontal cortex and hippocampus of mice exposed chronically to microcystins.',
 'Continuous Monitoring of Health and Mobility Indicators in Patients with Cardiovascular Disease: A Review of Recent Technologies.']

# Setting the model 

In [3]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time
import torch

if not torch.cuda.is_available():
  print("Warning: No GPU detected. Processing will be slow. Please add a GPU to this notebook")

model = SentenceTransformer('LaBSE')


print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(paper_texts[:200], show_progress_bar=True, convert_to_tensor=True)



###############################
print("Corpus loaded with {} sentences / embeddings".format(len(paper_texts[:200])))



  from .autonotebook import tqdm as notebook_tqdm


Encode the corpus. This might take a while


Batches: 100%|██████████| 7/7 [01:19<00:00, 11.39s/it]


Corpus loaded with 200 sentences / embeddings


# Search Function from the corpus

In [4]:
# Function that performs the search in the corpus and retrieve the results
def search(inp_question):
    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings)
    end_time = time.time()
    hits = hits[0]  #Getting the hit of the first query

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time-start_time))
    for hit in hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['score'], paper_texts[hit['corpus_id']]))



#  Test 1 search English

In [5]:
search("What is the effect of Vitanim D on covid 19?")

Input question: What is the effect of Vitanim D on covid 19?
Results (after 0.633 seconds):
	0.481	Immunologic and vascular biomarkers of mortality in critical COVID-19 in a South African cohort.
	0.480	Critical COVID-19 patients through first, second, and third wave: retrospective observational study comparing outcomes in intensive care unit.
	0.467	Acute kidney disease following COVID-19 vaccination: a single-center retrospective study.
	0.436	Vitamin D status in ANCA-associated vasculitis.
	0.433	Preconception vitamin D intake and obstetric outcomes in women using assisted reproductive technology: the Japan Environment and Children's Study.
	0.429	Which vertebral level should be used to calculate sarcopenia in covid-19 patients? A systematic review and meta-analysis.
	0.404	Laboratory Findings and Biomarkers in Long COVID: What Do We Know So Far? Insights into Epidemiology, Pathogenesis, Therapeutic Perspectives and Challenges.
	0.397	Vitamin D deficiency promoting non-24 h sleep-wa

# Test 2 search German

In [6]:
search("Welche Wirkung hat Vitamin D auf Covid-19?")

Input question: Welche Wirkung hat Vitamin D auf Covid-19?
Results (after 0.314 seconds):
	0.490	Vitamin D status in ANCA-associated vasculitis.
	0.481	Immunologic and vascular biomarkers of mortality in critical COVID-19 in a South African cohort.
	0.467	Acute kidney disease following COVID-19 vaccination: a single-center retrospective study.
	0.455	Critical COVID-19 patients through first, second, and third wave: retrospective observational study comparing outcomes in intensive care unit.
	0.433	Vitamin D deficiency promoting non-24 h sleep-wake disorder: a case report.
	0.415	Implication of KDM6A in bladder cancer.
	0.410	Which vertebral level should be used to calculate sarcopenia in covid-19 patients? A systematic review and meta-analysis.
	0.409	Preconception vitamin D intake and obstetric outcomes in women using assisted reproductive technology: the Japan Environment and Children's Study.
	0.398	Laboratory Findings and Biomarkers in Long COVID: What Do We Know So Far? Insights i

# Saving and loading the model 

In [7]:
# Save the model
model_save_path = 'model_directory'
os.makedirs(model_save_path, exist_ok=True)
model.save(os.path.join(model_save_path, 'sentence_transformer_model'))

# Save the corpus embeddings
embeddings_save_path = 'model_directory/corpus_embeddings.pth'
torch.save(corpus_embeddings, embeddings_save_path)

print("Model and embeddings saved successfully.")


Model and embeddings saved successfully.


In [9]:
!pip install pyngrok

