In [3]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

# Step 1: Locate the plain-text textbooks that make up the search corpus
textbook_folder = '../data/textbooks/en'
text_files = list(filter(lambda name: name.endswith('.txt'), os.listdir(textbook_folder)))

# Step 2: Pick the compute device, then load the Sentence Transformer model onto it
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = SentenceTransformer('all-MiniLM-L6-v2')
model.to(device)

# Step 3: Build the corpus embeddings once and cache them on disk.
#
# After this block, `loaded_encoded_array` is a 2-D numpy array with one
# embedding row per textbook and `file_names[i]` names the file behind row i,
# regardless of whether the cache was just built or loaded from disk.
encoded_path = '../encoded/encoded_texts.npy'
file_names_path = '../encoded/encoded_file_names.npy'

if not os.path.exists(encoded_path):
    if not text_files:
        raise FileNotFoundError(f"No .txt files found in {textbook_folder}")

    encoded_texts = []
    for file_name in text_files:
        file_path = os.path.join(textbook_folder, file_name)
        # Read first, then encode, so the file handle is not held open
        # for the duration of the (slow) model call.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # NOTE(review): the model presumably truncates each text to its maximum
        # sequence length, so only the beginning of each textbook may be
        # represented -- confirm, and consider chunking the texts.
        encoded_texts.append(model.encode(text, convert_to_numpy=True))

    file_names = list(text_files)
    encoded_array = np.stack(encoded_texts)

    # Create the cache directory on first use; np.save does not create it.
    os.makedirs(os.path.dirname(encoded_path), exist_ok=True)
    np.save(encoded_path, encoded_array)
    # Persist the row -> file name mapping next to the embeddings so a later
    # directory listing (different order, added/removed files) cannot
    # silently misalign results.
    np.save(file_names_path, np.array(file_names))

    # Expose the fresh embeddings under the same name the cached path uses,
    # so downstream code works on the very first run too.
    loaded_encoded_array = encoded_array
else:
    # Load the cached embeddings
    loaded_encoded_array = np.load(encoded_path)
    if os.path.exists(file_names_path):
        file_names = np.load(file_names_path).tolist()
    else:
        # Cache written before file names were persisted: fall back to the
        # current directory listing, as the original code did.
        file_names = text_files


# Step 4: Put the corpus embeddings on the same device as the query embedding.
# Passing the raw numpy array would leave it on the CPU and fail on CUDA machines.
corpus_embeddings = torch.as_tensor(loaded_encoded_array).to(device)

# Each embedding row must map to exactly one file name; otherwise the cache is
# stale relative to the file list and the reported file would be wrong.
if corpus_embeddings.shape[0] != len(file_names):
    raise ValueError(
        f"Cached embeddings ({corpus_embeddings.shape[0]} rows) do not match the "
        f"number of text files ({len(file_names)}); delete the cache and re-run."
    )

# Step 5: Get user input and encode it
user_input = "What is Buprenorphine Mixture of above Buprenorphine and Naloxone?"
user_input_encoded = model.encode(user_input, convert_to_tensor=True).to(device)

# Step 6: Calculate cosine similarity between the query and every textbook;
# the result has one row (the query) and one column per textbook.
cosine_scores = util.pytorch_cos_sim(user_input_encoded, corpus_embeddings)

# Find the index of the most similar text (as a plain int, so it can index a list)
most_similar_index = cosine_scores[0].argmax().item()
most_similar_file = file_names[most_similar_index]
similarity_score = cosine_scores[0][most_similar_index].item()

# Report the best-matching file along with its similarity score
print(f"Most similar text is in file: {most_similar_file} with a similarity score of {similarity_score:.4f}")


Most similar text is in file: Pharmacology_Katzung.txt with a similarity score of 0.4889
