# Data processing

In [38]:
import pandas as pd

In [54]:
# Load the medication table. The first column of the CSV has no header
# (pandas would otherwise surface it as a stray "Unnamed: 0" column); it
# appears to be the row index saved by an earlier to_csv call, so read it
# back as the index instead of as data.
dataset_df = pd.read_csv('medications.csv', index_col=0)

# Preview the first 5 rows.
# NOTE(review): a notebook cell only renders its last expression, so this
# preview is not shown if it shares a cell with the type() check below.
dataset_df.head(5)

type(dataset_df)

pandas.core.frame.DataFrame

# Embedding

In [53]:
from sentence_transformers import SentenceTransformer

# Identifier of the pre-trained sentence-embedding checkpoint to load.
EMBEDDING_MODEL_NAME = "thenlper/gte-large"

# Instantiate the model once at module level; get_embedding() reads this
# global on every call.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain list of floats.

    Args:
        text: The text to embed. Values that are not strings (for example
            ``None`` or the ``NaN`` pandas uses for a missing CSV cell) are
            treated like empty text instead of raising ``AttributeError``
            on ``.strip()``.

    Returns:
        The result of ``embedding_model.encode(text)`` converted with
        ``.tolist()``, or an empty list when there is no text to embed.
    """
    # This function is applied element-wise to a DataFrame column, so it can
    # receive missing values; check the type before calling str methods.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    return embedding_model.encode(text).tolist()


# Embed every description element-wise; each cell of the new "embedding"
# column holds whatever get_embedding returns for that row (a list).
dataset_df["embedding"] = dataset_df["description"].map(get_embedding)

# Preview the frame with the new column
dataset_df.head()

list

# Vector querying

In [50]:
from sklearn.metrics.pairwise import cosine_similarity


def query_system(query, df, top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``query``.

    Args:
        query: Free-text query; embedded with ``get_embedding``.
        df: DataFrame with an ``embedding`` column holding one vector per
            row. It is left unmodified.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with the ``top_n`` most similar rows, sorted by
        descending score, with the score stored in a ``similarity`` column.

    Raises:
        ValueError: If ``query`` produces no embedding (empty or blank text).
    """
    # Get the embedding of the query
    query_embedding = get_embedding(query)
    if len(query_embedding) == 0:
        # get_embedding returns [] for blank text; fail here with a clear
        # message rather than inside cosine_similarity on a 0-feature vector.
        raise ValueError("Cannot run a similarity query with empty text.")

    # Compute the cosine similarity between the query and all embeddings in the dataframe
    similarities = cosine_similarity([query_embedding], df['embedding'].tolist())[0]

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # (or have overwritten) a 'similarity' column as a side effect.
    scored = df.assign(similarity=similarities)

    # Sort by similarity and return the top results
    return scored.sort_values(by='similarity', ascending=False).head(top_n)


# Run a sample retrieval over the embedded medication table
query = "What is a medication that cures osteoporosis or cures fatigue?"
top_results = query_system(query, df=dataset_df)

# Last expression of the cell: render the ranked matches
top_results

Unnamed: 0,name,id,description,embedding,similarity
12,Muchiol,13,"Cures osteoporosis. Side effects: dizziness, d...","[-0.0094228470697999, -0.013953697867691517, -...",0.921086
13,Pelsiplex,14,"Cures hypertension. Side effects: fatigue, nau...","[-0.0030195650178939104, 0.0028950369451195, -...",0.851831
9,Errox,10,"Cures pneumonia. Side effects: fatigue, nausea.","[0.008347771130502224, -0.022028645500540733, ...",0.845456
1,Glucose,2,"Cures fever. Side effects: fatigue, dry mouth.","[0.0032626676838845015, -0.01925014704465866, ...",0.837479
11,Kisuel,12,"Cures arthritis. Side effects: muscle pain, fe...","[-0.0027620354667305946, -0.0041565359570086, ...",0.836944


# RAG querying

In [51]:
from openai import OpenAI


def query_gemma(
    original_query: str,
    top_docs: pd.DataFrame,
    base_url: str = "http://localhost:1234/v1",
    api_key: str = "lm-studio",
    model: str = "local-model",
    temperature: float = 0.7,
):
    """Ask a chat model ``original_query`` with ``top_docs`` as context.

    Args:
        original_query: The user's question, sent as the user message.
        top_docs: DataFrame with ``name``, ``id`` and ``description``
            columns; each row becomes one line of the system prompt.
        base_url: Base URL passed to the ``OpenAI`` client.
        api_key: API key passed to the ``OpenAI`` client.
        model: Model name sent with the chat-completion request.
        temperature: Sampling temperature sent with the request.

    Returns:
        The completion object returned by
        ``client.chat.completions.create`` (not just the message text).
    """
    # One "Name/Id/Description" line per retrieved row, in row order.
    rows = zip(
        top_docs["name"].tolist(),
        top_docs["id"].tolist(),
        top_docs["description"].tolist(),
    )
    context = "".join(
        f"Name:{name},  Id: {doc_id}, Description: {description}\n"
        for name, doc_id, description in rows
    )

    # Echo the context so the reader can see what the model was given.
    print(context)

    client = OpenAI(base_url=base_url, api_key=api_key)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": f"{context}. Return the medication name on the first word of the response"},
            {"role": "user", "content": original_query}
        ],
        temperature=temperature,
    )

    return completion

In [52]:
# Send the same question to the chat model, grounded in the retrieved rows
temp = query_gemma(original_query=query, top_docs=top_results)

# Last expression of the cell: show the completion object that came back
temp

Name:Muchiol,  Id: 13, Description: Cures osteoporosis. Side effects: dizziness, dry mouth.
Name:Pelsiplex,  Id: 14, Description: Cures hypertension. Side effects: fatigue, nausea.
Name:Errox,  Id: 10, Description: Cures pneumonia. Side effects: fatigue, nausea.
Name:Glucose,  Id: 2, Description: Cures fever. Side effects: fatigue, dry mouth.
Name:Kisuel,  Id: 12, Description: Cures arthritis. Side effects: muscle pain, fever.


ChatCompletion(id='chatcmpl-8ahv4hs1p8m9xhplfj2p1', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The medication that cures osteoporosis is Muchiol, with the ID 13. It helps to cure osteoporosis by curing bone loss.', role='assistant', function_call=None, tool_calls=None))], created=1711974782, model='Loaded from Chat UI', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=26, prompt_tokens=26, total_tokens=52))