In [1]:
import json


try:
    with open("galaxy_tools.json","r") as f:
        data = json.load(f)
    print("the data is loaded, the size is", len(data))
    
except FileNotFoundError:
    print("File not found")
except json.JSONDecodeError:
    print("Error: Could not decode JSON from 'galaxy_tools.json'.")



the data is loaded, the size is 7286


In [2]:
import pandas as pd


df = pd.read_csv('galaxy_tools.csv')


ID_COLUMN = 'id'
NAME_COLUMN = 'name'
DESCRIPTION_COLUMN = 'description'
SECTION_COLUMN = 'panel_section_name'
LABELS_COLUMN = 'labels'
TOPICS_COLUMN = 'edam_topics'


columns_to_process = [
    NAME_COLUMN,
    DESCRIPTION_COLUMN,
    SECTION_COLUMN,
    LABELS_COLUMN,
    TOPICS_COLUMN
]

for data in df[columns_to_process]:
    print(data if df[LABELS_COLUMN].notna else ''  )

df[columns_to_process] = df[columns_to_process].fillna(' ')




name
description
panel_section_name
labels
edam_topics


In [3]:
import ast

def create_text_string(row):
    
    name = row[NAME_COLUMN]
    description = row[DESCRIPTION_COLUMN]
    section = row[SECTION_COLUMN]
    labels_str = row[LABELS_COLUMN]
    topics_str = row[TOPICS_COLUMN]

    text_parts = []
    

    if name:
        text_parts.append(f"Tool: {name}.")
    if description:
        text_parts.append(f"Description: {description}.")
    if section:
        text_parts.append(f"Category: {section}.")


    if labels_str:
        try:

            labels_list = ast.literal_eval(labels_str)
            if isinstance(labels_list, list) and labels_list:
                text_parts.append(f"Labels: {', '.join(labels_list)}.")
        except (ValueError, SyntaxError):
            
            text_parts.append(f"Labels: {labels_str}.")

    if topics_str:
        try:
            topics_list = ast.literal_eval(topics_str)
            if isinstance(topics_list, list) and topics_list:
                text_parts.append(f"Topics: {', '.join(topics_list)}.")
        except (ValueError, SyntaxError):
            text_parts.append(f"Topics: {topics_str}.")

    return " ".join(text_parts)


In [4]:
new_text_String = df.apply(create_text_string,1)


print(new_text_String)

0       Tool: Upload File. Description: from your comp...
1       Tool: UCSC Main. Description: table browser. C...
2       Tool: UCSC Archaea. Description: table browser...
3       Tool: SRA. Description: server. Category: Get ...
4       Tool: EBI SRA. Description: ENA SRA. Category:...
                              ...                        
7281    Tool: Set External Metadata. Description:  . C...
7282    Tool: Export History. Description:  . Category...
7283    Tool: Export History to URI. Description:  . C...
7284    Tool: Import History. Description:  . Category...
7285       Tool: Data Fetch. Description:  . Category:  .
Length: 7286, dtype: object


In [5]:
data = {
    'id' : df[ID_COLUMN],
    'text_string': new_text_String
}



cleaned_df = pd.DataFrame(data)
cleaned_df.head


cleaned_df.to_csv('cleaned_data.csv',index=False)

In [6]:
import chromadb
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(cleaned_df['text_string'].to_list())



In [None]:
import math
from tqdm import tqdm

chroma_client = chromadb.Client()
collection = chroma_client.create_collection('Semantic_Search')


BATCH_SIZE = 4096 


num_documents = len(cleaned_df)


print(f"Adding {num_documents} documents in batches of {BATCH_SIZE}...")
for i in tqdm(range(0, num_documents, BATCH_SIZE)):

    end_index = i + BATCH_SIZE
    batch_embeddings = embeddings[i:end_index].tolist()
    batch_metadatas = [{"id": doc_id} for doc_id in cleaned_df['id'].iloc[i:end_index].tolist()]
    batch_documents = cleaned_df['text_string'].iloc[i:end_index].tolist()
    batch_ids = cleaned_df['id'].iloc[i:end_index].tolist()

    collection.add(
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        documents=batch_documents,
        ids=batch_ids
    )





InternalError: Collection [Galaxy_Tools] already exists

In [None]:
def print_formatted_answer(results):
    ids_list = results['ids'][0]
    docs_list = results['documents'][0]
    distances_list = results['distances'][0]

    for i in range(len(ids_list)):
        distance = distances_list[i]
        cosine_similarity = 1 - (distance**2 / 2)

        if cosine_similarity >= 0.5:
            print(f"Result {i+1}:")
            print(f"  - Tool ID: {ids_list[i]}")
            print(f"  - Content: {docs_list[i]}")
            print(f"  - Raw Distance: {distance:.4f}")
            print(f"  - Cosine Similarity: {cosine_similarity:.4f}")

            print("-" * 20)

            


In [None]:
query = "Align this novel DNA sequence against the human genome."
query_embedding = model.encode([query]).tolist()

results = collection.query(
    query_embeddings=query_embedding,
    n_results=10
)

print_formatted_answer(results)

Result 1:
  - Tool ID: toolshed.g2.bx.psu.edu/repos/iuc/progressivemauve/progressivemauve/2015_02_13.1
  - Content: Tool: progressiveMauve. Description: constructs multiple genome alignments. Category: Annotation.
  - Raw Distance: 0.7604
  - Cosine Similarity: 0.7109
--------------------
Result 2:
  - Tool ID: toolshed.g2.bx.psu.edu/repos/iuc/progressivemauve/progressivemauve/2015_02_13.0
  - Content: Tool: progressiveMauve. Description: constructs multiple genome alignments. Category: Annotation.
  - Raw Distance: 0.7604
  - Cosine Similarity: 0.7109
--------------------
Result 3:
  - Tool ID: toolshed.g2.bx.psu.edu/repos/iuc/miniprot/miniprot/0.13+galaxy0
  - Content: Tool: Miniprot align. Description: align a protein sequence against a genome with affine gap penalty, splicing and frameshift. Category: Annotation.
  - Raw Distance: 0.8321
  - Cosine Similarity: 0.6538
--------------------
Result 4:
  - Tool ID: toolshed.g2.bx.psu.edu/repos/iuc/miniprot/miniprot/0.18+galaxy0
  - Cont