In [12]:
import json


try:
    with open("galaxy_tools.json","r") as f:
        data = json.load(f)
    print("the data is loaded, the size is", len(data))
    
except FileNotFoundError:
    print("File not found")
except json.JSONDecodeError:
    print("Error: Could not decode JSON from 'galaxy_tools.json'.")



the data is loaded, the size is 7286


In [13]:
import pandas as pd


df = pd.read_csv('galaxy_tools.csv')


ID_COLUMN = 'id'
NAME_COLUMN = 'name'
DESCRIPTION_COLUMN = 'description'
SECTION_COLUMN = 'panel_section_name'
LABELS_COLUMN = 'labels'
TOPICS_COLUMN = 'edam_topics'


columns_to_process = [
    NAME_COLUMN,
    DESCRIPTION_COLUMN,
    SECTION_COLUMN,
    LABELS_COLUMN,
    TOPICS_COLUMN
]

for data in df[columns_to_process]:
    print(data if df[LABELS_COLUMN].notna else ''  )

df[columns_to_process] = df[columns_to_process].fillna(' ')

# df[columns_to_process]


name
description
panel_section_name
labels
edam_topics


In [14]:
import ast

def create_text_string(row):
    """
    Combines relevant fields into a single, descriptive string.
    Adds labels like "Name:", "Description:" to provide context for the model.
    """
    name = row[NAME_COLUMN]
    description = row[DESCRIPTION_COLUMN]
    section = row[SECTION_COLUMN]
    labels_str = row[LABELS_COLUMN]
    topics_str = row[TOPICS_COLUMN]

    text_parts = []
    
    # Add name, description, and section
    if name:
        text_parts.append(f"Tool: {name}.")
    if description:
        text_parts.append(f"Description: {description}.")
    if section:
        text_parts.append(f"Category: {section}.")

    # Process labels and topics which might be string representations of lists
    # Example: "['Text', 'File Operations']"
    # We will clean this up to be more readable: "Text, File Operations"
    if labels_str:
        try:
            # Safely evaluate the string to get a list
            labels_list = ast.literal_eval(labels_str)
            if isinstance(labels_list, list) and labels_list:
                text_parts.append(f"Labels: {', '.join(labels_list)}.")
        except (ValueError, SyntaxError):
            # If it's just a plain string, use it as is
            text_parts.append(f"Labels: {labels_str}.")

    if topics_str:
        try:
            topics_list = ast.literal_eval(topics_str)
            if isinstance(topics_list, list) and topics_list:
                text_parts.append(f"Topics: {', '.join(topics_list)}.")
        except (ValueError, SyntaxError):
            text_parts.append(f"Topics: {topics_str}.")

    return " ".join(text_parts)


In [15]:
new_text_String = df.apply(create_text_string,1)


print(new_text_String)

0       Tool: Upload File. Description: from your comp...
1       Tool: UCSC Main. Description: table browser. C...
2       Tool: UCSC Archaea. Description: table browser...
3       Tool: SRA. Description: server. Category: Get ...
4       Tool: EBI SRA. Description: ENA SRA. Category:...
                              ...                        
7281    Tool: Set External Metadata. Description:  . C...
7282    Tool: Export History. Description:  . Category...
7283    Tool: Export History to URI. Description:  . C...
7284    Tool: Import History. Description:  . Category...
7285       Tool: Data Fetch. Description:  . Category:  .
Length: 7286, dtype: object


In [None]:
data = {
    'id' : df[ID_COLUMN],
    'text_string': new_text_String
}



cleaned_df = pd.DataFrame(data)
cleaned_df.head


cleaned_df.to_csv('cleaned_data.csv',index=False)

<bound method NDFrame.head of                               id  \
0                        upload1   
1             ucsc_table_direct1   
2     ucsc_table_direct_archaea1   
3                     sra_source   
4                   ebi_sra_main   
...                          ...   
7281            __SET_METADATA__   
7282          __EXPORT_HISTORY__   
7283   __EXPORT_HISTORY_TO_URI__   
7284          __IMPORT_HISTORY__   
7285              __DATA_FETCH__   

                                            text_string  
0     Tool: Upload File. Description: from your comp...  
1     Tool: UCSC Main. Description: table browser. C...  
2     Tool: UCSC Archaea. Description: table browser...  
3     Tool: SRA. Description: server. Category: Get ...  
4     Tool: EBI SRA. Description: ENA SRA. Category:...  
...                                                 ...  
7281  Tool: Set External Metadata. Description:  . C...  
7282  Tool: Export History. Description:  . Category...  
7283  Tool: Exp