In [None]:
 #... (NLTK downloads and UMLS API details)
# Initialize lemmatizer and stop words
#Import necessary Libraries
import nltk
import requests
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict
from tqdm import tqdm  # Import tqdm for progress bar

import pandas as pd
import re
import csv
import os
from google.colab import drive
drive.mount('/content/drive')

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define UMLS API details
# The key is read from the UMLS_API_KEY environment variable so it never has to be
# typed into (and saved with) the notebook; it falls back to an empty string, which
# is what the notebook used before a key was entered.
UMLS_API_KEY = os.environ.get("UMLS_API_KEY", "")
UMLS_API_URL = "https://uts-ws.nlm.nih.gov/rest"

***-----------------------------------------------------***

**Extracting semantic features for the filtered dataset**

**--------------------------------------**
New start point: df with cuis, concept_names, uris:
**Add semantic types, relations, semantic group columns/features**

In [None]:
import pandas as pd
import ast
import torch
import requests
import csv
import os

# Load the dataset from the relative ./data directory into `df`
df = pd.read_csv("./data/data.csv")
# Preview the first 20 rows
df.head(20)

Mounted at /content/drive
/content/drive/My Drive


Unnamed: 0,TEXT,ABBREV,ABBREV_CUI,abbrev_cui_semantic_types,abbrev_cui_relations,LABEL_CUI,LABEL_ENCODING,GOLD_CUI_semantic_types
0,Cost-effectiveness of pentostatin compared wit...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
1,Which role for rituximab in hairy cell leukemi...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
2,Risk of additional cancers in untreated and tr...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
3,Importance of minimal residual disease in hair...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
4,Combination therapies to improve the long-term...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
5,The biology of hairy cell leukemia.As in all m...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
6,Hairy cell leukemia responsive to anti-thymocy...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
7,T-box-expressed-in-T-cells (T-bet) expression ...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
8,Characterisation of hairy cell leukaemia by ti...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process
9,Phase II trial of recombinant immunotoxin RFB4...,hcl,"['C0023443', 'C0020259']","[['Neoplastic Process'], ['Indicator, Reagent,...","[[('mapped_to', 'Hairy cell leukemia not havin...",C0023443,57,Neoplastic Process


In [None]:

# Persistent CUI -> (semantic types, relations) cache, so reruns can skip UMLS API calls
CACHE_FILE = "semantic_cash.csv"

# Rebuild the in-memory cache from the CSV left behind by a previous run, if there is one
cuis_cache = {}
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'r', encoding='utf-8') as csvfile:
        cached_rows = csv.reader(csvfile)
        next(cached_rows, None)  # the first row is the header
        for row in cached_rows:
            if len(row) != 3:
                continue  # incomplete row: skip it instead of failing on unpacking
            cui = row[0]
            # Columns 2 and 3 hold Python literals written with str(); parse them back
            cuis_cache[cui] = tuple(ast.literal_eval(field) for field in row[1:])


def save_cache_to_csv(cache, filename):
    """Write the CUI cache to ``filename`` as CSV, replacing any existing file.

    Args:
        cache: Mapping of CUI -> (semantic_types, relations).
        filename: Path of the CSV file to (over)write.

    The file gets a header row followed by one row per CUI; both cached values
    are stored as their ``str()`` representation.
    """
    header = ["cui", "semantic_types", "relations"]
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(
            [key, str(sty), str(rel)] for key, (sty, rel) in cache.items()
        )



def extract_relation_info(relation_data):
    """
    Extracts (additionalRelationLabel, relatedIdName) pairs from UMLS relation data.

    Only relations whose label is one of the whitelisted labels below are kept;
    everything else is dropped.

    Args:
      relation_data: A list of UMLS relation dictionaries (None is treated as empty).

    Returns:
        A list of 2-tuples, each containing (additionalRelationLabel, relatedIdName)
    """
    kept_labels = ("STY", "isa", "possibly_equivalent_to", "component_of",
                   "inverse_isa", "replaces", "replaced_by")

    extracted_relations = []
    for relation in relation_data or []:
        # Entries that are not dictionaries cannot be queried with .get(); report and skip
        if not isinstance(relation, dict):
            print(f"Warning: Error extracting relations details. Error: unexpected entry {relation!r}")
            continue

        relation_label = relation.get("additionalRelationLabel")
        if relation_label in kept_labels:
            extracted_relations.append((relation_label, relation.get("relatedIdName")))
    return extracted_relations


def get_hierarchical_semantic_types(cui):
    """
    Fetches the semantic types and the filtered relations of a CUI from the UMLS REST API.

    Two requests are made: one for the concept itself (semantic types) and, only when
    the concept has semantic types, one for its relations.

    Args:
        cui: UMLS concept unique identifier.

    Returns:
        A tuple (semantic_type_names, relations) of two lists when the concept request
        succeeds; both lists are empty when the concept has no semantic types, and
        relations is empty when the relations request fails.
        An empty list when the concept request itself fails, so callers can use a
        truthiness check to tell "lookup failed" apart from a result.
    """
    request_params = {
        "apiKey": UMLS_API_KEY,
        "sabs": "SNOMEDCT_US",  # or other specific source
    }
    timeout_seconds = 30  # without a timeout a stalled connection blocks the notebook forever

    try:
        response = requests.get(
            f"{UMLS_API_URL}/content/2024AB/CUI/{cui}/",
            params=request_params,
            timeout=timeout_seconds,
        )
        response.raise_for_status()
        data = response.json()

        result = data.get('result') if isinstance(data, dict) else None
        semantic_types = result.get('semanticTypes') if isinstance(result, dict) else None
        if not semantic_types:
            # No semantic types: return empty lists (the previous None values crashed in list())
            return [], []

        semantic_types_names = [sty["name"] for sty in semantic_types]
        semantic_relations = []

        try:
            response2 = requests.get(
                f"{UMLS_API_URL}/content/2024AB/CUI/{cui}/relations",
                params=request_params,
                timeout=timeout_seconds,
            )
            response2.raise_for_status()
            data2 = response2.json()
            if isinstance(data2, dict) and isinstance(data2.get('result'), list):
                semantic_relations = extract_relation_info(data2['result'])  # keep only the wanted relation labels
            else:
                print(f"Warning: No relations returned for '{cui}'")
        except requests.exceptions.RequestException as e:
            # A failed relations lookup is not fatal: the semantic types are still returned
            print(f"Error searching UMLS for CUI relations '{cui}': {e}")

        return semantic_types_names, list(semantic_relations)

    except requests.exceptions.RequestException as e:
        print(f"Error searching UMLS for CUI Semantic_types'{cui}': {e}")
        return []




def add_semantic_types_column(df):
    """
    Adds 'abbrev_cui_semantic_types' and 'abbrev_cui_relations' columns to the DataFrame,
    looked up from the CUIs listed in each row's 'ABBREV_CUI' cell.

    'ABBREV_CUI' is expected to hold the string form of a list of CUIs; missing cells
    are treated as an empty list. Lookups go through the module-level `cuis_cache`
    first and fall back to the UMLS API; the cache is written to CACHE_FILE afterwards.

    The DataFrame is modified in place and also returned.
    """

    def get_concept_st(row):
        abbrev = row['ABBREV_CUI']

        abbrev_cui_semantic_types = []
        abbrev_cui_relations = []

        # NOTE(review): each CUI overwrites the previous one, so only the last valid CUI
        # of the row ends up in the output columns - confirm whether per-CUI lists
        # (one entry per CUI) were intended instead.
        for cui in ast.literal_eval(abbrev) if pd.notna(abbrev) else []:
            if not (cui and isinstance(cui, str)):
                continue

            if cui in cuis_cache:
                abbrev_cui_semantic_types, abbrev_cui_relations = cuis_cache[cui]
                continue

            result = get_hierarchical_semantic_types(cui)
            if result:
                # Unpack into the variables that are cached and returned (the previous
                # code unpacked into differently named variables, caching stale values)
                abbrev_cui_semantic_types, abbrev_cui_relations = result
                cuis_cache[cui] = (abbrev_cui_semantic_types, abbrev_cui_relations)
            else:
                # Failed lookup: nothing is cached so the CUI can be retried later
                abbrev_cui_semantic_types, abbrev_cui_relations = [], []

        return abbrev_cui_semantic_types, abbrev_cui_relations

    df[['abbrev_cui_semantic_types', 'abbrev_cui_relations']] = df.apply(get_concept_st, axis=1, result_type='expand')
    save_cache_to_csv(cuis_cache, CACHE_FILE)
    print("Added 'ABBREV_SEMANTIC_TYPES'and 'ABBREV_CUI_RELATIONS'columns to DataFrame successfully.")
    return df


# Add the abbreviation semantic-type / relation columns.
# add_semantic_types_column modifies `df` in place and returns it, so `Sty_df` and
# `df` refer to the same DataFrame.
Sty_df = add_semantic_types_column(df)

# Save the updated dataframe to a new csv file
NEW_CSV_FILE_PATH = "100MeDaL_with_semantic_types_and_Relations.csv"
Sty_df.to_csv(NEW_CSV_FILE_PATH, index=False, encoding='utf-8')
print(f"Saved updated CSV to: {NEW_CSV_FILE_PATH}")
# Preview the first 100 rows
Sty_df.head(100)

**Adding the semantic types and semantic relations of the abbreviation CUIs and of the gold label CUI**

In [None]:

# On-disk cache of UMLS lookups, keyed by CUI, so repeated runs avoid API calls
CACHE_FILE = "semantic_types_&relations_cach.csv"

# Start from an empty cache, then restore whatever a previous run saved
cuis_cache = {}
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'r', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader, None)  # discard the header row
        # Only rows with all three fields can be restored
        complete_rows = (row for row in csv_reader if len(row) == 3)
        for cui, semantic_types_str, relations_str in complete_rows:
            parsed_types = ast.literal_eval(semantic_types_str)
            parsed_relations = ast.literal_eval(relations_str)
            cuis_cache[cui] = (parsed_types, parsed_relations)


def save_cache_to_csv(cache, filename):
    """Persist the CUI cache as a CSV file, overwriting ``filename``.

    Args:
        cache: Mapping of CUI -> (semantic_types, relations).
        filename: Destination path.

    A header row is written first; each following row holds a CUI and the
    ``str()`` form of its semantic types and relations.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as target:
        emit = csv.writer(target).writerow
        emit(["cui", "semantic_types", "relations"])
        for cui, cached_pair in cache.items():
            semantic_types, relations = cached_pair
            emit([cui, str(semantic_types), str(relations)])


def extract_relation_info(relation_data):
    """
    Extracts (additionalRelationLabel, relatedIdName) pairs from UMLS relation data.

    A relation is kept only when both its label and its related concept name are
    present and non-empty.

    Args:
      relation_data: A list of UMLS relation dictionaries (None is treated as empty).

    Returns:
        A list of 2-tuples, each containing (additionalRelationLabel, relatedIdName)
    """
    extracted_relations = []
    for relation in relation_data or []:
        # Entries that are not dictionaries cannot be queried with .get(); report and skip
        if not isinstance(relation, dict):
            print(f"Warning: Error extracting relations details. Error: unexpected entry {relation!r}")
            continue

        relation_label = relation.get("additionalRelationLabel")
        related_id_name = relation.get("relatedIdName")
        if relation_label and related_id_name:  # only store complete pairs
            extracted_relations.append((relation_label, related_id_name))
    return extracted_relations


def get_semantic_type(cui):
    """
    Returns the semantic type names and relations of a CUI, consulting `cuis_cache` first.

    On a cache miss the UMLS REST API is queried: one request for the concept
    (semantic types) and, only when the concept has semantic types, one for its relations.

    Args:
        cui: UMLS concept unique identifier.

    Returns:
        Always a 2-tuple (semantic_type_names, relations) of lists; both are empty when
        the concept request fails or the concept has no semantic types.

    Side effects:
        Stores the result in `cuis_cache`, except when the concept request itself fails:
        a failed lookup is not cached, so it is retried on the next call instead of being
        saved as an empty result.
    """
    if cui in cuis_cache:
        semantic_types_names, semantic_relations = cuis_cache[cui]
        return semantic_types_names, list(semantic_relations)

    request_params = {
        "apiKey": UMLS_API_KEY,
        "sabs": "SNOMEDCT_US",  # or other specific source
    }
    timeout_seconds = 30  # without a timeout a stalled connection blocks the notebook forever

    try:
        response = requests.get(
            f"{UMLS_API_URL}/content/2024AB/CUI/{cui}/",
            params=request_params,
            timeout=timeout_seconds,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error searching UMLS for CUI Semantic_types'{cui}': {e}")
        return [], []  # deliberately not cached

    semantic_types_names = []
    semantic_relations = []

    result = data.get('result') if isinstance(data, dict) else None
    semantic_types = result.get('semanticTypes') if isinstance(result, dict) else None
    if semantic_types:
        semantic_types_names = [sty["name"] for sty in semantic_types]  # uses all semantic types
        try:
            response2 = requests.get(
                f"{UMLS_API_URL}/content/2024AB/CUI/{cui}/relations",
                params=request_params,
                timeout=timeout_seconds,
            )
            response2.raise_for_status()
            data2 = response2.json()
            if isinstance(data2, dict) and isinstance(data2.get('result'), list):
                semantic_relations = extract_relation_info(data2['result'])  # keep (label, name) pairs
            else:
                print(f"Warning: No relations returned for '{cui}'")
        except requests.exceptions.RequestException as e:
            # Report instead of silently swallowing; the semantic types are still usable
            print(f"Error searching UMLS for CUI relations '{cui}': {e}")

    cuis_cache[cui] = semantic_types_names, list(semantic_relations)
    return semantic_types_names, list(semantic_relations)  # Always return a tuple

def add_semantic_types_column(df):
    """
    Fills the 'abbrev_cui_semantic_types' and 'abbrev_cui_relations' columns from the
    CUIs listed in each row's 'ABBREV_CUI' cell.

    Every CUI is looked up in `cuis_cache` or, on a miss, through get_semantic_type().
    Each valid CUI replaces the values of the previous one, so a row ends up with the
    values of its last valid CUI. The cache is saved to CACHE_FILE afterwards and the
    DataFrame is modified in place and returned.
    """

    def get_concept_st(row):
        raw_cuis = row['ABBREV_CUI']
        # Missing cells are treated as "no CUIs"; otherwise the cell is a list literal
        cui_list = ast.literal_eval(raw_cuis) if pd.notna(raw_cuis) else []

        types_out, relations_out = [], []
        for cui in cui_list:
            if not cui or not isinstance(cui, str):
                continue
            if cui in cuis_cache:
                types_out, relations_out = cuis_cache[cui]
                continue
            result = get_semantic_type(cui)
            types_out, relations_out = result if result else ([], [])

        return types_out, relations_out

    new_columns = ['abbrev_cui_semantic_types', 'abbrev_cui_relations']
    df[new_columns] = df.apply(get_concept_st, axis=1, result_type='expand')

    save_cache_to_csv(cuis_cache, CACHE_FILE)
    print("Added 'ABBREV_SEMANTIC_TYPES'and 'ABBREV_CUI_RELATIONS'columns to DataFrame successfully.")
    return df


def add_gold_standard_semantic_info(df):
    """
    Adds a 'GOLD_CUI_relations' column holding the relations of each row's 'LABEL_CUI'.

    The CUI is looked up in `cuis_cache` or, on a miss, through get_semantic_type();
    rows whose 'LABEL_CUI' is empty or not a string get an empty list. Only the
    relations are written to the DataFrame (the semantic types are fetched and cached
    but not stored in a column). The cache is saved to CACHE_FILE afterwards and the
    DataFrame is modified in place and returned.
    """
    def get_gold_concept_info(row):
        cui = row['LABEL_CUI']
        gold_cui_relations = []

        if cui and isinstance(cui, str):
            if cui in cuis_cache:
                _, gold_cui_relations = cuis_cache[cui]
            else:
                result = get_semantic_type(cui)
                if result:
                    gold_cui_semantic_types, gold_cui_relations = result
                    cuis_cache[cui] = (gold_cui_semantic_types, gold_cui_relations)
                else:
                    gold_cui_relations = []

        # A one-entry Series keeps the list in a single cell when apply() expands the result
        return pd.Series({'GOLD_CUI_relations': gold_cui_relations})

    df['GOLD_CUI_relations'] = df.apply(get_gold_concept_info, axis=1, result_type='expand')

    save_cache_to_csv(cuis_cache, CACHE_FILE)  # persist the lookups made above
    print("Added 'GOLD_CUI_relations' column successfully.")
    return df



#Sty_df.head(100)

# The abbreviation columns are not recomputed in this cell (call left commented out);
# only the gold-label relations are added. add_gold_standard_semantic_info modifies
# `df` in place and returns it.
#All_Sty_df = add_semantic_types_column(df)
All_Sty_Gold_df = add_gold_standard_semantic_info(df)

# Save the updated dataframe to a new csv file
NEW_CSV_FILE_PATH = "MSH_All_dataset_with_GoldRelations.csv"
All_Sty_Gold_df.to_csv(NEW_CSV_FILE_PATH, index=False, encoding='utf-8')
print(f"Saved updated CSV to: {NEW_CSV_FILE_PATH}")

#save cache to file
#save_cache_to_csv(cuis_cache, CACHE_FILE)
# The triple-quoted block below is a bare string literal, i.e. disabled code that
# would write a second copy of the DataFrame under another file name.
'''
# Save the updated dataframe to a new csv file
NEW_CSV_FILE_PATH2 = "100MeDaL_with_all_semantic_types_and_Relations__for_abbrev_label.csv"
All_Sty_Gold_df.to_csv(NEW_CSV_FILE_PATH2, index=False, encoding='utf-8')
print(f"Saved updated CSV to: {NEW_CSV_FILE_PATH2}")
'''
# Preview the first 100 rows
All_Sty_Gold_df.head(100)

**Filter sparse abbreviations**

In [None]:
# Reload the dataset and define the thresholds used to filter it.
# NOTE: this rebinds `df`, replacing the DataFrame used by the cells above.
df = pd.read_csv("./data/data.csv")
print(df.shape)
# Thresholds passed to filter_dataframe() below:
min_abbrev_example_count = 20       # passed as min_abbrev_example_count
min_semantic_type_count = 20        # passed as min_semantic_type_count
min_label_semantic_type_count = 50  # passed as min_gold_semantic_type_count

def _primary_semantic_type(value):
    """Returns the grouping key of a 'GOLD_CUI_semantic_types' cell.

    A list/tuple contributes its first element (None when empty), a plain string is
    used as is, and anything else (e.g. NaN) yields None.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if len(value) > 0 else None
    if isinstance(value, str):
        return value
    return None


def filter_dataframe(df, min_abbrev_example_count, min_semantic_type_count, min_gold_semantic_type_count):
    """Filters out sparse abbreviations, gold semantic types and (abbreviation, semantic type) pairs.

    Three filters are applied in order:
      1. keep abbreviations with at least `min_abbrev_example_count` rows;
      2. keep gold semantic types with at least `min_gold_semantic_type_count` rows;
      3. drop every (ABBREV, gold semantic type) pair whose row count is at or below
         `min_semantic_type_count`.

    The semantic type of a row is its 'GOLD_CUI_semantic_types' cell when that is a
    string, or the first element when it is a list. (Previously step 3 only recognised
    list cells, so for string-valued columns it never removed anything.)

    Args:
        df: DataFrame with 'ABBREV' and 'GOLD_CUI_semantic_types' columns; not modified.
        min_abbrev_example_count: threshold for step 1.
        min_semantic_type_count: threshold for step 3.
        min_gold_semantic_type_count: threshold for step 2.

    Returns:
        A new DataFrame containing only the rows that pass all three filters.
        The sparse pairs found in step 3 and their number are printed.
    """
    # 1) Filter by abbreviation count
    abbrev_counts = df['ABBREV'].value_counts()
    kept_abbrevs = abbrev_counts[abbrev_counts >= min_abbrev_example_count].index
    filtered_df = df[df['ABBREV'].isin(kept_abbrevs)]

    # 2) Filter by gold semantic type count (rows without a usable type are dropped)
    primary_types = filtered_df['GOLD_CUI_semantic_types'].apply(_primary_semantic_type)
    type_counts = primary_types.value_counts()
    kept_types = type_counts[type_counts >= min_gold_semantic_type_count].index
    has_kept_type = primary_types.isin(kept_types)
    filtered_df = filtered_df[has_kept_type]
    primary_types = primary_types[has_kept_type]

    # 3) Count rows per (abbreviation, semantic type) and collect the sparse pairs
    grouped_data = filtered_df.groupby(['ABBREV', primary_types]).size().reset_index(name='count')
    filtered_abbrevs_semantic = grouped_data[grouped_data['count'] <= min_semantic_type_count]
    print(filtered_abbrevs_semantic)

    sparse_pairs = set(zip(filtered_abbrevs_semantic['ABBREV'],
                           filtered_abbrevs_semantic['GOLD_CUI_semantic_types']))

    # Build one boolean mask instead of dropping rows in place while iterating
    is_sparse = pd.Series(
        [pair in sparse_pairs for pair in zip(filtered_df['ABBREV'], primary_types)],
        index=filtered_df.index,
        dtype=bool,
    )
    filtered_df = filtered_df[~is_sparse]

    print("Filtered Semantic Abbrevs:")
    print(len(filtered_abbrevs_semantic))
    return filtered_df

# Filter DataFrame
FILTERED_CSV_FILE_PATH = "MSH_All_dataset_with_GoldRelations_filtered.csv"
filtered_df = filter_dataframe(df, min_abbrev_example_count, min_semantic_type_count, min_label_semantic_type_count)
filtered_df.to_csv(FILTERED_CSV_FILE_PATH, index=False)
# Report the file that was actually written (the message used to name a different file)
print(f"Filtered DataFrame saved to '{FILTERED_CSV_FILE_PATH}'")
print("Shape:",filtered_df.shape)
#filtered_df.head(100)

In [None]:
# Count the rows of the filtered frame per (ABBREV, first gold semantic type).
# The lambda yields the first element of a non-empty list cell and None otherwise.
# NOTE(review): if 'GOLD_CUI_semantic_types' holds plain strings rather than lists, the
# key is None for every row - confirm that this table is not empty for the loaded data.
filtered_df.groupby(['ABBREV', filtered_df['GOLD_CUI_semantic_types'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)]).size().reset_index(name='count')

**Dropping unnecessary columns**

In [None]:
# Drop unneeded columns
def drop_columns(df, columns_to_drop=('abbrev_uri', 'label_uri')):
    """Returns a copy of `df` without the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns_to_drop: Column name or iterable of column names to remove.
            Defaults to the URI columns ('abbrev_uri', 'label_uri').

    Returns:
        A new DataFrame. Names that are not present are ignored, so the call is
        safe to repeat.
    """
    if isinstance(columns_to_drop, str):
        columns_to_drop = [columns_to_drop]
    # errors='ignore' prevents errors if a column is already dropped
    return df.drop(columns=list(columns_to_drop), errors='ignore')

# Remove the columns listed in drop_columns() from the filtered frame
filtered_df = drop_columns(filtered_df)


# NOTE(review): the two messages below mention semantic groups and context features,
# but this cell only drops columns and writes the CSV - confirm the intended wording.
print("Filtered DataFrame with semantic groups added to relations, and unnecessary columns dropped:")
filtered_df.to_csv("100MeDaL_with_all_semantic_features_filtered_columns_last.csv", index=False)
print("DataFrame with context features added")
print(filtered_df.head())

In [None]:
# Drop unneeded columns
def drop_columns(df, columns_to_drop=('GOLD_CUI_relations', 'context_relations')):
    """Returns a copy of `df` without the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns_to_drop: Column name or iterable of column names to remove.
            Defaults to the relation columns ('GOLD_CUI_relations', 'context_relations').

    Returns:
        A new DataFrame. Names that are not present are ignored, so the call is
        safe to repeat.
    """
    if isinstance(columns_to_drop, str):
        columns_to_drop = [columns_to_drop]
    # errors='ignore' prevents errors if a column is already dropped
    return df.drop(columns=list(columns_to_drop), errors='ignore')

# Remove the columns listed in the drop_columns() defined in this cell
filtered_df = drop_columns(filtered_df)


# NOTE(review): this writes to the same file name as the previous cell, so that file
# is overwritten; the messages also mention semantic groups and context features,
# which this cell does not add - confirm both are intended.
print("Filtered DataFrame with semantic groups added to relations, and unnecessary columns dropped:")
filtered_df.to_csv("100MeDaL_with_all_semantic_features_filtered_columns_last.csv", index=False)
print("DataFrame with context features added")
print(filtered_df.head())

**Adding semantic group info**

In [None]:
def load_semantic_groups(file_path="SemGroups_2018.txt"):
    """Reads a pipe-delimited semantic-group file into a lookup table.

    Only lines with exactly four '|'-separated fields are used: the second field
    (the group) is appended to the list stored under the fourth field (the
    semantic type). All other lines are ignored.

    Returns:
        A defaultdict(list) mapping semantic type -> list of groups.
    """
    groups_by_type = defaultdict(list)
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("|")
            if len(fields) != 4:
                continue  # blank or malformed line
            group, stype = fields[1], fields[3]
            groups_by_type[stype].append(group)
    return groups_by_type

# Load the semantic type -> groups table from the default file (SemGroups_2018.txt)
semantic_groups = load_semantic_groups()
print("Semantic Groups Loaded!")
# Print the full mapping for inspection
print(semantic_groups)

In [None]:
# List the columns of `df` to check which ones are available to the next cell
df.columns

In [None]:
import ast


def _cell_to_list(value, row_index, column, allow_plain_name=False):
    """Turns one DataFrame cell into a list.

    Lists/tuples are passed through, strings are parsed with ast.literal_eval, and
    anything else (NaN, None) becomes an empty list. A string that is not a Python
    literal is kept as a single item when `allow_plain_name` is set (a bare semantic
    type name); otherwise a warning is printed and an empty list is returned.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str) or not value.strip() or value == 'nan':
        return []
    try:
        parsed = ast.literal_eval(value)
    except (SyntaxError, ValueError):
        if allow_plain_name:
            return [value]
        print(f"Warning: Could not parse {column} for row {row_index}: {value}")
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed] if isinstance(parsed, str) else []


def _semantic_group_pairs(semantic_types, semantic_groups):
    """Returns ('semantic_group', group) for every group of every known type name in a possibly nested list."""
    pairs = []
    for item in semantic_types:
        if isinstance(item, str):
            if item in semantic_groups:
                pairs.extend(('semantic_group', group) for group in semantic_groups[item])
        elif isinstance(item, (list, tuple)):
            # Per-CUI sub-list of semantic types
            pairs.extend(_semantic_group_pairs(item, semantic_groups))
    return pairs


def _relation_pairs(relations, row_index):
    """Normalizes a possibly nested list of relations into a flat list of (relation, related_concept) tuples."""
    pairs = []
    for item in relations:
        if not isinstance(item, (list, tuple)):
            print(f"Warning: Skipping invalid relation_list in row {row_index}: {item}")
            continue
        if not item:
            continue
        if isinstance(item[0], (list, tuple)):
            # Per-CUI sub-list of relations
            pairs.extend(_relation_pairs(item, row_index))
        elif len(item) == 1:
            # Only the related concept is known: use a default relation label
            pairs.append(("related_to", item[0]))
        else:
            pairs.append((item[0], item[1]))
    return pairs


def add_semantic_group_to_relations(df, semantic_groups):
    """Adds semantic group information to the relation columns.

    For every row, three lists of (relation, value) tuples are built, each starting with
    one ('semantic_group', group) tuple per group of the row's semantic types, followed
    by the row's existing relations:

      * 'abbrev_relations'  from 'abbrev_cui_semantic_types' + 'abbrev_cui_relations'
      * 'context_relations' from 'context_semantic_types'    + 'context_relations'
      * 'gold_relations'    from 'GOLD_CUI_semantic_types'    + 'GOLD_CUI_relations'

    Cells may hold real lists or their string form, may be nested one list per CUI,
    and may be missing (NaN or absent column), in which case they count as empty.
    A semantic-type cell may also be a bare type name.

    Args:
        df: DataFrame to augment; it is modified in place.
        semantic_groups: Mapping of semantic type name -> list of groups.

    Returns:
        The same DataFrame with the three columns set ('context_relations' is overwritten).
    """
    # (semantic type column, relation column, label used in parse warnings, output column)
    sources = [
        ('abbrev_cui_semantic_types', 'abbrev_cui_relations', 'abbrev_relations', 'abbrev_relations'),
        ('context_semantic_types', 'context_relations', 'context_relations', 'context_relations'),
        ('GOLD_CUI_semantic_types', 'GOLD_CUI_relations', 'gold_relations', 'gold_relations'),
    ]
    augmented = {output_column: [] for _, _, _, output_column in sources}

    for index, row in df.iterrows():
        for types_column, relations_column, label, output_column in sources:
            semantic_types = _cell_to_list(row.get(types_column), index, types_column, allow_plain_name=True)
            relations = _cell_to_list(row.get(relations_column), index, label)

            row_relations = _semantic_group_pairs(semantic_types, semantic_groups)
            row_relations.extend(_relation_pairs(relations, index))
            augmented[output_column].append(row_relations)

    df["abbrev_relations"] = augmented["abbrev_relations"]
    df["context_relations"] = augmented["context_relations"]
    df["gold_relations"] = augmented["gold_relations"]
    return df

# Add semantic groups to relations
# NOTE(review): the function is called on `df`, not on the `filtered_df` produced by the
# filtering cells, and its result is then stored under the name `filtered_df` - confirm
# which DataFrame is meant to be augmented here.
filtered_df = add_semantic_group_to_relations(df, semantic_groups)
print("Filtered DataFrame with context features and semantic group relations added:")
# Save dataframe with labels
filtered_df.to_csv("MSH_All_dataset_with_GoldRelations_withcontext_SGroup.csv", index=False)
# Preview the first 20 rows
filtered_df.head(20)


In [None]:

import ast
def _sg_parse_cell(value, index, label, text_fallback=False):
    """Parse one CSV cell that should hold the repr of a Python list.

    Args:
        value: Raw cell value (a string when the cell is populated).
        index: Row index, used only in the warning message.
        label: Name shown in the warning message.
        text_fallback: When true, text that is not a Python literal is kept as
            a one-element list (e.g. a bare type name 'Neoplastic Process')
            instead of being reported and dropped.

    Returns:
        A list. Non-string cells (such as NaN) and the text 'nan' give [].
    """
    if not isinstance(value, str) or value == 'nan':
        return []
    try:
        parsed = ast.literal_eval(value)
    except (SyntaxError, ValueError):
        if text_fallback:
            return [value]
        print(f"Warning: Could not parse {label} for row {index}: {value}")
        return []
    return parsed if isinstance(parsed, list) else [parsed]


def _sg_group_pairs(semantic_types, semantic_groups):
    """Return ('semantic_group', group) for every known type in a possibly nested list."""
    pairs = []
    for item in semantic_types:
        if isinstance(item, (list, tuple)):
            pairs.extend(_sg_group_pairs(item, semantic_groups))
        elif item and isinstance(item, str) and item in semantic_groups:
            pairs.extend(('semantic_group', group) for group in semantic_groups[item])
    return pairs


def _sg_relation_pairs(entries, index):
    """Yield (relation, related_concept) pairs from a parsed relations cell.

    Each entry may be a (relation, concept, ...) sequence, a one-element
    sequence holding only the concept (relation defaults to "related_to"),
    or a nested list of such entries (one list per candidate CUI).
    """
    for entry in entries:
        if not isinstance(entry, (list, tuple)):
            print(f"Warning: Skipping invalid relation_list in row {index}: {entry}")
            continue
        if len(entry) >= 2 and isinstance(entry[0], str):
            # A real pair starts with the relation name; extra items are ignored.
            yield entry[0], entry[1]
        elif len(entry) == 1 and not isinstance(entry[0], (list, tuple)):
            yield "related_to", entry[0]
        else:
            # A list of pairs (or an empty list): descend one level.
            yield from _sg_relation_pairs(entry, index)


def add_semantic_group_to_relations(df, semantic_groups):
    """Adds semantic group information to the relation columns.

    For every row, three lists of (relation, value) tuples are built, each
    starting with ('semantic_group', group) pairs for the row's semantic types
    followed by the row's own (relation, related_concept) pairs:

    * "abbrev_relations"  from 'abbrev_cui_semantic_types' / 'abbrev_cui_relations'
    * "context_relations" from 'context_semantic_types' / 'context_relations'
    * "gold_relations"    from 'GOLD_CUI_semantic_types' / 'GOLD_CUI_relations'

    Args:
        df: DataFrame whose list-valued columns are stored as strings (as read
            back from CSV). Missing columns and missing cells are treated as
            empty.
        semantic_groups: Mapping from semantic type name to an iterable of
            semantic group names.

    Returns:
        The same DataFrame object, modified in place. Note that the input
        "context_relations" column is overwritten by the augmented list.
    """
    augmented_relations_all_abbrev = []
    augmented_relations_all_context = []
    augmented_relations_all_gold = []

    for index, row in df.iterrows():
        abbrev_types = _sg_parse_cell(row.get('abbrev_cui_semantic_types'), index, 'abbrev_cui_semantic_types')
        context_types = _sg_parse_cell(row.get('context_semantic_types'), index, 'context_semantic_types')
        # Gold types may be stored as plain text rather than a list repr, so
        # unparseable text is kept whole instead of being iterated by character.
        gold_types = _sg_parse_cell(row.get('GOLD_CUI_semantic_types'), index, 'GOLD_CUI_semantic_types', text_fallback=True)

        abbrev_relations = _sg_parse_cell(row.get('abbrev_cui_relations'), index, 'abbrev_cui_relations')
        context_relations = _sg_parse_cell(row.get('context_relations'), index, 'context_relations')
        gold_relations = _sg_parse_cell(row.get('GOLD_CUI_relations'), index, 'gold_relations')

        # Abbreviation candidates
        augmented_relations_abbrev = _sg_group_pairs(abbrev_types, semantic_groups)
        for relation, related_concept in _sg_relation_pairs(abbrev_relations, index):
            augmented_relations_abbrev.append((relation, related_concept))
            # A list-valued concept is treated as a list of semantic types.
            if isinstance(related_concept, list):
                augmented_relations_abbrev.extend(_sg_group_pairs(related_concept, semantic_groups))

        # Context concepts
        augmented_relations_context = _sg_group_pairs(context_types, semantic_groups)
        augmented_relations_context.extend(_sg_relation_pairs(context_relations, index))

        # Gold concept
        augmented_relations_gold = _sg_group_pairs(gold_types, semantic_groups)
        augmented_relations_gold.extend(_sg_relation_pairs(gold_relations, index))

        augmented_relations_all_abbrev.append(augmented_relations_abbrev)
        augmented_relations_all_context.append(augmented_relations_context)
        augmented_relations_all_gold.append(augmented_relations_gold)

    df["abbrev_relations"] = augmented_relations_all_abbrev
    df["context_relations"] = augmented_relations_all_context
    df["gold_relations"] = augmented_relations_all_gold
    return df

# Add ('semantic_group', group) pairs to the relation columns of `df`.
# NOTE(review): `semantic_groups` is not defined in this cell - presumably built
# earlier in the notebook; confirm it exists on a fresh kernel.
# The function assigns the new columns on `df` itself and returns it, so
# `filtered_df` and `df` refer to the same DataFrame.
filtered_df = add_semantic_group_to_relations(df, semantic_groups)
print("Filtered DataFrame with context features and semantic group relations added:")
# Save the augmented dataframe (index omitted) and preview the first 20 rows
filtered_df.to_csv("MSH_All_dataset_with_GoldRelations_withcontext_SGroup.csv", index=False)
filtered_df.head(20)

**Adding the `GOLD_SEMANTIC_ENCODING` column (integer id of the gold CUI's semantic type)**

In [None]:
# Rebind `df` to the dataset read from CSV for the gold-encoding step.
# NOTE(review): this reads "..._withcontext.csv", not the "..._withcontext_SGroup.csv"
# file written by the previous cell - confirm that is intended.
df= pd.read_csv("MSH_All_dataset_with_GoldRelations_withcontext.csv")
def _gold_semantic_type_list(value):
    """Return the semantic type names held in one GOLD_CUI_semantic_types cell.

    The cell may be a plain type name ('Neoplastic Process'), the repr of a
    (possibly nested) list of names, an actual list, or missing (NaN/None).
    Plain text is kept whole rather than being iterated character by
    character. Non-string items are ignored.
    """
    if isinstance(value, (list, tuple)):
        candidates = value
    elif isinstance(value, str) and value.strip() and value != 'nan':
        try:
            parsed = ast.literal_eval(value)
        except (SyntaxError, ValueError):
            parsed = value  # not a Python literal: the text itself is the type name
        candidates = parsed if isinstance(parsed, (list, tuple)) else [parsed]
    else:
        return []

    names = []
    for item in candidates:
        if isinstance(item, str):
            names.append(item)
        elif isinstance(item, (list, tuple)):
            names.extend(_gold_semantic_type_list(item))
    return names


def create_semantic_type_mapping(df):
    """Build a {semantic type name: integer id} mapping from 'GOLD_CUI_semantic_types'.

    Ids are assigned in sorted name order so the mapping is identical on every
    run (set iteration order is not stable across interpreter sessions).
    """
    semantic_types = set()
    for value in df['GOLD_CUI_semantic_types']:
        semantic_types.update(_gold_semantic_type_list(value))

    return {stype: index for index, stype in enumerate(sorted(semantic_types))}


def add_gold_semantic_type_encoding(df, semantic_type_mapping):
    """Add a 'GOLD_SEMANTIC_ENCODING' column with the id of each row's first gold semantic type.

    Rows with no gold semantic type, or whose type is absent from
    `semantic_type_mapping`, are encoded as -1. The DataFrame is modified in
    place and also returned.
    """
    gold_semantic_types = []
    for value in df['GOLD_CUI_semantic_types']:
        names = _gold_semantic_type_list(value)
        # Only the first type is encoded; the remaining ones are ignored.
        gold_semantic_types.append(semantic_type_mapping.get(names[0], -1) if names else -1)

    df['GOLD_SEMANTIC_ENCODING'] = gold_semantic_types
    return df
# Build the semantic-type -> id mapping from `df`, then encode each row with it.
semantic_type_mapping = create_semantic_type_mapping(df)
# `gold_df` is the same object as `df`: the function adds the column in place and returns it.
gold_df = add_gold_semantic_type_encoding(df, semantic_type_mapping)
gold_df.to_csv("MSH_All_dataset_with_GoldRelations_withcontext_last.csv", index=False)
# NOTE(review): the message below mentions semantic group relations, but this
# cell adds the 'GOLD_SEMANTIC_ENCODING' column - looks copied from another cell.
print("Filtered DataFrame with context features and semantic group relations added:")
# Distribution of the encoded ids (-1 marks rows that could not be encoded).
gold_df['GOLD_SEMANTIC_ENCODING'].value_counts()

In [None]:
# Show the rows that received the fallback encoding -1 from add_gold_semantic_type_encoding.
gold_df[gold_df['GOLD_SEMANTIC_ENCODING']==-1]

**Adding a `concept_names` column (UMLS concept names of each abbreviation's candidate CUIs) to use as CUI-specific features in some tests**


In [None]:
# Rebind `df` to the base dataset; the frame used by the previous cells is no
# longer reachable through this name.
df = pd.read_csv("./data/data.csv")
#df.head(20)

In [None]:
# Cache UMLS results to avoid repeated API calls
umls_cache = {}  # search term -> (cui, concept_name, uri) of the first UMLS search result


def search_abbrev_umls(abbrev, timeout=30):
    """
    Searches for concepts in UMLS Metathesaurus, using a cache to avoid redundant API calls.

    Args:
        abbrev: Iterable of search terms, or a single term given as a string.
            Terms that are None, NaN or the text 'nan' are not sent to the API.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list with one concept name (or None) per term, in order. The name is
        that of the first search result. If fewer than two names were
        collected, one extra None is appended.

    Successful lookups (including "no result") are stored in `umls_cache`.
    Failed requests are reported with print() and not cached, so they are
    retried the next time the term is seen.
    """
    if abbrev is None:
        terms = []
    elif isinstance(abbrev, str):
        # A bare string would otherwise be iterated character by character.
        terms = [abbrev]
    else:
        terms = list(abbrev)

    concept_names = []
    for term in terms:
        is_missing = term is None or term == 'nan' or (isinstance(term, float) and term != term)
        if is_missing:
            concept_names.append(None)
            continue

        if term in umls_cache:
            concept_name = umls_cache[term][1]
        else:
            concept_name = None
            try:
                response = requests.get(
                    f"{UMLS_API_URL}/search/current",
                    params={
                        "string": term,
                        "apiKey": UMLS_API_KEY,
                        # "searchType": "exact",
                    },
                    timeout=timeout,
                )
                response.raise_for_status()
                # Tolerate payloads without the expected keys instead of raising KeyError.
                results = (response.json().get("result") or {}).get("results") or []

                if results:
                    top_hit = results[0]
                    cui = top_hit["ui"]
                    concept_name = top_hit["name"]
                    uri = top_hit.get("uri", None)
                else:
                    cui, concept_name, uri = None, None, None

                umls_cache[term] = (cui, concept_name, uri)  # Cache the result

            except requests.exceptions.RequestException as e:
                print(f"Error searching UMLS for abbrev '{term}': {e}")
                concept_name = None

        concept_names.append(concept_name)

    if len(concept_names) < 2:
        concept_names.append(None)
    return concept_names

In [None]:
def add_cui_columns(df):
    """
    Adds a 'concept_names' column to the DataFrame, holding for each row the
    UMLS concept names returned by `search_abbrev_umls` for the entries of the
    row's 'ABBREV_CUI' cell.

    'ABBREV_CUI' may be a list, the string repr of a list, or a single plain
    string (treated as a one-item list). Missing or empty cells (NaN, None,
    '', []) give an empty list and no API call.

    The module-level `progress_bar` is advanced once per row; it must exist
    before this function is called. The DataFrame is modified in place and
    also returned.
    """

    def get_cuis_concept_name(row):
        abbrev = row['ABBREV_CUI']

        if isinstance(abbrev, str) and abbrev:
            try:
                abbrev = ast.literal_eval(abbrev)
            except (SyntaxError, ValueError):
                abbrev = [abbrev]  # Treat as single item if eval fails
            if isinstance(abbrev, str):
                abbrev = [abbrev]  # a quoted scalar such as "'C0023443'"

        # Anything that is not a list by now (NaN, None, numbers) has no terms to look up.
        terms = list(abbrev) if isinstance(abbrev, (list, tuple)) else []
        abbrev_conceptNames = search_abbrev_umls(terms) if terms else []

        progress_bar.update(1)
        return abbrev_conceptNames

    df['concept_names'] = df.apply(get_cuis_concept_name, axis=1)
    print("Added 'concept_names' column to DataFrame successfully.")
    return df

# Add the 'concept_names' column for the first CUI_LOOKUP_MAX_ROWS rows
CUI_LOOKUP_MAX_ROWS = 28000  # cap on the number of rows sent through the UMLS lookup

# Work on an explicit copy: assigning a column on a slice of `df` triggers
# pandas' SettingWithCopyWarning.
rows_to_annotate = df[:CUI_LOOKUP_MAX_ROWS].copy()

# The bar is advanced manually inside add_cui_columns, so its total must be the
# number of rows actually processed, not len(df).
progress_bar = tqdm(total=len(rows_to_annotate))
cui_df = add_cui_columns(rows_to_annotate)
progress_bar.close()

# Save the updated dataframe to a new csv file
# NOTE(review): this is the same file name the gold-encoding cell writes to, so
# that output is overwritten here - confirm this is intended.
NEW_CSV_FILE_PATH = "MSH_All_dataset_with_GoldRelations_withcontext_last.csv"
cui_df.to_csv(NEW_CSV_FILE_PATH, index=False, encoding='utf-8')
print(f"Saved updated CSV to: {NEW_CSV_FILE_PATH}")

In [None]:
# Preview the first 20 rows, including the new 'concept_names' column.
cui_df.head(20)

In [None]:
df = pd.read_csv("./data/data.csv")

# For every row: the abbreviation's concept names followed by the first concept
# name of the gold label.
merged_concept_names = []
for index, row in df.iterrows():
    # Get the existing concept names from the 'ABBREV_CONCEPTNAME' column
    cui_concept_names = ast.literal_eval(row['ABBREV_CONCEPTNAME']) if pd.notna(row['ABBREV_CONCEPTNAME']) else []
    # 'LABEL_CONCEPTNAME' gets the same missing-value guard; without it a NaN or
    # empty list here aborts the whole loop.
    label_concept_names = ast.literal_eval(row['LABEL_CONCEPTNAME']) if pd.notna(row['LABEL_CONCEPTNAME']) else []
    if label_concept_names:
        # Append the first label concept name to the list of concept names
        cui_concept_names.append(label_concept_names[0])
    else:
        print(f"Warning: No LABEL_CONCEPTNAME for row {index}; keeping abbreviation concept names only")
    merged_concept_names.append(cui_concept_names)

df['concept_names'] = merged_concept_names


# Save to file

df.to_csv("./data/data2.csv", index=False)

-------**New  Level**--------
**Semantic embedding**

**Implement the semantic-type and semantic-relation embeddings using a Word2Vec model**

In [None]:
import pandas as pd
from gensim.models import Word2Vec
import os
from google.colab import drive
drive.mount('/content/drive')
import ast

!pip install nltk
import nltk
nltk.download('punkt_tab')

# Define parameters
# The first four values are passed to Word2Vec (vector_size, window, min_count, sg)
# through train_word2vec_embeddings.
EMBEDDING_DIM = 32  # Dimension for Word2Vec embeddings (vector_size)
WINDOW_SIZE = 3  # Window size for Word2Vec (window)
MIN_COUNT = 1 # Passed to Word2Vec as min_count
SG = 1  # Use skip-gram architecture
CSV_FILE_PATH = "./data/data.csv"  # Path to your CSV file with semantic types and relations
OUTPUT_EMBEDDINGS_PATH = "data_semantic_type_embeddings_last.txt" # Path to save semantic type embeddings (text word2vec format)
OUTPUT_RELATIONS_PATH = "data_semantic_relation_embeddings_last.txt" # Path to save semantic relation embeddings (text word2vec format)
VOCAB_SEMANTIC_TYPES_PATH = "data_semantic_type_vocab_last.txt" #Path to save vocabulary for semantic types
VOCAB_RELATIONS_PATH = "data_semantic_relation_vocab_last.txt" #Path to save vocabulary for relations
# NOTE(review): CONTEXT_WINDOW_SIZE is not referenced anywhere in this cell -
# confirm whether a later cell uses it.
CONTEXT_WINDOW_SIZE = 5

from nltk.tokenize import word_tokenize


def _w2v_parse_cell(value, index, column, text_fallback=False):
    """Parse one CSV cell holding the repr of a Python list; always returns a list.

    Non-string cells (NaN) and the text 'nan' give []. Text that is not a
    Python literal is kept as a one-element list when `text_fallback` is true
    (a bare semantic type name); otherwise it is reported and dropped.
    """
    if not isinstance(value, str) or value == 'nan':
        return []
    try:
        parsed = ast.literal_eval(value)
    except (SyntaxError, ValueError):
        if text_fallback:
            return [value]
        print(f"Warning: Could not parse {column} for row {index}: {value}")
        return []
    return parsed if isinstance(parsed, list) else [parsed]


def _w2v_flatten_types(value):
    """Return every string found in a possibly nested list/tuple of semantic type names."""
    if isinstance(value, str):
        return [value]
    flat = []
    if isinstance(value, (list, tuple)):
        for item in value:
            flat.extend(_w2v_flatten_types(item))
    return flat


def _w2v_relation_sentences(relations):
    """Group parsed relations into sentences.

    Each nested list (the relations of one CUI) is one sentence. Relations that
    sit directly at the top level are collected into one further sentence.
    List items are converted to tuples so every token is hashable.
    """
    sentences, loose = [], []
    for item in relations:
        if isinstance(item, list):
            sentence = [tuple(rel) if isinstance(rel, list) else rel for rel in item]
            if sentence:
                sentences.append(sentence)
        else:
            loose.append(item)
    if loose:
        sentences.append(loose)
    return sentences


def load_and_prepare_data(csv_file_path):
    """
    Loads the data from the CSV file, and create the training data for Word2Vec.

    Semantic types are read from 'abbrev_cui_semantic_types' and
    'GOLD_CUI_semantic_types'; relations from 'abbrev_cui_relations' and
    'GOLD_CUI_relations'. Cells are parsed from their string form, so tokens
    are whole type names and whole relation tuples (never single characters or
    stringified lists) and match the returned vocabularies. Missing cells are
    skipped.

    Args:
       csv_file_path: Path to the csv file.

    Returns:
         A tuple of four items:
         - semantic type sentences: list of lists of type names (per row, one
           sentence for the abbreviation candidates and one for the gold CUI).
         - a set with the unique semantic types.
         - semantic relation sentences: list of lists of relation tuples.
         - a set with the unique relations.
    """

    df = pd.read_csv(csv_file_path)
    semantic_type_sentences = []
    semantic_relations_sentences = []

    all_semantic_types = set()
    all_relations = set()

    for index, row in df.iterrows():
        abbrev_types = _w2v_flatten_types(
            _w2v_parse_cell(row.get('abbrev_cui_semantic_types'), index, 'abbrev_cui_semantic_types'))
        # Gold types may be stored as plain text, hence the text fallback.
        gold_types = _w2v_flatten_types(
            _w2v_parse_cell(row.get('GOLD_CUI_semantic_types'), index, 'GOLD_CUI_semantic_types', text_fallback=True))

        for sentence in (abbrev_types, gold_types):
            if sentence:
                semantic_type_sentences.append(sentence)
                all_semantic_types.update(sentence)

        for column in ('abbrev_cui_relations', 'GOLD_CUI_relations'):
            parsed_relations = _w2v_parse_cell(row.get(column), index, column)
            for sentence in _w2v_relation_sentences(parsed_relations):
                semantic_relations_sentences.append(sentence)
                all_relations.update(sentence)

    return semantic_type_sentences, all_semantic_types, semantic_relations_sentences, all_relations

def train_word2vec_embeddings(sentences, vector_size, window, min_count, sg, output_path):
    """Trains Word2Vec embeddings and saves them to a file.

    Args:
        sentences: Iterable of sentences, each a sequence of tokens. Tokens are
            converted with str(). Empty sentences are dropped. A sentence given
            as a bare string is treated as a single token rather than being
            split into characters.
        vector_size, window, min_count, sg: Passed to Word2Vec unchanged.
        output_path: Destination file, written in text word2vec format.

    Returns:
        The trained Word2Vec model.
    """
    tokenized = []
    for sentence in sentences:
        if not sentence:
            continue
        words = [sentence] if isinstance(sentence, str) else sentence
        tokenized.append([str(word) for word in words])

    model = Word2Vec(sentences=tokenized, vector_size=vector_size, window=window, min_count=min_count, sg=sg)
    model.wv.save_word2vec_format(output_path, binary=False) # Binary = false saves as text
    print(f"Embeddings saved to: {output_path}")
    return model

def create_vocabulary(data, output_path):
    """Creates and saves a vocabulary file, one item per line (UTF-8).

    Args:
        data: Iterable of vocabulary items; each is written with str().
            Sets have no defined order, so they are written sorted by their
            string form to make the file identical on every run. Ordered
            inputs (lists, tuples) keep their order.
        output_path: Destination file; overwritten if it exists.
    """
    items = sorted(data, key=str) if isinstance(data, (set, frozenset)) else data
    with open(output_path, 'w', encoding="utf-8") as f:
        for item in items:
            f.write(f"{item}\n")
    print(f"Vocabulary saved to: {output_path}")



# Load data: training sentences and vocabularies for semantic types and relations
semantic_type_sentences, all_semantic_types, semantic_relations_sentences, all_relations  = load_and_prepare_data(CSV_FILE_PATH)

# Train embeddings for Semantic Types (saved to OUTPUT_EMBEDDINGS_PATH)
print ("Training semantic type embeddings...")
semantic_types_model = train_word2vec_embeddings(semantic_type_sentences, EMBEDDING_DIM, WINDOW_SIZE, MIN_COUNT, SG, OUTPUT_EMBEDDINGS_PATH)

# Train embeddings for Semantic Relations (same hyperparameters, saved to OUTPUT_RELATIONS_PATH)
print ("Training semantic relation embeddings...")
semantic_relations_model = train_word2vec_embeddings(semantic_relations_sentences, EMBEDDING_DIM, WINDOW_SIZE, MIN_COUNT, SG, OUTPUT_RELATIONS_PATH)

# Create and save vocabularies, one file per embedding model
create_vocabulary(all_semantic_types, VOCAB_SEMANTIC_TYPES_PATH)
create_vocabulary(all_relations, VOCAB_RELATIONS_PATH)


print("All training finished")