In [7]:
knowledge_base_path = '/home/stirunag/work/github/source_data/knowledge_base/'

disease_path = knowledge_base_path+'umls-2022AB-full/'



In [8]:
import csv
import gzip
from tqdm import tqdm

resources = [
    'HL7V2.5', 'ICD10AM', 'ICD10AMAE', 'LCH', 'MTHICPC2ICD10AE', 'MTHMST',
    'NCI_RENI', 'SNM', 'SNMI', 'SNOMEDCT_VET', 'FMA', 'GO', 'ICD10', 'ICD10AE',
    'ICD10CM', 'ICD9CM', 'LNC', 'MDR', 'MEDLINEPLUS', 'MSH', 'MTH', 'MTHICD9',
    'NCI', 'NCI_BRIDG', 'NCI_CDISC', 'NCI_CTCAE', 'NCI_CTEP-SDC', 'NCI_FDA',
    'NCI_NCI-GLOSS', 'NDFRT', 'OMIM', 'SNOMEDCT_US', 'WHO'
]

def modify_term(term):
    replacements = [
        '-- ',
        ' (physical finding)', ' (diagnosis)', ' (disorder)', ' (procedure)', ' (finding)',
        ' (symptom)', ' (history)', ' (treatment)', ' (manifestation)', ' [Disease/Finding]',
        ' (morphologic abnormality)', ' (etiology)', ' (observable entity)', ' (event)',
        ' (situation)', ' (___ degrees)', ' (in some patients)', ' (___ cm)', ' (___ mm)',
        ' (#___)', ' (rare)', ' (___ degree.)', ' (including anastomotic)', ' (navigational concept)',
        ' (___cm)', ' (1 patient)', ' (qualifier value)', ' (lab test)', ' (unintentional)',
        ' (tophi)', ' (NOS)', ' (___ msec)', ' (RENI)', ' (less common)', ' [as symptom]', ' (s)'
    ]
    for replacement in replacements:
        term = term.replace(replacement, '')
    term = term.replace('-', ' ')
    return term.lower()

def is_required_category(category):
    required_categories = ["T020", "T190", "T049", "T019", "T047", "T050", "T033", "T037", "T048", "T191", "T046", "T184"]
    return category in required_categories

def extract_terms_and_ids_from_umls(input_files, mrsty_file):
    """
    Extract terms and IDs from the provided UMLS into a dictionary.

    Args:
        input_files (list): List of paths to the UMLS files.
        mrsty_file (str): Path to the MRSTY file.

    Returns:
        dict: Dictionary where keys are terms and values are IDs.
    """

    term_to_id = {}

    print("Processing terms from UMLS..")

    # Read each row in MRSTY file and check for the required category
    interested_ids = set()
    with gzip.open(mrsty_file, 'rt') as file:
        reader = csv.reader(file, delimiter='|')
        for row in reader:
            if is_required_category(row[1]):
                interested_ids.add(row[0])

    # Read each row in input_files
    for filename in input_files:
        with gzip.open(filename, 'rt') as file:
            reader = csv.reader(file, delimiter='|')
            for row in tqdm(reader, desc=f"Processing {filename}"):
                if len(row) > 16:  # Ensure there are enough columns in the row
                    if (row and len(row[14]) > 3 and row[1] == "ENG" and row[16] != "0" and
                            row[11] in resources and row[0] in interested_ids):
                        term = modify_term(row[14])
                        term_to_id[term] = row[0]

    return term_to_id

In [9]:
input_files = [disease_path + 'MRCONSO.RRF.aa.gz', disease_path + 'MRCONSO.RRF.ab.gz']
mrsty_file = disease_path + "MRSTY.RRF.gz"

term_id_dict = extract_terms_and_ids_from_umls(input_files, mrsty_file)

Processing terms from UMLS..


Processing /home/stirunag/work/github/source_data/knowledge_base/umls-2022AB-full/MRCONSO.RRF.aa.gz: 8607058it [00:18, 475007.92it/s]
Processing /home/stirunag/work/github/source_data/knowledge_base/umls-2022AB-full/MRCONSO.RRF.ab.gz: 8154375it [00:19, 424103.21it/s]


In [10]:
term_id_dict

{'abdomen, acute': 'C0000727',
 'abdomens, acute': 'C0000727',
 'acute abdomen': 'C0000727',
 'acute abdomens': 'C0000727',
 'acute abdomen, nos': 'C0000727',
 'abdomen   acute': 'C0000727',
 'syndrome abdominal acute': 'C0000727',
 'abdominal syndrome acute': 'C0000727',
 'acute abdominal pain syndrome': 'C0000727',
 'acute abdominal pain syndrome, nos': 'C0000727',
 '[d]acute abdomen': 'C0000727',
 '[d]acute abdomen (context dependent category)': 'C0000727',
 'surgical abdomen': 'C0000727',
 'abdominal cramps': 'C0000729',
 'abdominal cramp': 'C0000729',
 'cramp abdominal': 'C0000729',
 'cramp, abdominal': 'C0000729',
 'cramps, abdominal': 'C0000729',
 'abdominal crampy pains': 'C0000729',
 'griping abdominal': 'C0000729',
 '[d]abdominal cramps': 'C0000729',
 'griping abdomen': 'C0000729',
 '[d]abdominal cramps (context dependent category)': 'C0000729',
 'abdomen distended': 'C0000731',
 'distended abdomen': 'C0000731',
 'abdominal distention': 'C0000731',
 'abdominal distension': 'C

In [76]:
import pickle
import numpy as np
import spacy
import faiss
from tqdm import tqdm
import warnings
import gc
from rapidfuzz import process, fuzz
from sklearn.metrics.pairwise import cosine_similarity
from scispacy.abbreviation import AbbreviationDetector

warnings.simplefilter("ignore")

# Function to create quantized Faiss index
def create_quantized_index(embeddings_np, d):
    nlist = 1000
    m = 30
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
    index.train(embeddings_np)
    return index

# Function to calculate mean vector from the model's vocabulary
def calculate_mean_vector(model):
    vectors = [word.vector for word in model.vocab if word.has_vector]
    if vectors:
        mean_vector = np.mean(vectors, axis=0)
        return mean_vector
    else:
        return np.zeros((300,))

# Function to get average embeddings for terms in batches using the user's model, with mean vector fallback
def get_average_embeddings_batched(terms, model, mean_vector):
    docs = list(model.pipe(terms))
    embeddings = []
    for doc in docs:
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector.shape[0] == 300]
        if len(valid_vectors) == 0:
            embeddings.append(mean_vector)
        else:
            average_embedding = np.mean(valid_vectors, axis=0)
            embeddings.append(average_embedding)
    return embeddings

# Preprocessing and indexing function to create Faiss index and save necessary data
def preprocess_and_index(term_id_dict, output_pickle_filename, output_list, faiss_index_filename, model_path, batch_size=10000):
    # Load your custom embedding model
    nlp_model = spacy.load(model_path)
    mean_vector = calculate_mean_vector(nlp_model)

    embeddings = []
    indexed_terms = []

    terms = list(term_id_dict.keys())
    ids = list(term_id_dict.values())

    for idx in tqdm(range(0, len(terms), batch_size), desc="Generating Embeddings"):
        term_batch = terms[idx: idx + batch_size]
        id_batch = ids[idx: idx + batch_size]

        batch_embeddings = get_average_embeddings_batched(term_batch, nlp_model, mean_vector)

        for term, term_id, embedding in zip(term_batch, id_batch, batch_embeddings):
            norm = np.linalg.norm(embedding)
            if norm == 0:
                print(f"Term '{term}' with ID '{term_id}' has a zero vector.")
            normalized_embedding = embedding if norm == 0 else embedding / norm
            embeddings.append(normalized_embedding)
            indexed_terms.append(term)
        gc.collect()

    d = 300
    embeddings_np = np.array(embeddings).astype('float32')
    index = create_quantized_index(embeddings_np, d)
    index.add(embeddings_np)
    del embeddings, embeddings_np
    gc.collect()

    print("Saving quantized faiss index...")
    faiss.write_index(index, faiss_index_filename)

    print("Saving term to ID mapping and indexed terms...")
    with open(output_pickle_filename, "wb") as outfile:
        pickle.dump({"term_to_id": term_id_dict, "indexed_terms": indexed_terms}, outfile)

    print("Writing terms to a txt file...")
    with open(output_list, "w") as txt_file:
        for term in term_id_dict.keys():
            txt_file.write(term + "\n")

# Function to load Faiss index and mappings
def load_faiss_and_mappings(faiss_index_filename, pickle_filename):
    index = faiss.read_index(faiss_index_filename)
    with open(pickle_filename, "rb") as infile:
        data = pickle.load(infile)
    return index, data["term_to_id"], data["indexed_terms"]

# Functions for exact, fuzzy, and embedding-based matching
def get_exact_match(term, term_dict):
    return term_dict.get(term)

def get_fuzzy_match(term, term_dict, threshold=70):
    result = process.extractOne(term, term_dict.keys(), scorer=fuzz.ratio)
    if result:
        match, score, _ = result
        if score >= threshold:
            return term_dict[match]
    return None

def get_embedding_match(term, index, indexed_terms, term_dict, model, threshold=0.7):
    term_vector = model(term).vector.reshape(1, -1).astype('float32')
    faiss.normalize_L2(term_vector)
    _, I = index.search(term_vector, 1)
    if I[0][0] != -1:
        matched_term = indexed_terms[I[0][0]]
        similarity = cosine_similarity(term_vector, model(matched_term).vector.reshape(1, -1))[0][0]
        if similarity >= threshold:
            # Ensure that the matched term exists in term_dict
            return term_dict.get(matched_term, "No Match")
    return None

def expand_abbreviations(term, model):
    abbreviation_detector = AbbreviationDetector(model)
    model.add_pipe(abbreviation_detector, last=True)
    doc = model(term)
    expanded_term = []
    for token in doc:
        if token._.long_form:
            expanded_term.append(token._.long_form)
        else:
            expanded_term.append(token.text)
    return " ".join(expanded_term)

def map_terms(entities, term_dict, index, indexed_terms, model, scispacy_model):
    """Map new entities using exact, fuzzy, and embedding matches, with abbreviation fallback."""
    mapped_entities = {}
    for entity in entities:
        # Step 1: Initial matching
        match = get_exact_match(entity, term_dict)
        if not match:
            match = get_fuzzy_match(entity, term_dict)
        if not match:
            match = get_embedding_match(entity, index, indexed_terms, term_dict, model)

        # Step 2: Abbreviation Expansion and Retry if no match
        if not match:
            expanded_entity = expand_abbreviations(entity, scispacy_model)
            if expanded_entity != entity:  # Only retry if abbreviation expanded
                match = get_exact_match(expanded_entity, term_dict)
                if not match:
                    match = get_fuzzy_match(expanded_entity, term_dict)
                if not match:
                    match = get_embedding_match(expanded_entity, index, indexed_terms, term_dict, model)

        mapped_entities[entity] = match if match else "No Match"
    return mapped_entities



In [61]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz

In [62]:
# Step 1: Preprocess and Index
spacy_path =   "/home/stirunag/work/github/CAPITAL/normalisation/en_floret_model" # "en_core_sci_md"en_core_sci_
preprocess_and_index(term_id_dict, "term_mapping.pkl", "indexed_terms.txt", "faiss_index.idx", spacy_path)



Generating Embeddings: 100%|████████████████████████████████████████████████████████████████| 82/82 [04:23<00:00,  3.21s/it]


Saving quantized faiss index...
Saving term to ID mapping and indexed terms...
Writing terms to a txt file...


In [85]:
# Step 3: Perform Mappings
new_entities = ['abdominal lump, nos', 'abdominal tumor', 'unknown term', 'T2DM', "asthma", "acute bronchitis"]
nlp_model = spacy.load(spacy_path) # spacy.load("en_core_sci_lg")  # Correctly load the SciSpaCy model
scispacy_model = spacy.load("en_core_sci_md")  # SciSpaCy model for abbreviation expansion

In [86]:
mapped_entities = map_terms(new_entities, term_dict, index, indexed_terms, nlp_model, scispacy_model)
mapped_entities

{'abdominal lump, nos': 'C0000734',
 'abdominal tumor': 'C0000735',
 'unknown term': 'C3846629',
 'T2DM': 'C0362046',
 'asthma': 'C0004096',
 'acute bronchitis': 'C0149514'}

In [88]:
cui = "C0149514"
# Reverse the dictionary to find terms by their CUI
id_to_term = {v: k for k, v in term_id_dict.items()}
term = id_to_term.get(cui, "Unknown CUI")
print(f"The term associated with '{cui}' is: {term}")

The term associated with 'C0149514' is: acute chest infections


In [78]:
import random

# Original term2dict dictionary for reference
term2dict = {
    'abdominal lump, nos': 'C0000734',
    '[d]abdominal lump': 'C0000734',
    '[d]abdominal mass': 'C0000734',
    'mass in abdomen': 'C0000734',
    'abdominal mass [ambiguous]': 'C0000734',
    '[d]abdominal lump (context dependent category)': 'C0000734',
    '[d]abdominal mass (context dependent category)': 'C0000734',
    'abdominal neoplasms': 'C0000735',
    'abdominal neoplasm': 'C0000735',
    'neoplasm, abdominal': 'C0000735',
}

# Function to create a modified version of a term
def modify_term(term):
    words = term.split()
    
    # Choose a random modification: replace a word, jumble, or introduce a typo
    modification_type = random.choice(['replace', 'jumble', 'typo'])
    
    if modification_type == 'replace' and len(words) > 1:
        # Replace a word with a similar word or synonym
        synonyms = {
            'abdominal': 'belly',
            'mass': 'lump',
            'neoplasms': 'tumors',
            'neoplasm': 'tumor',
            'lump': 'swelling',
            'mass': 'growth',
        }
        word_to_replace = random.choice(words)
        if word_to_replace in synonyms:
            replacement = synonyms[word_to_replace]
            words[words.index(word_to_replace)] = replacement
    
    elif modification_type == 'jumble' and len(words) > 1:
        # Randomly shuffle the words
        random.shuffle(words)
    
    elif modification_type == 'typo':
        # Introduce a small typo in a random word
        word_to_modify = random.choice(words)
        if len(word_to_modify) > 1:
            index = random.randint(0, len(word_to_modify) - 1)
            typo_word = list(word_to_modify)
            typo_word[index] = random.choice('abcdefghijklmnopqrstuvwxyz')
            words[words.index(word_to_modify)] = ''.join(typo_word)
    
    return ' '.join(words)

# Create a test set by modifying the terms in the dictionary
test_set = {}
for term, term_id in term2dict.items():
    modified_term = modify_term(term)
    test_set[modified_term] = term_id

print("Generated Test Set:")
for original, modified in zip(term2dict.keys(), test_set.keys()):
    print(f"Original: {original} --> Modified: {modified}")


Generated Test Set:
Original: abdominal lump, nos --> Modified: abdominal lump, nos
Original: [d]abdominal lump --> Modified: lump [d]abdominal
Original: [d]abdominal mass --> Modified: [d]abdominal mass
Original: mass in abdomen --> Modified: abdomen mass in
Original: abdominal mass [ambiguous] --> Modified: abdominal mass [ambiguzus]
Original: [d]abdominal lump (context dependent category) --> Modified: [d]abdominal lump (context dependent catelory)
Original: [d]abdominal mass (context dependent category) --> Modified: mass dependent [d]abdominal (context category)
Original: abdominal neoplasms --> Modified: abdominal tumors
Original: abdominal neoplasm --> Modified: neoplasm abdominal
Original: neoplasm, abdominal --> Modified: neoplasm, abdominal


In [79]:
# Use the generated test set as input to evaluate performance
test_terms = list(test_set.keys())
mapped_results = map_terms(test_terms, term2dict, index, indexed_terms, nlp_model, scispacy_model)

# Compare the mapped results with the correct IDs from the test set
correct_mappings = 0
total_mappings = len(test_terms)

print("\nMapping Performance:")
for test_term, original_id in test_set.items():
    predicted_id = mapped_results[test_term]
    if predicted_id == original_id:
        correct_mappings += 1
        result = "Correct"
    else:
        result = "Incorrect"
    print(f"Test Term: {test_term} | Predicted ID: {predicted_id} | Expected ID: {original_id} | {result}")

accuracy = correct_mappings / total_mappings * 100
print(f"\nAccuracy: {accuracy:.2f}%")



Mapping Performance:
Test Term: abdominal lump, nos | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: lump [d]abdominal | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: [d]abdominal mass | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: abdomen mass in | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: abdominal mass [ambiguzus] | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: [d]abdominal lump (context dependent catelory) | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: mass dependent [d]abdominal (context category) | Predicted ID: C0000734 | Expected ID: C0000734 | Correct
Test Term: abdominal tumors | Predicted ID: C0000734 | Expected ID: C0000735 | Incorrect
Test Term: neoplasm abdominal | Predicted ID: C0000735 | Expected ID: C0000735 | Correct
Test Term: neoplasm, abdominal | Predicted ID: C0000735 | Expected ID: C0000735 | Correct

Accuracy: 90.00%


In [10]:
import csv
import gzip
from tqdm import tqdm

resources = [
    'HL7V2.5', 'ICD10AM', 'ICD10AMAE', 'LCH', 'MTHICPC2ICD10AE', 'MTHMST',
    'NCI_RENI', 'SNM', 'SNMI', 'SNOMEDCT_VET', 'FMA', 'GO', 'ICD10', 'ICD10AE',
    'ICD10CM', 'ICD9CM', 'LNC', 'MDR', 'MEDLINEPLUS', 'MSH', 'MTH', 'MTHICD9',
    'NCI', 'NCI_BRIDG', 'NCI_CDISC', 'NCI_CTCAE', 'NCI_CTEP-SDC', 'NCI_FDA',
    'NCI_NCI-GLOSS', 'NDFRT', 'OMIM', 'SNOMEDCT_US', 'WHO'
]

def modify_term(term):
    replacements = [
        '-- ', ' (physical finding)', ' (diagnosis)', ' (disorder)', ' (procedure)', ' (finding)',
        ' (symptom)', ' (history)', ' (treatment)', ' (manifestation)', ' [Disease/Finding]',
        ' (morphologic abnormality)', ' (etiology)', ' (observable entity)', ' (event)',
        ' (situation)', ' (___ degrees)', ' (in some patients)', ' (___ cm)', ' (___ mm)',
        ' (#___)', ' (rare)', ' (___ degree.)', ' (including anastomotic)', ' (navigational concept)',
        ' (___cm)', ' (1 patient)', ' (qualifier value)', ' (lab test)', ' (unintentional)',
        ' (tophi)', ' (NOS)', ' (___ msec)', ' (RENI)', ' (less common)', ' [as symptom]', ' (s)'
    ]
    for replacement in replacements:
        term = term.replace(replacement, '')
    term = term.replace('-', ' ')
    return term.lower()

def is_required_category(category):
    required_categories = ["T020", "T190", "T049", "T019", "T047", "T050", "T033", "T037", "T048", "T191", "T046", "T184"]
    return category in required_categories

def extract_terms_and_ids_from_umls(input_files, mrsty_file):
    """
    Extract terms and IDs from the provided UMLS into a dictionary.

    Args:
        input_files (list): List of paths to the UMLS files.
        mrsty_file (str): Path to the MRSTY file.

    Returns:
        dict: Dictionary where keys are terms and values are IDs.
    """

    term_to_id = {}
    abbreviation_to_longform = {}

    print("Processing terms from UMLS..")

    # Read each row in MRSTY file and check for the required category
    interested_ids = set()
    with gzip.open(mrsty_file, 'rt') as file:
        reader = csv.reader(file, delimiter='|')
        for row in reader:
            if is_required_category(row[1]):
                interested_ids.add(row[0])

    # Read each row in input_files
    for filename in input_files:
        with gzip.open(filename, 'rt') as file:
            reader = csv.reader(file, delimiter='|')
            for row in tqdm(reader, desc=f"Processing {filename}"):
                if len(row) > 16:
                    # Check for English terms, preferred terms (PT) and abbreviations (AB)
                    cui = row[0]
                    term = modify_term(row[14])
                    term_type = row[12]
                    if row[1] == "ENG" and row[16] != "0" and row[11] in resources and cui in interested_ids:
                        # Store preferred terms as long forms
                        if term_type == "PT":
                            term_to_id[term] = cui
                        # Store abbreviations and link them to their long forms using the same CUI
                        elif term_type == "AB" and cui in term_to_id:
                            abbreviation_to_longform[term] = term_to_id[cui]

    # Combine term_to_id with abbreviations
    term_to_id.update(abbreviation_to_longform)
    return term_to_id, abbreviation_to_longform




In [11]:
knowledge_base_path = '/home/stirunag/work/github/source_data/knowledge_base/'

disease_path = knowledge_base_path+'umls-2022AB-full/'

In [14]:
input_files = [disease_path + 'MRCONSO.RRF.aa.gz', disease_path + 'MRCONSO.RRF.ab.gz']
mrsty_file = disease_path + "MRSTY.RRF.gz"

term_id_dict, abbr_dict = extract_terms_and_ids_from_umls(input_files, mrsty_file)

Processing terms from UMLS..


Processing /home/stirunag/work/github/source_data/knowledge_base/umls-2022AB-full/MRCONSO.RRF.aa.gz: 8607058it [00:43, 198039.56it/s]
Processing /home/stirunag/work/github/source_data/knowledge_base/umls-2022AB-full/MRCONSO.RRF.ab.gz: 8154375it [00:44, 183224.64it/s]


In [15]:
term_id_dict

{'acute abdomen, nos': 'C0000727',
 'acute abdomen': 'C0000727',
 'abdominal cramp': 'C0000729',
 'abdominal cramps': 'C0000729',
 'abdominal distention': 'C0000731',
 'abdominal distension': 'C0000731',
 'swollen abdomen': 'C0000731',
 'abdomen distention': 'C0000731',
 'abdominal mass': 'C0000734',
 'abdominal mass, nos': 'C0000734',
 'abdominal neoplasms': 'C0000735',
 'abdominal neoplasm': 'C0000735',
 'neoplasm of abdomen': 'C0000735',
 'abdominal pain': 'C0000737',
 'abdominal pain, unspecified site': 'C0000737',
 'unspecified abdominal pain': 'C0000737',
 'abetalipoproteinemia': 'C0000744',
 'abetalipoproteinaemia': 'C0000744',
 'congenital abnormal fusion': 'C0000754',
 'abnormal fusion, congenital': 'C0000754',
 'congenital abnormal fusion, nos': 'C0000754',
 'excessive body weight gain': 'C0000765',
 'body weight gain, excessive': 'C0000765',
 'excessive weight gain': 'C0000765',
 'x ray abnormal': 'C0000766',
 'abnormal x ray': 'C0000766',
 'congenital abnormality': 'C000076

In [17]:
abbr_dict

{}