In [10]:
# Symptom categories, in the order they are processed.
categories = [
    "Head and Neck",
    "Respiratory",
    "Cardiovascular",
    "Gastrointestinal",
    "Neurological",
    "Genitourinary",
    "Skin",
    "General",
]

import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
import json

# Build the medspaCy pipeline and show which components it contains.
nlp = medspacy.load()
print(nlp.pipe_names)

# Read the symptom rule definitions; the top-level keys are category names.
with open("symp_rules.json", "r") as f:
    symp_rules_json = json.load(f)

# One TargetRule per rule entry, visiting categories in the order given by
# `categories`. The 'pattern' key is optional: when it is absent, None is
# passed, which is assumed to match TargetRule's own default for `pattern`
# (confirm against the installed medspaCy version).
target_rules = [
    TargetRule(entry['phrase'], entry['type'], pattern=entry.get('pattern'))
    for group in categories
    for entry in symp_rules_json[group]
]

# Register every rule with the pipeline's target matcher component.
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_matcher.add(target_rules)

# Load the cleaned splm records; the text mined for symptoms is each
# record's 'overview' followed by its 'uses'.
with open("../cleaned_data/splm_cleaned.json", "r", encoding="utf-8") as f:
    splms = json.load(f)

# Join the two fields with a space so the last word of 'overview' and the
# first word of 'uses' are not fused into a single token.
splms = [f"{splm['overview']} {splm['uses']}" for splm in splms]

# NOTE: the (misspelled) name is kept because other cells may refer to it.
unqiue_symptoms = set()

# Run every text through the pipeline and collect the lower-cased text of each
# entity labelled SYMPTOM. nlp.pipe streams the texts through the pipeline
# instead of invoking it once per document. The output file is deliberately
# not opened here, so a failure during extraction cannot leave it truncated.
for doc in nlp.pipe(splms):
    for ent in doc.ents:
        if ent.label_ == "SYMPTOM":
            unqiue_symptoms.add(ent.text.lower())

# From here on this is an alphabetically sorted list rather than a set.
unqiue_symptoms = sorted(unqiue_symptoms)

# Write one symptom per line.
with open("../output/symptoms.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{symptom}\n" for symptom in unqiue_symptoms)

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']


In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Word2Vec model
word2vec_model = Word2Vec.load("../word2vec/splm_word2vec.model")

# Target combination of words
target_combination = "shortness of breath"

# Tokenize the target combination
target_tokens = target_combination.split()

# Only tokens present in the model's vocabulary have a vector; looking up an
# unknown token would raise KeyError, so those are reported and skipped.
known_tokens = [token for token in target_tokens if token in word2vec_model.wv]
skipped_tokens = [token for token in target_tokens if token not in word2vec_model.wv]
if skipped_tokens:
    print(f"Skipping out-of-vocabulary tokens: {skipped_tokens}")
if not known_tokens:
    raise ValueError(
        f"None of the tokens in {target_combination!r} are in the Word2Vec vocabulary"
    )

# Convert tokens to word vectors
target_vectors = [word2vec_model.wv[token] for token in known_tokens]

# Combine vectors of individual tokens by averaging
combined_vector = sum(target_vectors) / len(target_vectors)

# Compute similarity scores against every word vector in the model
similarity_scores = cosine_similarity([combined_vector], word2vec_model.wv.vectors)

# Vocabulary indices ordered from most to least similar
similar_combinations_indices = similarity_scores.argsort(axis=1)[:, ::-1]

# The query vector is an average of several word vectors, so the top-ranked
# entry is not "the query itself" and must not be dropped blindly. Instead,
# leave out exactly the words that make up the query and print the rest in
# ranked order.
query_tokens = set(target_tokens)
for index in similar_combinations_indices[0]:
    word = word2vec_model.wv.index_to_key[index]
    if word not in query_tokens:
        print(word)
