In [1]:
# Importing all necessary modules
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
# Read the data from json file.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('splm_cleaned.json', encoding='utf-8') as f:
    data = json.load(f)

# One lowercase overview string per record. str() tolerates non-string values
# (a JSON null, for instance, becomes the literal text "none").
descriptions = [str(d['overview']).lower() for d in data]

def tokenize_lower_and_remove_punctuation(text):
    """Split ``text`` into lowercase, purely alphanumeric tokens.

    The text is tokenized with NLTK's ``word_tokenize``; every token for which
    ``str.isalnum()`` is false (punctuation, hyphenated words, ...) is dropped
    and the remaining tokens are lowercased.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings, in their original order.
    """
    cleaned = []
    for tok in word_tokenize(text, preserve_line=False):
        if tok.isalnum():
            cleaned.append(tok.lower())
    return cleaned

# Step 1: Preprocess and vectorize the description paragraphs
# descriptions = [
#                 'Skip-Gram predicts context words from a target word, and CBOW predicts a target word based on its context. While Skip-Gram frequently performs better for infrequent words, CBOW is faster and typically performs better with frequent words.',
#                 'Using a large corpus, word embeddings are trained by modifying vector representations in response to how well the model predicts target or context words.',
#                 'The similarity between two vectors in an inner product space is measured by cosine similarity. It finds whether two vectors are roughly pointing in the same direction by measuring the cosine of the angle between them. In text analysis, it is frequently used to gauge document similarity.'
#             ]

# Tokenize every description with NLTK, keeping only lowercase alphanumeric tokens.
# Simpler alternative kept for reference (lowercase + whitespace split only):
# descriptions = [d.lower().split() for d in descriptions]
descriptions = list(map(tokenize_lower_and_remove_punctuation, descriptions))

# Remove stopwords
# stop_words = set(stopwords.words('english'))
# descriptions = [[token for token in d if token not in stop_words] for d in descriptions]


# print(descriptions)

In [5]:
# Train a Word2Vec model (sg=1 selects skip-gram; min_count=1 keeps every token
# in the vocabulary).
model = Word2Vec(sentences=descriptions, vector_size=200, window=5, min_count=1, workers=4, sg=1)

# Create a dictionary mapping words to their vector representations
word_vectors = {word: model.wv[word] for word in model.wv.index_to_key}

# Create a vector representation for each description paragraph: the mean of
# its word vectors. A description without any in-vocabulary token (e.g. an
# empty overview) gets a zero vector instead: np.mean([]) would return a scalar
# NaN, leaving the list ragged and breaking cosine_similarity later on.
description_vectors = [
    np.mean(known, axis=0) if known else np.zeros(model.wv.vector_size, dtype=np.float32)
    for known in (
        [word_vectors[word] for word in desc if word in word_vectors]
        for desc in descriptions
    )
]

# Step 2: Preprocess and vectorize the input keywords
def process_keywords(keywords):
    """Tokenize a free-text keyword query exactly like the descriptions.

    The descriptions are tokenized with
    ``tokenize_lower_and_remove_punctuation``, so the vocabulary only contains
    lowercase alphanumeric tokens. Reusing that function here (rather than a
    plain whitespace split) keeps a query such as "fever, cough" from producing
    tokens like "fever," that can never match the vocabulary.

    Args:
        keywords: The query string.

    Returns:
        A list of lowercase alphanumeric token strings.
    """
    return tokenize_lower_and_remove_punctuation(keywords)

def vectorize_keywords(keywords, vectors=None):
    """Average the embeddings of the in-vocabulary keywords into one vector.

    Args:
        keywords: Iterable of keyword tokens.
        vectors: Optional mapping from word to embedding. Defaults to the
            notebook-level ``word_vectors`` dictionary.

    Returns:
        The element-wise mean of the embeddings of all known keywords.

    Raises:
        ValueError: If none of the keywords has an embedding. Without this
            check ``np.mean`` of an empty list would return NaN and the
            similarity search would fail later with an unrelated error.
    """
    if vectors is None:
        vectors = word_vectors
    known = [vectors[word] for word in keywords if word in vectors]
    if not known:
        raise ValueError(f"None of the keywords are in the vocabulary: {keywords!r}")
    return np.mean(known, axis=0)

# Step 3: Compare the vector of the input keywords to the vectors of the description paragraphs using Cosine similarity metric
def find_best_match(keywords_vector):
    """Return the index of the description most similar to the query vector.

    Args:
        keywords_vector: A single query embedding.

    Returns:
        The position in ``description_vectors`` with the highest cosine
        similarity to ``keywords_vector``.
    """
    query_matrix = [keywords_vector]  # cosine_similarity expects 2-D input
    scores = cosine_similarity(query_matrix, description_vectors)
    return np.argmax(scores)

# Step 4: Return the description paragraph that has the highest similarity score
def find_object(keywords):
    """Find the description that best matches a free-text keyword query.

    The query is tokenized, its token embeddings are averaged, and the result
    is compared with every description vector. The tokens and the sizes of the
    vectors involved are printed as a sanity check.

    Args:
        keywords: The query string.

    Returns:
        The index of the best-matched paragraph.
    """
    tokens = process_keywords(keywords)
    print("Keywords: ", tokens)
    query_vector = vectorize_keywords(tokens)
    winner = find_best_match(query_vector)
    print("Description vector: ", len(description_vectors))
    print("Keywords vector: ", len(query_vector))
    return winner

# Run a sample query; res is the index returned by find_object for the best-matched description
res = find_object("fever cough swelling")

Keywords:  ['fever', 'cough', 'swelling']
Description vector:  1293
Keywords vector:  200


In [6]:
# Pretty-print the record at index res (the best match) as indented JSON
print(json.dumps(data[res], indent=4))

{
    "name": "CEREUS",
    "overview": "Cereus is an herb. People use the flower, stem, and young shoots for medicine. Cereus is used for chest pain (angina), fluid retention associated with weak heart function (heart failure), and as a heart stimulant. Cereus is also used for bladder infections and other urinary tract problems, bleeding, and shortness of breath. Women use it for painful or heavy menstrual periods. Cereus is sometimes applied directly to the skin for joint pain.",
    "uses": "Insufficient Evidence for Chest pain (angina). Fluid retention due to heart failure. Heavy menstrual pain and bleeding. Urinary tract problems. Bleeding. Shortness of breath. Joint pain, when applied to the skin. Other conditions. More evidence is needed to rate the effectiveness of cereus for these uses.",
    "side_effects": "Cereus seems safe for most people, when used for conditions other than heart disease. But it\u2019s UNSAFE to use cereus for a heart condition, except under the direct su

In [44]:
# Train a second skip-gram model with 250-dimensional vectors and persist it to disk.
model = Word2Vec(
    sentences=descriptions,
    vector_size=250,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
model.save("splm_word2vec.model")

In [47]:
from collections import Counter

# Load the model
model = Word2Vec.load("splm_word2vec.model")

# Top 10 most similar words to "fever"
sims = model.wv.most_similar("fever", topn=10)
print(sims)

# Create a dictionary mapping words to their vector representations
word_vectors = {word: model.wv[word] for word in model.wv.index_to_key}

# Vectorize each word in the description paragraphs.
# `descriptions` already holds token lists (it was tokenized before the model
# was trained on it), so iterate the tokens directly; passing a list to
# word_tokenize would raise a TypeError.
description_word_vectors = [
    [word_vectors[word] for word in tokens if word in word_vectors]
    for tokens in descriptions
]  # shape: (num_descriptions, num_words, vector_size)

def vectorize_keywords(keywords):
    """Look up one embedding per in-vocabulary token of a query string.

    Args:
        keywords: The query string; it is lowercased and tokenized with
            NLTK's ``word_tokenize``.

    Returns:
        A list with the ``word_vectors`` entry of every token that is in the
        vocabulary, in query order. Unknown tokens are skipped.
    """
    found = []
    for token in word_tokenize(keywords.lower()):
        if token in word_vectors:
            found.append(word_vectors[token])
    return found

def find_best_match(keywords_vector, candidates=None):
    """Find the description whose words are most similar to the keywords.

    Both the keywords and each description are given as lists of word
    vectors. Each list is mean-pooled into a single vector and the pooled
    vectors are compared with cosine similarity.

    Args:
        keywords_vector: List of word vectors for the query,
            shape (num_keywords, vector_size).
        candidates: Optional list of per-description word-vector lists.
            Defaults to the notebook-level ``description_word_vectors``.

    Returns:
        A ``(best_match_index, best_similarity)`` tuple. When nothing can be
        scored (no keyword vectors, no candidates, or only empty/zero
        vectors) the result is ``(0, 0.0)``.
    """
    if candidates is None:
        candidates = description_word_vectors
    if len(keywords_vector) == 0 or len(candidates) == 0:
        return 0, 0.0

    query = np.mean(np.asarray(keywords_vector, dtype=float), axis=0)
    query_norm = np.linalg.norm(query)

    # -inf marks candidates that cannot be scored, so they never win over a
    # real (possibly negative) similarity.
    similarities = np.full(len(candidates), -np.inf)
    if query_norm > 0:
        for i, word_vecs in enumerate(candidates):
            if len(word_vecs) == 0:
                continue
            pooled = np.mean(np.asarray(word_vecs, dtype=float), axis=0)
            pooled_norm = np.linalg.norm(pooled)
            if pooled_norm > 0:
                similarities[i] = np.dot(query, pooled) / (query_norm * pooled_norm)

    best_match_index = int(np.argmax(similarities))
    if not np.isfinite(similarities[best_match_index]):
        return 0, 0.0
    return best_match_index, float(similarities[best_match_index])

def find_object(keywords):
    """Match a keyword query against the descriptions at word level.

    Args:
        keywords: The query string.

    Returns:
        The ``(index, similarity)`` pair produced by ``find_best_match`` for
        the query's word vectors. The word vectors are also printed.
    """
    word_level_vectors = vectorize_keywords(keywords)  # shape: (num_keywords, vector_size)
    print("Keywords vector: ", word_level_vectors)
    index, score = find_best_match(word_level_vectors)
    return index, score

# Run a sample query; keep the returned index and discard the similarity score
res, _ = find_object("fever cough")

[('asthma', 0.9841917753219604), ('cough', 0.9793998003005981), ('constipation', 0.9775192737579346), ('diarrhea', 0.9773197770118713), ('arthritis', 0.972391664981842), ('hay', 0.9705657958984375), ('gallbladder', 0.9704510569572449), ('colds', 0.9701707363128662), ('migraine', 0.9696513414382935), ('rheumatoid', 0.9676762819290161)]


In [None]:
# Pretty-print the record at index res as indented JSON
print(json.dumps(data[res], indent=4))