In [26]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
from nltk import sent_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK stopword corpus; NLTK reports "already up-to-date" when it is
# present locally.
download('stopwords')
# Keep the English stopwords in a set: sent_preprocess tests membership once per
# token, and a set lookup avoids the linear scan a list would need.
stop_words = set(stopwords.words('english'))
# Shared Porter stemmer instance used by sent_preprocess.
porter = PorterStemmer()

def sent_preprocess(sentence):
    """Turn one sentence into a list of stemmed, stopword-free tokens.

    The sentence is lower-cased and split on whitespace. Every token that is
    not in ``stop_words`` is passed through the Porter stemmer. No punctuation
    handling is done here, so punctuation stays attached to its token.

    Args:
        sentence: Text of a single sentence.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # NOTE(review): these stems are later matched against a pretrained
    # word2vec vocabulary; stemmed forms may be out of vocabulary — confirm.
    stemmed_tokens = []
    for token in sentence.lower().split():
        if token in stop_words:
            continue
        stemmed_tokens.append(porter.stem(token))
    return stemmed_tokens

def para_preprocess(paragraph):
    """Split a paragraph into sentences and preprocess each one.

    Args:
        paragraph: Text that may contain several sentences.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the token list produced by ``sent_preprocess``.
    """
    # NOTE(review): sent_tokenize presumably needs NLTK's punkt tokenizer data,
    # which the visible cells do not download — confirm it is installed.
    sentences = sent_tokenize(paragraph)
    return list(map(sent_preprocess, sentences))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\quan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' embeddings through gensim's
# downloader; `model` is later handed to WmdSimilarity.
# NOTE(review): presumably a large download on first run — confirm before a
# full Restart & Run All.
model = api.load('word2vec-google-news-300')

In [28]:
import json

# Read the cleaned records. The encoding is given explicitly because open()
# otherwise falls back to the platform default, which is not UTF-8 everywhere.
# Assumes the file is UTF-8 encoded (the usual encoding for JSON) — TODO confirm.
with open('../cleaned_data/splm_cleaned.json', encoding='utf-8') as f:
    data = json.load(f)

# One lower-cased text per record, built from its 'overview' and 'uses' fields.
# The fields are joined with a space so the last word of 'overview' is not
# fused with the first word of 'uses', and null fields are skipped so they do
# not contribute a literal "none" token.
descriptions = [
    ' '.join(str(d[key]) for key in ('overview', 'uses') if d[key] is not None).lower()
    for d in data
]

# Each description becomes a list of sentences, each a list of stemmed tokens.
processed_descriptions = [para_preprocess(d) for d in descriptions]

In [29]:
from gensim.similarities import WmdSimilarity

# One WmdSimilarity index per description, built over that description's
# tokenized sentences with the loaded word2vec model.
wmd_instances = [
    WmdSimilarity(sentences, model, num_best=4)
    for sentences in processed_descriptions
]

In [30]:
# Free-text query, run through the same preprocessing as the description sentences.
query = sent_preprocess('Treatment of acne itchy skin')
# Similarities at or below this value are ignored when averaging.
threshold = 0.43

In [31]:
import numpy as np

# Score every description: the mean of the similarities returned for the query
# (presumably one per sentence of the description), counting only those that
# exceed `threshold`.
avg_sims = []
for wmd_index in wmd_instances:
    sentence_sims = wmd_index.get_similarities(query)
    relevant = sentence_sims[sentence_sims > threshold]
    # Nothing above the threshold -> the description scores 0.
    avg_sims.append(np.mean(relevant) if len(relevant) > 0 else 0)

# Description indices ordered from highest to lowest score.
sorted_indices = np.argsort(avg_sims)[::-1]

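# NOTE: disabled multiprocessing variant, kept for reference only. As written it
# would not work if re-enabled: starmap hands calculate_avg_sim a whole chunk of
# instances where it expects a single instance, and the flattening step iterates
# over each result even though calculate_avg_sim returns a scalar.
# import numpy as np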
# import multiprocessing

# # Define a function to calculate average similarities for a single instance
# def calculate_avg_sim(instance, query, threshold):
#     sims = instance.get_similarities(query)
#     sims = sims[sims > threshold]
#     if len(sims) > 0:
#         return np.mean(sims)
#     else:
#         return 0

# num_cores = multiprocessing.cpu_count() - 1
# num_batch = len(wmd_instances) // num_cores
# instance_chunks = [wmd_instances[i:i + num_batch] for i in range(0, len(wmd_instances), num_batch)]

# # Create a pool of processes
# pool = multiprocessing.Pool(processes=num_cores)
# results = pool.starmap(calculate_avg_sim, [(chunk, query, threshold) for chunk in instance_chunks])
# pool.close()
# pool.join()

# avg_sims = [sim for chunk in results for sim in chunk]

# sorted_indices = np.argsort(avg_sims)[::-1]

In [32]:
# Persist the ranked matches, best score first: each entry pairs the full
# record with its averaged similarity.
with open('results.json', 'w') as f:
    ranked_results = []
    for i in sorted_indices:
        ranked_results.append({'description': data[i], 'similarity': avg_sims[i]})
    json.dump(ranked_results, f, indent=4)