In [None]:
!pip install rank_bm25
!pip install stop_words
import pandas as pd
import nltk
import re
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stop_words
  Building wheel for stop_words (setup.py) ... [?25l[?25hdone
  Created wheel for stop_words: filename=stop_words-2018.7.23-py3-none-any.whl size=32910 sha256=2e7b67a0b69528b020fac384f83b9678c11d3b2a1cfdb78b07f82a0b3c4aeee6
  Stored in directory: /root/.cache/pip/wheels/da/d8/66/395317506a23a9d1d7de433ad6a7d9e6e16aab48cf028a0f60
Successfully built stop_words
Installing collected packages: stop_words
Successfully installed stop_words-2018.7.23


In [None]:
# Read the train and test review CSVs and stack them row-wise into one frame
# whose index is renumbered from 0 (the per-file indices are discarded).
drug_reviews_train = pd.read_csv('drugsComTrain_raw.csv')
drug_reviews_test = pd.read_csv('drugsComTest_raw.csv')
drug_reviews = pd.concat(
    [drug_reviews_train, drug_reviews_test],
    axis=0,
    ignore_index=True,
)

# Parallel lists: element k of each list comes from row k of drug_reviews.
reviews_list = drug_reviews['review'].tolist()
reviews_id = drug_reviews['uniqueID'].tolist()

# The lexicon is read as tab-separated with no header row, then given
# explicit column names.
adr_lexicon = pd.read_csv('ADR_lexicon.txt', sep='\t', header=None)
adr_lexicon.columns = ["UMLS_ID", "condition", "source"]

# Parallel lists: element k of each list comes from row k of adr_lexicon.
adr_list = adr_lexicon['condition'].tolist()
adr_id = adr_lexicon['UMLS_ID'].tolist()

In [None]:
def preprocessing(content, remove_sw):
    """Turn a text into a list of lowercase, lemmatized word tokens.

    Steps, in order:

    1. Lowercase the text.
    2. Delete every character that is not ``a-z`` or whitespace. This removes
       punctuation *and* digits, so a string such as ``$70,000`` disappears
       entirely instead of being split into ``70`` and ``000``.
    3. Tokenize the remainder into runs of word characters (1-grams).
    4. Optionally drop English stop words.
    5. Lemmatize each remaining token with ``WordNetLemmatizer`` (called
       without a part-of-speech argument).

    Parameters
    ----------
    content : str
        Raw text to normalise.
    remove_sw : bool
        When truthy, tokens present in ``get_stop_words('english')`` are
        removed before lemmatization.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.

    Notes
    -----
    * ``WordNetLemmatizer`` needs the NLTK ``wordnet`` corpus. Nothing in the
      visible notebook downloads it -- TODO confirm it is installed on a fresh
      kernel (``nltk.download('wordnet')``).
    * NOTE(review): apostrophes are stripped in step 2, so any stop-word entry
      that contains one (e.g. "don't") presumably never matches the
      corresponding token ("dont") -- verify against the stop-word list.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    non_letters = re.compile(r'[^a-z\s]+')
    content = non_letters.sub('', content.lower())

    # https://www.adamsmith.haus/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    # Tokenize into runs of word characters (which are the 1-grams).
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    one_grams = tokenizer.tokenize(content)

    if remove_sw:
        # Fetch the list once per call (not once per token) and use a set so
        # each membership test is O(1).
        stop_words = frozenset(get_stop_words('english'))
        one_grams = [token for token in one_grams if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in one_grams]

In [None]:
# Tokenize every lexicon term and every review, with stop-word removal on.
preprocessed_adr = [preprocessing(term, remove_sw=True) for term in adr_list]
preprocessed_reviews = [preprocessing(text, remove_sw=True) for text in reviews_list]

# ID -> token-list lookups. If an ID occurs more than once, the entry that
# appears last in the list overwrites the earlier ones.
ADRs = {umls_id: tokens for umls_id, tokens in zip(adr_id, preprocessed_adr)}
reviews = {
    review_id: tokens
    for review_id, tokens in zip(reviews_id, preprocessed_reviews)
}

In [None]:
# BM25: index the tokenized ADR terms, then rank them against a single review.
bm25 = BM25Okapi(preprocessed_adr)

# The query is the token list of the review at position 6; the result is drawn
# from adr_list, i.e. the raw (un-preprocessed) lexicon strings.
bm25_query = preprocessed_reviews[6]
bm25_top = bm25.get_top_n(bm25_query, adr_list, n=5)
print(bm25_top)

['positive pregnancy tests', 'negative pregnancy test', 'pregnancy test false positive', 'false positive pregnancy test', 'ectromelia two limbs']


In [None]:
# TF-IDF: rank the ADR terms against one review by cosine similarity.
# NOTE(review): this cell fits and queries on the raw strings (adr_list,
# reviews_list) and relies on TfidfVectorizer's own tokenization, whereas the
# BM25 cell works on the output of preprocessing() -- confirm this difference
# is intended before comparing the two rankings.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(adr_list)

# Project the review at position 6 into the vocabulary learned from the ADR terms.
query_tfidf = vectorizer.transform([reviews_list[6]])

# One similarity value per ADR term, flattened to a 1-D array.
cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

# (row index, score) pairs ordered from highest to lowest score; the sort is
# stable, so equal scores keep their lexicon order.
document_scores = sorted(
    enumerate(cosine_similarities),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the five best matches. The label is the 1-based row position in the
# lexicon; the text shown is the preprocessed token list for that row.
for i, score in document_scores[:5]:
    print(f"ADR {i+1}: {preprocessed_adr[i]}, Score: {score:.2f}")

ADR 7446: ['negative', 'pregnancy', 'test'], Score: 0.37
ADR 12985: ['haemorrhage', 'pregnancy'], Score: 0.36
ADR 751: ['pregnancy'], Score: 0.35
ADR 12556: ['blackout'], Score: 0.33
ADR 313: ['pregnancy', 'test', 'false', 'positive'], Score: 0.33
