In [1]:
!pip install rank_bm25
!pip install stop_words

import pandas as pd
import nltk
import re
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('wordnet')

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stop_words
  Building wheel for stop_words (setup.py) ... [?25l[?25hdone
  Created wheel for stop_words: filename=stop_words-2018.7.23-py3-none-any.whl size=32894 sha256=0c9de0a0f9435d23c460dc12d66782eb9fd1010af28eb86559294f89d307b7c5
  Stored in directory: /root/.cache/pip/wheels/8f/a5/51/a5405e1da5d178491b79d12cc81b6cb9bb14fe2c8c632eba70
Successfully built stop_words
Installing collected packages: stop_words
Successfully installed stop_words-2018.7.23


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# get ADR lexicon dataframe
adr_lexicon = pd.read_csv('ADR_lexicon.txt', sep='\t', names=['id', 'reaction', 'source'])
lexicon_list = adr_lexicon.reaction.to_list()

FileNotFoundError: [Errno 2] No such file or directory: 'ADR_lexicon.txt'

In [None]:
# ger reviews daatframe
reviews_df = pd.read_csv('combined_df_1.csv')
reviews_list = reviews_df.text.to_list()
reviews_id = reviews_df.txt_id.to_list()

# 2388 reviews in total

In [None]:
# get list of nan reviews
d = dict(zip(reviews_id,reviews_list))
list_nan = [key for key, value in d.items() if isinstance(value, float)]

In [None]:
# remove nan reviews from review dataframe
reviews_df = reviews_df[~reviews_df['txt_id'].isin(list_nan)]
reviews_list = reviews_df.text.to_list()
reviews_id = reviews_df.txt_id.to_list()

# 2254 after removing nan

In [None]:
# remove nan reviews from adr dataframe
adr_df = pd.read_csv('combined_df_2.csv')
adr_df = adr_df[~adr_df['txt_id'].isin(list_nan)]

In [None]:
# build initial ADR dictionary
ADRs = {}
for i in reviews_id:
    ADRs[i] = adr_df.loc[adr_df['txt_id'] == i]['symptom'].to_list()

In [None]:
# get review IDs that are in review dataframe but not in adr dataframe
no_adr_reviews = [k for k, v in ADRs.items() if v in (None, "", [])]

In [None]:
# remove them
reviews_df = reviews_df[~reviews_df['txt_id'].isin(no_adr_reviews)]
reviews_list = reviews_df.text.to_list()
reviews_id = reviews_df.txt_id.to_list()

# 2058 after removing the not annotated ones

adr_df = adr_df[~adr_df['txt_id'].isin(no_adr_reviews)]

In [None]:
# build final ADR dictionary
ADRs = {}
for i in reviews_id:
    ADRs[i] = adr_df.loc[adr_df['txt_id'] == i]['symptom'].to_list()

In [None]:
def preprocessing(content, remove_sw):
    # convert the text to lowercase
    content = content.lower()

    # remove non-alphabetical characters
    regex = re.compile('[^a-z\s]+')
    content = regex.sub('', content)

    # https://www.adamsmith.haus/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    # remove punctuation and tokenize (which will be the same as 1-grams)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    one_grams = tokenizer.tokenize(content)

    #remove stopwords
    if remove_sw == True:
        one_grams = [i for i in one_grams if i not in get_stop_words('english')]

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    words = []
    for word in one_grams:
        words.append(lemmatizer.lemmatize(word))

    return words

In [None]:
def get_scores(model, lexicon_list=lexicon_list, reviews_list=reviews_list,  threshold_bm25=0.78, threshold_tf_idf=0.32):

    preprocessed_ADRs = [preprocessing(i, remove_sw=True) for i in lexicon_list]
    preprocessed_reviews = [preprocessing(i, remove_sw=True) for i in reviews_list]
    total_no_adrs = len(lexicon_list)

    # choose model
    if model == 'bm25':
        bm25 = BM25Okapi(preprocessed_ADRs)

        precision_list = []
        recall_list = []
        f1_score_list = []
        accuracy_list = []
        no_retrieved_adrs = []

        for i, j in zip(range(len(preprocessed_reviews)), reviews_id):

            # get actual ADRs
            actual_ADRs = ADRs[j]

            #get the scores for every ADRs for this specific review
            score_list = bm25.get_scores(preprocessed_reviews[i])

            # build dataframe with scores
            scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': score_list})

            # remove rows with score=0
            scores_df = scores_df[scores_df['score'] != 0]

            # normalize scores
            scores_df['normalized_score'] = (scores_df['score'] - scores_df.score.min()) / (scores_df.score.max() - scores_df.score.min())

            # keep only scores over thershold
            scores_df = scores_df[scores_df['normalized_score'] > threshold_bm25]
            no_retrieved_adrs.append(len(scores_df))

            # get final list of ADRs obtained from model
            model_ADRs = scores_df.ADR.to_list()

            # compute metrics
            TP = 0
            FP = 0
            for i in model_ADRs:
                if i in actual_ADRs: TP += 1
                else: FP += 1

            FN = len(actual_ADRs) - TP

            no_actual = [i for i in lexicon_list if i not in actual_ADRs]
            TN_list = [i for i in no_actual if i not in model_ADRs]
            TN = len(TN_list)

            recall = TP / (TP + FN)

            if TP + FP != 0: precision = TP / (TP + FP)
            else: precision = 0

            if TP != 0: f1_score = 2 * ((precision * recall) / (precision + recall))
            else: f1_score = 0

            accuracy = (TP + TN) / total_no_adrs

            precision_list.append(precision)
            recall_list.append(recall)
            f1_score_list.append(f1_score)
            accuracy_list.append(accuracy)

        # buid results dataframe
        results_df = pd.DataFrame({'review_id': reviews_id, 'precision': precision_list, 'recall': recall_list,
                                  'f1_score': f1_score_list, 'accuracy': accuracy_list})

        precision = sum(results_df.precision.to_list()) / len(results_df.precision.to_list())
        recall = sum(results_df.recall.to_list()) / len(results_df.recall.to_list())
        f1 = sum(results_df.f1_score.to_list()) / len(results_df.f1_score.to_list())
        accuracy = sum(results_df.accuracy.to_list()) / len(results_df.accuracy.to_list())
        print('Number of ADRs retrieved for each review', no_retrieved_adrs)
        print('Average number of ADRs retrieved:', sum(no_retrieved_adrs)/len(no_retrieved_adrs))

        # return results
        return 'BM25 results: ', 'precision:', precision, 'recall:', recall, 'f1-score:', f1, 'accuracy', accuracy

    elif model == 'tf_idf':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(lexicon_list)

        precision_list = []
        recall_list = []
        f1_score_list = []
        accuracy_list = []
        no_retrieved_adrs = []

        for i, j in zip(range(len(reviews_list)), reviews_id):

            # get actual ADRs
            actual_ADRs = ADRs[j]

            query_tfidf = vectorizer.transform([reviews_list[i]])

            # get scores
            cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

            # build dataframe with scores
            scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': cosine_similarities})

            # keep only scores over thershold
            scores_df = scores_df[scores_df['score'] > threshold_tf_idf]
            no_retrieved_adrs.append(len(scores_df))

            # get final list of ADRs obtained from model
            model_ADRs = scores_df.ADR.to_list()

            # compute metrics
            TP = 0
            FP = 0
            for i in model_ADRs:
                if i in actual_ADRs: TP += 1
                else: FP += 1

            FN = len(actual_ADRs) - TP

            no_actual = [i for i in lexicon_list if i not in actual_ADRs]
            TN_list = [i for i in no_actual if i not in model_ADRs]
            TN = len(TN_list)

            recall = TP / (TP + FN)

            if TP + FP != 0: precision = TP / (TP + FP)
            else: precision = 0

            if TP != 0: f1_score = 2 * ((precision * recall) / (precision + recall))
            else: f1_score = 0

            accuracy = (TP + TN) / total_no_adrs

            precision_list.append(precision)
            recall_list.append(recall)
            f1_score_list.append(f1_score)
            accuracy_list.append(accuracy)

        # buid results dataframe
        results_df = pd.DataFrame({'review_id': reviews_id, 'precision': precision_list, 'recall': recall_list,
                                  'f1_score': f1_score_list, 'accuracy': accuracy_list})

        precision = sum(results_df.precision.to_list()) / len(results_df.precision.to_list())
        recall = sum(results_df.recall.to_list()) / len(results_df.recall.to_list())
        f1 = sum(results_df.f1_score.to_list()) / len(results_df.f1_score.to_list())
        accuracy = sum(results_df.accuracy.to_list()) / len(results_df.accuracy.to_list())
        print('Number of ADRs retrieved for each review', no_retrieved_adrs)
        print('Average number of ADRs retrieved:', sum(no_retrieved_adrs)/len(no_retrieved_adrs))

        # return results
        return 'TF-IDF results: ', 'precision:', precision, 'recall:', recall, 'f1-score:', f1, 'accuracy', accuracy

In [None]:
print(get_scores('tf_idf'))

Number of ADRs retrieved for each review [3, 1, 1, 13, 3, 6, 7, 2, 5, 215, 0, 2, 3, 2, 7, 5, 1, 26, 1, 19, 8, 0, 10, 3, 3, 14, 15, 1, 18, 3, 2, 1, 3, 40, 0, 1, 1, 4, 7, 29, 0, 15, 24, 16, 26, 1, 5, 0, 0, 3, 38, 1, 6, 4, 2, 0, 1, 0, 2, 0, 2, 25, 11, 0, 6, 0, 0, 0, 0, 0, 36, 1, 0, 3, 2, 5, 6, 0, 22, 18, 4, 1, 2, 15, 0, 0, 20, 3, 0, 1, 8, 5, 4, 1, 2, 2, 12, 0, 21, 4, 3, 2, 5, 14, 0, 2, 2, 0, 3, 2, 2, 1, 0, 2, 0, 8, 1, 2, 10, 3, 2, 0, 1, 3, 14, 1, 4, 2, 1, 2, 2, 6, 3, 7, 0, 0, 0, 0, 17, 0, 6, 0, 19, 2, 0, 17, 3, 1, 6, 6, 0, 1, 0, 2, 2, 1, 29, 3, 18, 0, 13, 1, 0, 69, 2, 1, 4, 5, 1, 0, 1, 54, 1, 5, 2, 13, 4, 0, 5, 2, 4, 1, 2, 5, 0, 3, 4, 0, 26, 13, 1, 19, 5, 0, 2, 2, 2, 12, 3, 0, 37, 6, 4, 3, 8, 2, 1, 7, 11, 2, 0, 10, 1, 28, 1, 2, 2, 0, 0, 0, 0, 15, 3, 0, 2, 3, 0, 15, 2, 4, 0, 5, 19, 12, 0, 4, 4, 0, 1, 1, 3, 3, 0, 0, 13, 2, 35, 3, 0, 5, 14, 0, 3, 0, 2, 14, 13, 0, 1, 1, 2, 2, 3, 2, 0, 27, 1, 2, 7, 13, 2, 0, 1, 0, 3, 4, 2, 3, 1, 19, 1, 3, 2, 0, 17, 1, 21, 3, 2, 6, 0, 16, 2, 2, 26, 2, 5, 2, 0, 

In [None]:
print(get_scores('bm25'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scores_df['normalized_score'] = (scores_df['score'] - scores_df.score.min()) / (scores_df.score.max() - scores_df.score.min())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scores_df['normalized_score'] = (scores_df['score'] - scores_df.score.min()) / (scores_df.score.max() - scores_df.score.min())


Number of ADRs retrieved for each review [6, 5, 34, 4, 3, 2, 4, 2, 2, 2, 2, 1, 3, 9, 20, 2, 3, 1, 7, 2, 3, 2, 11, 1, 9, 16, 1, 9, 1, 15, 1, 8, 3, 2, 1, 0, 1, 7, 5, 12, 21, 1, 2, 10, 1, 5, 2, 10, 12, 5, 3, 4, 5, 26, 2, 2, 37, 8, 7, 19, 3, 8, 6, 1, 6, 6, 12, 2, 8, 10, 1, 3, 1, 3, 5, 5, 5, 5, 4, 1, 1, 3, 7, 2, 12, 6, 41, 5, 9, 3, 4, 2, 4, 2, 6, 2, 11, 3, 4, 2, 1, 3, 21, 6, 6, 1, 22, 6, 5, 3, 1, 6, 3, 10, 5, 9, 3, 4, 2, 4, 3, 1, 9, 2, 8, 2, 5, 4, 1, 5, 5, 2, 2, 2, 1, 10, 15, 22, 1, 1, 3, 13, 12, 5, 1, 4, 2, 3, 2, 2, 5, 2, 7, 1, 9, 2, 6, 1, 9, 2, 3, 2, 5, 2, 1, 2, 5, 3, 3, 2, 6, 4, 8, 5, 14, 1, 1, 12, 3, 6, 76, 2, 8, 2, 3, 3, 2, 6, 2, 2, 1, 11, 11, 10, 33, 2, 5, 9, 7, 28, 1, 7, 16, 4, 4, 7, 12, 2, 6, 4, 10, 6, 3, 7, 1, 2, 2, 3, 8, 3, 2, 7, 4, 1, 3, 3, 6, 2, 1, 2, 0, 2, 3, 2, 6, 3, 1, 7, 4, 4, 8, 3, 5, 4, 2, 1, 6, 3, 9, 5, 3, 5, 3, 7, 1, 10, 10, 5, 0, 3, 6, 1, 3, 6, 5, 4, 1, 24, 40, 1, 4, 1, 1, 3, 3, 4, 1, 15, 3, 2, 5, 1, 1, 5, 1, 10, 7, 3, 6, 3, 6, 6, 2, 7, 3, 2, 2, 5, 3, 1, 35, 2, 10, 3, 1

In [None]:
# pass the model you want to use ('tf_idf' or 'bm25') and a review. The function outputs the ADRs it retrieves for the review you inputed.

In [None]:
def get_ADRs_for_new_review(model, review, lexicon_list=lexicon_list, threshold_bm25=0.78, threshold_tf_idf=0.32):

    preprocessed_ADRs = [preprocessing(i, remove_sw=True) for i in lexicon_list]
    tokenized_review = preprocessing(review, remove_sw=True)

    # choose model
    if model == 'bm25':
        bm25 = BM25Okapi(preprocessed_ADRs)

        # get the scores for every ADRs for this specific review
        score_list = bm25.get_scores(tokenized_review)

        # build dataframe with scores
        scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': score_list})

        # remove rows with score=0
        scores_df = scores_df[scores_df['score'] != 0]

        # normalize scores
        scores_df['normalized_score'] = (scores_df['score'] - scores_df.score.min()) / (scores_df.score.max() - scores_df.score.min())

        # keep only scores over thershold
        scores_df = scores_df[scores_df['normalized_score'] > threshold_bm25]

        # get final list of ADRs obtained from model
        model_ADRs = scores_df.ADR.to_list()

    elif model == 'tf_idf':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(lexicon_list)

        query_tfidf = vectorizer.transform([review])

        # get scores
        cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

        # build dataframe with scores
        scores_df = pd.DataFrame({'ADR': lexicon_list, 'score': cosine_similarities})

        # keep only scores over thershold
        scores_df = scores_df[scores_df['score'] > threshold_tf_idf]

        # get final list of ADRs obtained from model
        model_ADRs = scores_df.ADR.to_list()

    return model_ADRs

In [None]:
print(get_ADRs_for_new_review('tf_idf','i was suffering from insomnia before starting taking some medicine. I did not feel any dizziness in the morning as reported in the leaflet, but sometimes i have been feeling nauseous when i wake up.'))

['groggy in the morning']


In [None]:
print(get_ADRs_for_new_review('bm25', 'i was suffering from insomnia before starting taking some medicine. I did not feel any dizziness in the morning as reported in the leaflet, but sometimes i have been feeling nauseous when i wake up.'))

['nauseous']
