In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and model paths used by the
# cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install third-party dependencies into the runtime.
# NOTE(review): versions are not pinned, so results may change as new releases are published.
!pip install rank_bm25
!pip install stop_words
!pip install transformers
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
import pickle
import scipy.stats
import numpy as np
import pandas as pd
import joblib 
from itertools import chain
from rank_bm25 import BM25Okapi
from gensim.models import Word2Vec
from stop_words import get_stop_words


import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('wordnet')

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def preprocessing(content, remove_sw = False):
    """Turn a raw review into a list of lowercase, lemmatized tokens.

    Pipeline: lowercase -> delete every character that is not a-z or
    whitespace -> tokenize -> optionally drop English stop words ->
    WordNet-lemmatize each token.

    Note that digits and all punctuation are deleted (not replaced by a
    space), so "$70,000" disappears entirely and "don't" becomes "dont".

    Args:
        content: Raw review text (str).
        remove_sw: When truthy, tokens present in the `stop_words` English
            list are removed before lemmatization.

    Returns:
        List of lemmatized tokens, in their original order.
    """
    content = content.lower()

    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    content = re.sub(r'[^a-z\s]+', '', content)

    # https://www.adamsmith.haus/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python
    # Tokenize into word characters (these tokens are the 1-grams).
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    one_grams = tokenizer.tokenize(content)

    if remove_sw:
        # Build the stop-word set once per call instead of re-fetching the
        # whole list for every token; set membership is also O(1).
        stop_words = set(get_stop_words('english'))
        one_grams = [token for token in one_grams if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in one_grams]

##**Data Processing**

The unsupervised learning part was completed above, using the unlabeled data from Kaggle (https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018) and the combined labeled datasets from different sources.

In [None]:
# Read all review data from the mounted Drive (nothing is downloaded here).
# The two drugsCom*_raw.csv files are the unlabeled reviews (train/test parts).
unlabeled_reviews_train = pd.read_csv('/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/drugsComTrain_raw.csv')
unlabeled_reviews_test = pd.read_csv('/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/drugsComTest_raw.csv')

# Labeled reviews. NOTE(review): this reads "Copy of combined_df_1.csv" from the
# "Unsupervised drug reviews" folder, while a later cell reads combined_df_1.csv from
# "Combined Datasets" — confirm both files have the same content.
labeled_drug_reviews = pd.read_csv("/content/drive/MyDrive/NLP Project/Data/Unsupervised drug reviews/Copy of combined_df_1.csv")

In [None]:
# Stack the two unlabeled parts row-wise into one frame with a fresh 0..n-1 index.
unlabeled_drug_reviews = pd.concat(
    [unlabeled_reviews_train, unlabeled_reviews_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Create lists of raw review texts for both datasets
unlabeled_reviews_list = unlabeled_drug_reviews.review.to_list()  # one entry per row of the `review` column
labeled_reviews_list = labeled_drug_reviews.text.to_list()

labeled_reviews_list = [x for x in labeled_reviews_list if str(x) != 'nan'] # Get rid of nans

# Combine into a NEW list. The previous `review_list = unlabeled_reviews_list` followed by
# `.extend(...)` aliased the same list object, so unlabeled_reviews_list silently ended up
# containing the labeled reviews as well.
review_list = unlabeled_reviews_list + labeled_reviews_list

print(f"There are {len(review_list)} reviews.")


There are 216311 reviews.


In [None]:
# Run every review through preprocessing() (default arguments) to get one token list per review
preprocessed_reviews = list(map(preprocessing, review_list))

KeyboardInterrupt: ignored

In [None]:
# Total number of tokens across all preprocessed reviews
counter = sum(map(len, preprocessed_reviews))

print(counter)

##**Embeddings and clustering**

1. Obtain embeddings using Word2Vec for the whole set of reviews.
2. Obtain clusters using K-means with 150 clusters, same number as in the paper. 

After, save the models.

In [None]:
# Train Word2Vec embeddings (150 dimensions) on the tokenized reviews; min_count=1 keeps
# every token in the vocabulary. Original author's note: takes about 3 minutes.
# Clustering is not done in this cell.
model = Word2Vec(sentences = preprocessed_reviews, vector_size= 150, min_count=1)

In [None]:
# Load the previously saved Word2Vec model from Drive. This rebinds `model`, so it replaces
# whatever the training cell produced in this session.
model = Word2Vec.load("/content/drive/MyDrive/NLP Project/Models/word2vec.model")

In [None]:
# Keyed vectors of the trained model: maps each vocabulary word to its embedding
word_vectors = model.wv

# Vocabulary as an array so it can later be indexed with a boolean mask
vocab = np.array(list(model.wv.key_to_index.keys()))

# One embedding per vocabulary word, in the same order as `vocab`
word_vecs = [word_vectors[word] for word in vocab]
word_array = np.array(word_vecs)

# Fixed random_state: without it the centroid initialisation differs on every run, so the
# cluster ids (used below for inspection and as CRF features) are not reproducible.
kmeans = KMeans(n_clusters=150, random_state=100).fit(word_array)



In [None]:
# Persist the Word2Vec model to Drive (the vocab and word arrays are re-derived from it after loading)
model.save("/content/drive/MyDrive/NLP Project/Models/word2vec.model")

In [None]:
# Persist the fitted K-means clustering to Drive with joblib
joblib.dump(kmeans, "/content/drive/MyDrive/NLP Project/Models/model.pkl")

['/content/drive/MyDrive/NLP Project/Models/model.pkl']

In [None]:
# Reload the saved K-means clustering; rebinds `kmeans`, replacing any model fitted in this session.
# NOTE(review): joblib.load unpickles the file — only load model files you created yourself.
kmeans = joblib.load("/content/drive/MyDrive/NLP Project/Models/model.pkl")

In [None]:
# Inspect one cluster: show every vocabulary word whose K-means label equals `label`.
# Assumes `kmeans` was fit on vectors built in the same order as `vocab` — TODO confirm when
# the clustering is loaded from disk rather than fitted in this session.
label = 17
mask = (kmeans.labels_ == label)
vocab[mask]

array(['stop', 'start', 'continue', 'become', 'treat', 'kick', 'avoid',
       'begin', 'prevent', 'reduce', 'cure', 'return', 'ease', 'relieve',
       'regulate', 'subside', 'improve', 'kill', 'trick', 'skip', 'lead',
       'bring', 'discontinue', 'reach', 'heal', 'occur', 'develop',
       'disappear', 'curb', 'lessen', 'eliminate', 'settle', 'alleviate',
       'fade', 'worsen', 'resolve', 'suppress'], dtype='<U114')

In [None]:
# Show the vocabulary words assigned to cluster `label`.
# NOTE(review): this cell uses the same label (17) as the previous inspection cell, yet the
# stored outputs differ — at least one of them comes from a different clustering or a
# different label value and is stale.
label = 17
mask = (kmeans.labels_ == label)
vocab[mask]

array(['pill', 'control', 'birth', 'patch', 'bc', 'generic', 'brand',
       'depo', 'yaz', 'nuvaring', 'loestrin', 'fe', 'lo', 'hormonal',
       'sprintec', 'ortho', 'ring', 'placebo', 'yasmin', 'trinessa',
       'contraceptive', 'tri', 'lutera', 'aviane', 'microgestin', 'apri',
       'junel', 'tricyclen', 'seasonique', 'trisprintec', 'beyaz',
       'cyclen', 'gianvi', 'alesse', 'contraception', 'nuva', 'ocella',
       'levora', 'minastrin', 'gildess', 'blisovi', 'evra', 'generess'],
      dtype='<U114')

In [None]:
# Show the vocabulary words assigned to cluster 40.
label = 40
mask = (kmeans.labels_ == label)
vocab[mask]

array(['sleep', 'function', 'walk', 'move', 'breath', 'sit', 'breathe'],
      dtype='<U114')

In [None]:
# Show the vocabulary words assigned to cluster 109.
label = 109
mask = (kmeans.labels_ == label)
vocab[mask]

array(['up', 'out', 'back', 'off', 'down', 'away', 'without', 'through',
       'around', 'into', 'bed', 'point', 'home', 'bathroom', 'rest',
       'toilet', 'outside', 'store', 'anywhere', 'thru', 'urgent',
       'ahead', 'restroom', 'downhill'], dtype='<U114')

##**CRF with Word2Vec and K-means**

In [None]:
%cd drive/MyDrive/NLP Project/Code

[Errno 2] No such file or directory: 'drive/MyDrive/NLP Project/Code'
/content/drive/MyDrive/NLP Project/Code


In [None]:
# Import custom functions
from bert_text_pre_processing import add_labels
from CRF_utils import sent2features

In [None]:
# Restore the persisted embedding model and the K-means clustering from Drive
model = Word2Vec.load("/content/drive/MyDrive/NLP Project/Models/word2vec.model")
kmeans = joblib.load("/content/drive/MyDrive/NLP Project/Models/model.pkl")

# Rebuild the vocabulary array and the matching matrix of embeddings (one row per word)
word_vectors = model.wv
vocab = np.array(list(model.wv.key_to_index.keys()))

word_vecs = [word_vectors[token] for token in vocab]
word_array = np.array(word_vecs)

###**Data processing for CRF**

For each token in each review, create a dictionary containing:
1. The three previous and following tokens. 
2. The respective clusters of the aforementioned tokens. 

In [None]:
# Read the two combined labeled datasets and attach token-level labels
df_1 = pd.read_csv(r'/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_1.csv')
df_2 = pd.read_csv(r'/content/drive/MyDrive/NLP Project/Data/Combined Datasets/combined_df_2.csv')

pre_processed = add_labels(df_1, df_2, 'other', 'text', 'txt_id', 'symptom', False)

# Seed NumPy's global RNG, which train_test_split falls back to when no random_state is given
np.random.seed(100)

# 80% train; the remaining 20% is halved into validation and test
train_df, not_train_df = train_test_split(pre_processed, test_size=0.2)
valid_df, test_df = train_test_split(not_train_df, test_size=0.5)

# Give each split a fresh 0..n-1 index
train_df, valid_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, valid_df, test_df)
)

In [None]:
# Sanity check: display the raw text of the first test-set row
test_df['text'][0]

'Within 5 minutes of taking drug, developed severe colon and uterine cramping. Cold sweat, fainting, heart palpitations. Will never use again or recommend. '

In [None]:
# Create the sentence and label lists for each DataFrame. The same filtering used to be
# copy-pasted three times; it now lives in one helper.

def _sentences_and_labels(df, min_tokens=6):
  """Return (sentences, labels) for the rows of `df` that are usable for the CRF.

  A row is kept only when its `tokenized` entry has at least `min_tokens` tokens
  and its `pre_processed_tokens` entry has the same length as the tokens.
  Labels are converted to strings.
  """
  sentences, labels = [], []
  for sent, lab in zip(df.tokenized.to_list(), df.pre_processed_tokens.to_list()):
    if len(sent) >= min_tokens and len(lab) == len(sent):
      sentences.append(sent)
      labels.append(np.array(lab, dtype = 'str').tolist())
  return sentences, labels


train_sentences, y_train_CRF = _sentences_and_labels(train_df)
val_sentences, y_val_CRF = _sentences_and_labels(valid_df)
test_sentences, y_test_CRF = _sentences_and_labels(test_df)

In [None]:
# Token totals per split (sum of sentence lengths)
train_counter = sum(len(tokens) for tokens in train_sentences)
val_counter = sum(len(tokens) for tokens in val_sentences)
test_counter = sum(len(tokens) for tokens in test_sentences)

print(f"Train: {len(train_sentences)}, tokens - {train_counter}")
print(f"Val: {len(val_sentences)}, tokens - {val_counter}")
print(f"Test: {len(test_sentences)}, tokens - {test_counter}")

Train: 1606, tokens - 142135
Val: 200, tokens - 18467
Test: 206, tokens - 20658


In [None]:
# Each split must have exactly one label sequence per sentence
for split_sentences, split_labels in (
    (train_sentences, y_train_CRF),
    (val_sentences, y_val_CRF),
    (test_sentences, y_test_CRF),
):
  print(len(split_sentences), len(split_labels))

1606 1606
200 200
206 206


In [None]:
# Build the per-token CRF feature dicts for the three splits
def _featurize(sentences):
  """Apply sent2features to every sentence using the shared vocab and K-means labels."""
  return [sent2features(sentence, vocab, kmeans.labels_) for sentence in sentences]

X_train = _featurize(train_sentences)
X_val = _featurize(val_sentences)
X_test = _featurize(test_sentences)

KeyboardInterrupt: ignored

In [None]:
# Pickle the CRF features and labels of every split to Drive
crf_data_files = (
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/xTrain", X_train),
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/xVal", X_val),
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/xTest", X_test),
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/yTrain", y_train_CRF),
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/yVal", y_val_CRF),
    ("/content/drive/MyDrive/NLP Project/Data/CRF_Data/yTest", y_test_CRF),
)

for target_path, payload in crf_data_files:
    with open(target_path, "wb") as handle:
        pickle.dump(payload, handle)

In [None]:
# Read the pickled CRF features and labels back from Drive.
# NOTE(review): unpickling executes code from the file — only load files you wrote yourself.
X_train, X_val, X_test = (
    pd.read_pickle(feature_path)
    for feature_path in (
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/xTrain",
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/xVal",
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/xTest",
    )
)

y_train_CRF, y_val_CRF, y_test_CRF = (
    pd.read_pickle(label_path)
    for label_path in (
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/yTrain",
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/yVal",
        r"/content/drive/MyDrive/NLP Project/Data/CRF_Data/yTest",
    )
)


###**Hyperparameter tuning: number of iterations for CRF**

In [None]:
# Candidate values for the CRF's max_iterations; everything else is held fixed.
no_iters = [50, 60, 70, 80, 90, 100]

for n in no_iters:

  crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=n, all_possible_transitions=True)
  crf.fit(X_train, y_train_CRF)

  y_pred_train = crf.predict(X_train)
  # Predict on the whole validation set. The previous hard-coded X_val[:200] only lined up
  # with y_val_CRF when the validation split happened to contain exactly 200 sentences.
  y_pred = crf.predict(X_val)

  # Token-level F1 for the positive label '1' (labels are strings)
  F1_val = metrics.flat_f1_score(y_val_CRF, y_pred, pos_label='1')
  F1_train= metrics.flat_f1_score(y_train_CRF, y_pred_train, pos_label='1')

  print(f"For {n} iterations, the train and validation F1-scores are {F1_train:.2f} and {F1_val:.2f}.")

ValueError: ignored

In [None]:
# Chosen number of CRF iterations. Presumably picked by hand from the F1 scores printed by
# the tuning loop — the value is hard-coded, not computed.
best_no_iters = 70

###**Model. Final results**

In [None]:
# Train the final CRF with the iteration count selected during tuning. The previous version
# hard-coded max_iterations=100 and never used best_no_iters.
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',c1=0.1, c2=0.1, max_iterations=best_no_iters, all_possible_transitions=True)
crf.fit(X_train, y_train_CRF)

# Evaluate on the held-out test set; all metrics are token-level, with '1' as the positive label
y_pred = crf.predict(X_test)

acc = metrics.flat_accuracy_score(y_test_CRF, y_pred)
recall = metrics.flat_recall_score(y_test_CRF, y_pred, pos_label='1')
precision = metrics.flat_precision_score(y_test_CRF, y_pred, pos_label='1')
F1 = metrics.flat_f1_score(y_test_CRF, y_pred, pos_label='1')

print(f"Accuracy - {acc}, Recall - {recall}, Precision - {precision}, F1 - {F1}.")

Accuracy - 0.9018297995933778, Recall - 0.5269102990033223, Precision - 0.7242009132420091, F1 - 0.61.


In [None]:
# Persist the trained CRF model to Drive with joblib
joblib.dump(crf, "/content/drive/MyDrive/NLP Project/Models/CRF_word2vec")

['/content/drive/MyDrive/NLP Project/Models/CRF_word2vec']

In [None]:
# Show the predicted label sequence for the first test sentence (one string label per token)
y_pred[0]

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']