In [1]:
import pandas as pd
import numpy as np
import re
import string
from tqdm import tqdm

data = pd.read_csv("../csv/merged_data.csv")

In [2]:
# Binary target derived from the rating:
# 1 (positive) when rating > 5, otherwise 0 (negative).
data['review_sentiment'] = data['rating'].gt(5).astype('int64')

In [3]:
data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,0
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


In [4]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    Matching is case-insensitive, so capitalised forms such as "Can't" or
    "Won't" are expanded as well (the replacement text itself is lower-case).
    Typographic apostrophes (U+2019) are treated like the ASCII apostrophe.
    """
    # Normalise curly apostrophes first so the rules below can match them.
    phrase = phrase.replace("\u2019", "'")

    rules = (
        # specific -- must run before the generic "n't" rule
        (r"won't", "will not"),
        (r"can't", "can not"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase, flags=re.IGNORECASE)
    return phrase

def preprocess_text(text_data):
    """Expand contractions, then turn separator characters into spaces.

    The characters replaced (each by exactly one space) are: newline,
    carriage return, tab, hyphen, slash, greater-than, double quote and
    question mark.
    """
    expanded = decontracted(text_data)
    # One translation table instead of a chain of str.replace calls.
    to_space = str.maketrans({ch: ' ' for ch in '\n\r\t-/>"?'})
    return expanded.translate(to_space)
  
# NLTK resources shared by the cleaning helpers: the English Snowball stemmer
# and the English stop-word list (held as a set for fast membership tests).
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

# 'no' is taken out of the stop-word set so that "no side effects" stays
# distinguishable from "side effects" in the cleaned reviews.
stop_words.remove('no')

def nlp_preprocessing(review):
    '''Clean a single review for the text-based features.

    Steps: expand contractions and blank out separators (``preprocess_text``),
    replace every non-letter with a space, collapse whitespace, lower-case,
    drop stop words and stem the remaining words.

    Returns the stemmed tokens, each followed by one space, so a non-empty
    result ends with a trailing space. Non-string input (e.g. NaN or a number)
    yields an empty string.
    '''
    if not isinstance(review, str):
        # Nothing to clean; return '' rather than None or a TypeError from re.sub.
        return ""

    review = preprocess_text(review)
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    review = review.lower()

    kept = [stemmer.stem(word) for word in review.split() if word not in stop_words]
    # The space after every token (including the last) is kept on purpose:
    # length features computed from this string count it.
    return "".join(token + " " for token in kept)
      
data['cleaned_review'] = data['review'].apply(nlp_preprocessing)

In [5]:
# Lower-case the drug and condition names.
# The vectorised .str accessor leaves missing values as NaN instead of raising
# AttributeError the way a per-row x.lower() call would.
data['drugName'] = data['drugName'].str.lower()
data['condition'] = data['condition'].str.lower()

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER compound polarity score for the raw review text and for the cleaned text.
sent = SentimentIntensityAnalyzer()
data["sentiment_score"] = data['review'].map(lambda text: sent.polarity_scores(text)['compound'])
data['sentiment_score_clean'] = data['cleaned_review'].map(lambda text: sent.polarity_scores(text)['compound'])

In [8]:
# Adding the year as feature
# An explicit format avoids pandas' per-element format guessing and the warning it emits.
# NOTE(review): assumes every row uses the day-month-abbreviation-2-digit-year layout
# seen in the preview (e.g. "20-May-12"); a different layout will raise here.
data['date'] = pd.to_datetime(data['date'], format='%d-%b-%y')
data['year'] = data['date'].dt.year

#Word count in each review
data['word_count']=data["cleaned_review"].apply(lambda x: len(str(x).split()))

#Unique word count 
data['unique_word_count']=data["cleaned_review"].apply(lambda x: len(set(str(x).split())))

#character count
data['char_length']=data["cleaned_review"].apply(lambda x: len(str(x)))

#punctuation count
data["count_punctuations"] = data["review"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

#Number of stopwords
data["stopword_count"] = data["review"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))

#Average length of the words
def _mean_word_len(text):
    """Mean token length of ``text``; NaN when it contains no tokens."""
    lengths = [len(w) for w in str(text).split()]
    # np.mean([]) also yields NaN but emits RuntimeWarnings; return NaN directly instead.
    return np.mean(lengths) if lengths else np.nan

data["mean_word_len"] = data["cleaned_review"].apply(_mean_word_len)


  data['date'] = pd.to_datetime(data['date'])
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [9]:
print(data.shape)
data.head()

(212698, 18)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,count_punctuations,stopword_count,mean_word_len
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,1,no side effect take combin bystol mg fish oil,-0.296,-0.296,2012,9,9,46,3,6,4.111111
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,1,son halfway fourth week intuniv becam concern ...,0.8603,0.6929,2010,65,54,372,23,69,4.723077
2,92703,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,0,use take anoth oral contracept pill cycl happi...,0.7962,0.2732,2009,70,49,403,34,58,4.757143
3,138000,ortho evra,birth control,"""This is my first time using any form of birth...",8,2015-11-03,10,1,first time use form birth control glad went pa...,0.7184,0.1027,2015,39,26,226,15,45,4.794872
4,35696,buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,1,suboxon complet turn life around feel healthie...,0.9403,0.8934,2016,59,52,381,28,60,5.457627


#### Extracting the subject and object count for each review

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")
def subj_obj_count(review):
    """Return ``(subject_count, object_count)`` for one review.

    The review is parsed with the module-level spaCy pipeline ``nlp``. Each
    count is the number of distinct token texts whose dependency label is
    "nsubj" (subjects) or "dobj" (objects).
    """
    subjects, objects = set(), set()
    for token in nlp(review):
        if token.dep_ == "nsubj":
            subjects.add(str(token))
        elif token.dep_ == "dobj":
            objects.add(str(token))
    return len(subjects), len(objects)

# count = []

# for r in data['review']:
#     count.append(subj_obj_count(r))

In [14]:
# sub_obj = pd.DataFrame(count,columns=['subj_count','obj_count'])
# sub_obj.head()

In [15]:
# sub_obj.to_csv('sub_obj.csv',index=False)
sub_obj = pd.read_csv('../csv/sub_obj.csv')
print(sub_obj.shape)
sub_obj.head()

(212698, 2)


Unnamed: 0,subj_count,obj_count
0,2,2
1,9,5
2,10,8
3,8,5
4,8,10


#### Named Entity Recognition

In [16]:
nlp = spacy.load("en_core_web_sm")
ner_lst = nlp.pipe_labels['ner']

def ner(review):
    """Count the named entities of each label found in ``review``.

    Returns a dict with one key per label in ``ner_lst`` (all starting at 0),
    incremented once for every entity the spaCy pipeline ``nlp`` reports.
    """
    counts = dict.fromkeys(ner_lst, 0)
    for entity_span in nlp(review).ents:
        counts[entity_span.label_] = counts[entity_span.label_] + 1
    return counts
# entity = [ner(r) for r in data['cleaned_review']]
# entity = pd.DataFrame(entity)

In [18]:
# entity.to_csv('entities.csv',index=False)
entity = pd.read_csv('../csv/entities.csv')
print(entity.shape)
entity.head()

(212698, 18)


Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,4,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
3,0,2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0


### Topic Modelling
Topic modelling analyses text data and finds the topics that best describe a set of documents.
#### Latent Dirichlet Allocation (LDA) 
A popular form of statistical topic modelling. 
It is used to classify the text in a document to a particular topic. Each document is modelled as a mixture of topics and each topic as a distribution over words, with Dirichlet priors on both distributions.

In [19]:
# import gensim
# # preprocess corpus for unigram in a cleaned reviews
# corpus = data['cleaned_review']
# lst_corpus=[]
# for string in tqdm(corpus):
# 	lst_words = string.split()
# 	lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0, len(lst_words),1)]
# 	lst_corpus.append(lst_grams)

# # lst_corpus is the list of documents. Each document in lst_corpus is a list of words from a review.
# lst_corpus

In [20]:
# dictionary maps every unique word in the corpus to a unique integer ID and its frequency in the document. This is a common format for text data in many natural language processing tasks.
# id2word = gensim.corpora.Dictionary(lst_corpus)
# id2word

In [21]:
# dic_corpus = [id2word.doc2bow(word) for word in lst_corpus]
# dic_corpus

The `LDA model` is a generative statistical model that allows sets of observations to be explained by unobserved groups. In the case of text data, these `unobserved groups are topics`, and the `observations are words`. <br><br>
The LDA model assumes that each document in the corpus is a mixture of a small number of topics and that each word in the document is attributable to one of the document's topics. The model learns these topics and their distribution over the documents in the corpus.

In [22]:
# train a Latent Dirichlet Allocation (LDA) model on this 
# lda_model = gensim.models.LdaModel(corpus = dic_corpus, id2word=id2word)
# lda_model

Use the `get_document_topics` method of the `gensim` library's LDA model to get the topic distribution for a specific document in the corpus.<br><br>

The `get_document_topics` method takes a document in bag-of-words (BoW) format and returns a list of tuples, where each tuple contains a topic ID and the probability that was assigned to it. The document is represented by `dic_corpus[i]`, where `[i]` is the index of the document in the `dic_corpus` list.
<br><br>

The `minimum_probability` parameter is set to 0.0. This means that all topics will be included in the output, even those with a very low assigned probability.

In [23]:

# train_vecs = []
# for i in range(len(corpus)):
# 	top_topics = (lda_model.get_document_topics(dic_corpus[i],minimum_probability=0.0))

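# 	# NOTE(review): `lda_model` is built without `num_topics`, so gensim's default (100) applies;
# 	# range(20) therefore keeps only the first 20 topic probabilities per document - confirm this is intended.
# 	topic_vec = [top_topics[i][1] for i in range(20)]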

# 	train_vecs.append(topic_vec)

# topics = pd.DataFrame(train_vecs)

In [25]:
# topics.to_csv('topics.csv',index=False)
topics = pd.read_csv('../csv/topics.csv')
# print("\nBefore modifying first column:\n", topics.columns) 
# topics.rename(columns = {i:str(i) for i in range(0,topics.shape[1])}, inplace = True) 
# print("\nAfter modifying first column:\n", topics.columns) 
print(topics.shape)
topics.head()

(212698, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
1,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.000154,0.036646,0.000154,0.000154,0.000154,0.000154
2,0.000143,0.000143,0.000143,0.000143,0.000143,0.029631,0.000143,0.000143,0.037755,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.014679,0.000143
3,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.027342,0.00025,0.00025,0.00025
4,0.000172,0.000172,0.000172,0.000172,0.000172,0.017766,0.000172,0.000172,0.019721,0.000172,0.000172,0.000172,0.000172,0.000172,0.023106,0.000172,0.000172,0.000172,0.000172,0.000172


In [26]:
# Now combining the features extracted above - subject object count,named entity recognition,topic modelling vectors for each of the review.
# pd.concat(axis=1) aligns on the index and pads any mismatch with NaN, which a later
# dropna would silently discard; fail fast if a cached feature file does not line up with `data`.
assert all(frame.index.equals(data.index) for frame in (sub_obj, entity, topics)), \
    "cached feature CSVs are not row-aligned with data"
data = pd.concat([data,sub_obj,entity,topics],axis=1)
print(data.shape)


(212698, 58)


In [27]:
data.columns

Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'review_sentiment', 'cleaned_review', 'sentiment_score',
       'sentiment_score_clean', 'year', 'word_count', 'unique_word_count',
       'char_length', 'count_punctuations', 'stopword_count', 'mean_word_len',
       'subj_count', 'obj_count', 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE',
       'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT',
       'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15',
       '16', '17', '18', '19'],
      dtype='object')

In [28]:
data.isna().any().sum()

1

In [29]:
# Drop every row that has a missing value in any column.
data = data.dropna(axis = 0)

In [30]:
print(data.shape)
data.head()

(212690, 58)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,...,10,11,12,13,14,15,16,17,18,19
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,1,no side effect take combin bystol mg fish oil,-0.296,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,1,son halfway fourth week intuniv becam concern ...,0.8603,...,0.000154,0.000154,0.000154,0.000154,0.000154,0.036646,0.000154,0.000154,0.000154,0.000154
2,92703,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17,0,use take anoth oral contracept pill cycl happi...,0.7962,...,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.000143,0.014679,0.000143
3,138000,ortho evra,birth control,"""This is my first time using any form of birth...",8,2015-11-03,10,1,first time use form birth control glad went pa...,0.7184,...,0.00025,0.00025,0.00025,0.00025,0.00025,0.00025,0.027342,0.00025,0.00025,0.00025
4,35696,buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37,1,suboxon complet turn life around feel healthie...,0.9403,...,0.000172,0.000172,0.000172,0.000172,0.023106,0.000172,0.000172,0.000172,0.000172,0.000172


In [31]:
data.to_csv('../csv/final_data.csv',index=False)