<center><h1>Drug Review Using Sentiment Analysis</h1></center>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- In this project, we aim to perform Sentiment Analysis of Drug reviews. Data used in this project are
  online product reviews collected from “amazon.com”. We expect to do review-level categorization of review 
  data with promising outcomes.

- Sentiment Analysis also known as Opinion Mining refers to the use of natural language processing, 
  text analysis to systematically identify, extract, quantify, 
  and study affective states and subjective information.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import spacy
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings("ignore")

## importing Dataset

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/drugsComTrain_raw.tsv',delimiter='\t')
test_df = pd.read_csv('/content/drive/MyDrive/drugsComTest_raw.tsv', delimiter='\t')

To simplify Data Understanding and Data Preprocessing we can merge train and test data, 
as there are no target labels.

## Merge Train and Test data

In [None]:
# Merge train and test data
merge = [train_df, test_df]
df = pd.concat(merge, ignore_index = True)

In [None]:
# Check the shape of merged data
df.shape

(215063, 7)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [None]:
class dframe_Preprocessor():

    
    def __init__(self,n_rare_words):
        self.n_rare_words = 10
        
        
        print("Preprocessor object created")
        

    def __remove_punctuation(self,text):
        
        PUNCT_TO_REMOVE = string.punctuation
        """custom function to remove the punctuation"""
        return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    def __remove_stopwords(self,text):
        
        STOPWORDS = set(stopwords.words('english'))
        """custom function to remove the stopwords"""
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])

    def Get_Most_Commom(self,data):
        
        cnt = Counter()
        for text in df["review"].values:
            for word in text.split():
                cnt[word] += 1

        return cnt.most_common(10)
    def __remove_freqwords(self,text):
        
        FREQWORDS = set([w for (w, wc) in count])
        """custom function to remove the frequent words"""
        return " ".join([word for word in str(text).split() if word not in FREQWORDS])
    
    def __remove_rarewords(self,text):
        
        RAREWORDS = set([w for (w, wc) in count[:-self.n_rare_words-1:-1]])
        """custom function to remove the rare words"""
        return " ".join([word for word in str(text).split() if word not in RAREWORDS])
    def __stem_words(self,text):
        
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])


    def Text_Preprocessing(self,data):
        
        try:
            
            data = data[['review','rating']]
            data["review"] = data["review"].apply(lambda text: self.__remove_punctuation(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_stopwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_freqwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_rarewords(text))
            data["review"] = data["review"].apply(lambda text: self.__stem_words(text))
            data = data.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
            data['review'] = data['review'].str.replace('\d+', '')
            return data
        
        except ValueError as ve:
            raise(ValueError("Error in Text Preprocessing {}".format(ve)))


In [None]:
preprocess = dframe_Preprocessor(10)

Preprocessor object created


In [None]:
count = preprocess.Get_Most_Commom(df)
count

[('I', 883057),
 ('and', 563694),
 ('the', 484954),
 ('to', 425525),
 ('a', 376651),
 ('my', 330282),
 ('it', 266069),
 ('for', 260027),
 ('was', 229373),
 ('of', 226444)]

In [1]:
#import nltk
#nltk.download('stopwords')

In [None]:
df = preprocess.Text_Preprocessing(df)
df.sample(n=5)

Unnamed: 0,review,rating
19663,my doctor put drug stop month long menstrual ...,3.0
95748,recent start use solodyn sever acn start see r...,10.0
57563,plaquenil mg twice day year help tremend cou...,10.0
77368,implanon almost three year work amaz didnt wor...,10.0
46080,iv xenic week iv lost pound far im averag p...,10.0


## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
class df_Preprocessor():

    
    def __init__(self):
        
        
        print("Preprocessor object created")
        
        
    def preprocess(self,data):
        
        data['rating'] = pd.to_numeric(data['rating'],errors='coerce')
        
        data['Sentiment'] = np.where(data['rating'] > 6, 1, 0)
        
        data= data[['review','Sentiment']]
        
        x = data['review']
        
        y = data['Sentiment']
        
        return train_test_split(x,y,test_size=0.2, random_state=0)

In [None]:
PR = df_Preprocessor()

Preprocessor object created


In [None]:
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((172050,), (43013,), (172050,), (43013,))

## Feature Engineering with Keras Tokenization and Pad Sequences

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from keras.utils import np_utils
from keras.models import Sequential

In [None]:
class Keras_Tokenizer():

    
    def __init__(self,max_features):
        
        self.max_features =6000
        
        
        print("Tokenizer object created")
        
        
    def __label_encoding(self,y_train):
        """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()

        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_

        return y_train_enc, labels
    def __word_embedding(self,train, test, max_features, max_len=200):
        
        
        try:
            """ Keras Tokenizer class object """
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)

            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            """ Get the max_len """
            vocab_size = len(tokenizer.word_index) + 1

            """ Padd the sequence based on the max-length """
            x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')
            """ Return train, test and vocab size """
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise(ValueError("Error in word embedding {}".format(ve)))
            
            
    def preprocess(self,X_train, X_test):
        
    
        return self.__word_embedding(X_train, X_test, self.max_features)

In [None]:
KT = Keras_Tokenizer(6000)

Tokenizer object created


In [None]:
tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

In [None]:
x_pad_train.shape,x_pad_valid.shape,vocab_size

((172050, 200), (43013, 200), 58529)

## Modelling RNN

In [None]:
# Birectional lstm Architecture

In [None]:
from tensorflow import keras
class RNN_Bidirectional_lstm_Build_Pack():

    
    def __init__(self, input_length, output_length,vocab_size, optimizer,loss, metrics,batch_size,epochs,verbose):
        
        self.input_length =200
        self.output_length= 200
        self.vocab_size = 33068
        self.optimizer = 'adam'
        self.loss = 'binary_crossentropy'
        self.metrics = ['acc']
        self.batch_size = 256
        self.epochs = 20
        self.verbose = 1
        
        print("Tokenizer object created")
        
    def build_rnn(self,vocab_size,output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(self.vocab_size,output_dim = self.output_length,
                                  input_length = self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256,return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225,activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150,activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95,activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64,activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34,activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32,activation='relu'),
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model
    
    def Compile_and_Fit(self,rnn_model):
        
        try:
    
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)

            rnn_model.fit(x_pad_train, 
                                    y_train,
                                    batch_size=self.batch_size,
                                   epochs=self.epochs,
                                   verbose= self.verbose)

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)

            print("Loss:%.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model
        
        except ValueError as Model_Error:
            raise(ValueError("Model Compiling Error {}".format(Model_Error)))

In [None]:
Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200,200,33068,'adam','binary_crossentropy',['acc'],256,10,1)

Tokenizer object created


In [None]:
rnn_model = Rnn_Model.build_rnn(vocab_size,1,200)
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 200)          6613600   
                                                                 
 batch_normalization (BatchN  (None, 200, 200)         800       
 ormalization)                                                   
                                                                 
 bidirectional (Bidirectiona  (None, 200, 512)         935936    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 512)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 225)               115425    
                                                        

In [28]:
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss:0.363 Accuracy: 0.929


## Predict

In [29]:
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::",y_preds.shape)


for arr in y_preds:
    for i in range(len(arr)):
        if arr[i]>0.5:
            arr[i] = 1
        else:
            arr[i] = 0

            
y_preds = y_preds.astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

y_preds Shape :: (43013, 1)
(43013, 1)


Unnamed: 0,pred
0,0
1,1
2,0
3,1
4,1


In [30]:
pred_df.value_counts()

pred
1       28396
0       14617
dtype: int64

Metrics

In [31]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(metrics.accuracy_score(y_test, pred_df))
        
print(metrics.confusion_matrix(y_test, pred_df))
        
print(metrics.classification_report(y_test, pred_df))

0.9293236928370493
[[13115  1538]
 [ 1502 26858]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     14653
           1       0.95      0.95      0.95     28360

    accuracy                           0.93     43013
   macro avg       0.92      0.92      0.92     43013
weighted avg       0.93      0.93      0.93     43013



Model Serialization

In [32]:
rnn_model.save("rnn_model.h5")

Tokenizer Serialization

In [33]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)