In [1]:
from __future__ import print_function

# Standard library
import os
import re
import string

# Third-party: data handling, plotting, NLP
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.tokenize import word_tokenize

# Third-party: Keras / scikeras
from keras.layers import Embedding, LSTM, Dense,Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from scikeras.wrappers import  KerasClassifier

# Third-party: scikit-learn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight




In [2]:
# Load the cleaned drug-review dataset. index_col=False tells pandas not to
# use the first CSV column as the index.
df = pd.read_csv(
    "../data/processed/drug_review_clean.csv",
    index_col=False,
)

# Preprocess

Build classes to do the preprocessing
The dataframe contains different types of features: numerical ('mean_word_len', 'word_count', etc.), categorical (e.g. 'rating_category', 'condition', 'drugName'), and datetime ('date'). The target, 'sentiment_label', is also categorical. The preprocessing includes the following steps:

tokenize 'review_clean' using the Keras Tokenizer
encode the categorical features and the target 'sentiment_label'
extract the new features 'year', 'month' and 'day' from 'date'
scale the numerical features using MinMaxScaler
split the data into train and test sets

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw review strings into fixed-width integer sequences.

    Wraps a Keras ``Tokenizer`` (fitted in ``fit``) and ``pad_sequences``
    (applied in ``transform``).

    Parameters
    ----------
    max_sequence_length : int, default=214
        Stored on the instance for backward compatibility only. It does not
        control the padded width; use ``pad_length`` for that.
    num_words : int, default=5000
        Vocabulary cap passed to ``Tokenizer(num_words=...)``.
    pad_length : int, default=200
        ``maxlen`` passed to ``pad_sequences``, i.e. the number of columns of
        the array returned by ``transform``.
    """

    def __init__(self, max_sequence_length=214, num_words=5000, pad_length=200):
        self.max_sequence_length = max_sequence_length
        self.num_words = num_words
        self.pad_length = pad_length
        # Set by fit(); None means "not fitted yet".
        self.tokenizer = None

    def fit(self, X, y=None):
        """Fit a case-sensitive Tokenizer on the texts in ``X``; ``y`` is ignored.

        Returns ``self``.
        """
        self.tokenizer = Tokenizer(num_words=self.num_words, lower=False)
        self.tokenizer.fit_on_texts(X)
        return self

    def transform(self, X):
        """Convert the texts in ``X`` to padded integer sequences.

        Returns the result of ``pad_sequences`` with ``maxlen=pad_length``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This error subclasses ``AttributeError``,
            which is what an unfitted instance raised previously.
        """
        if self.tokenizer is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "TextPreprocessor is not fitted yet; call fit() before transform()."
            )
        sequences = self.tokenizer.texts_to_sequences(X)
        return pad_sequences(sequences, maxlen=self.pad_length)


In [4]:
class NumericalScaler(BaseEstimator, TransformerMixin):
    """Scale numerical columns with a wrapped ``MinMaxScaler``."""

    def fit(self, X, y=None):
        """Create a fresh MinMaxScaler, fit it on ``X`` and keep it on ``self.scaler``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        self.scaler = scaler = MinMaxScaler()
        scaler.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` scaled by the scaler learned in ``fit``."""
        scaled = self.scaler.transform(X)
        return scaled

In [5]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode a single categorical column with a wrapped ``LabelEncoder``."""

    def fit(self, X, y=None):
        """Create a fresh LabelEncoder, fit it on ``X`` and keep it on ``self.label_encoder``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        self.label_encoder = encoder = LabelEncoder()
        encoder.fit(X)
        return self

    def transform(self, X):
        """Return the integer codes of ``X`` reshaped to a single column."""
        codes = self.label_encoder.transform(X)
        # Reshape to (n, 1) so the codes form one column of a 2-D array.
        return codes.reshape(-1, 1)

In [6]:
class DateExtractor(BaseEstimator, TransformerMixin):
    """Derive calendar features from the 'date' column of a DataFrame."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame with 'year', 'month' and 'day' columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'date' column that ``pd.to_datetime`` can parse.

        Returns
        -------
        pandas.DataFrame
            Columns ``['year', 'month', 'day']``, indexed like ``X``.

        The input frame is left untouched: its 'date' column is not
        overwritten and no columns are added to it.
        """
        # Parse into a separate Series instead of assigning back into X, so
        # the caller's DataFrame is not mutated.
        dates = pd.to_datetime(X["date"])
        return pd.DataFrame(
            {
                "year": dates.dt.year,
                "month": dates.dt.month,
                "day": dates.dt.day,
            },
            index=X.index,
        )

In [7]:
# Feature columns selected from the cleaned dataframe.
# NOTE(review): the cells visible in this notebook build the model matrix
# straight from `df`, not from `X` / `y` - confirm these are still needed.
X = df.loc[
    :,
    [
        "drugName",
        "condition",
        "rating",
        "date",
        "usefulCount",
        "rating_category",
        "review_clean",
        "review_len",
        "mean_sentence_len",
        "word_count",
        "mean_word_len",
        "unique_word_count",
        "sentiment_subjectivity",
        "sentiment_score",
        "genuine_positive",
        "genuine_negative",
        "genuine_neutral",
    ],
]
# Target, kept as a one-column DataFrame.
y = df.loc[:, ["sentiment_label"]]

In [8]:
# Transform each feature group

# Keep only the first 200 whitespace-separated words of every review.
df["review_clean"] = df["review_clean"].apply(
    lambda review: " ".join(review.split()[:200])
)

# Columns that go through min-max scaling.
numerical_cols = [
    "rating",
    "usefulCount",
    "review_len",
    "mean_sentence_len",
    "word_count",
    "mean_word_len",
    "unique_word_count",
    "sentiment_subjectivity",
    "sentiment_score",
]

# Review text -> padded integer sequences.
X_text = TextPreprocessor(max_sequence_length=214).fit_transform(df["review_clean"])
# Numerical columns -> scaled values.
X_numerical = NumericalScaler().fit_transform(df[numerical_cols])
# Each categorical column gets its own encoder and becomes one integer column.
X_drugName, X_condition = (
    CategoricalEncoder().fit_transform(df[column])
    for column in ("drugName", "condition")
)
# 'date' -> year / month / day columns.
X_date = DateExtractor().fit_transform(df)

In [9]:
# Combine the per-feature blocks column-wise into one matrix; the text token
# columns come first, followed by the numerical, categorical and date columns.
feature_blocks = [X_text, X_numerical, X_drugName, X_condition, X_date]
X_transformed = np.concatenate(feature_blocks, axis=1)

In [10]:
# Encode the target: map each sentiment label to an integer class id.
# The labels are kept as integers (not one-hot encoded).
sentiment_label_encode = LabelEncoder()
sentiment_label_encode.fit(df["sentiment_label"])
y_encode = sentiment_label_encode.transform(df["sentiment_label"])

In [11]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y_encode,
    test_size=0.25,
    random_state=123,
)

In [12]:
# Sanity check: show the distinct encoded class ids present in the training labels.
print(np.unique(y_train))

[0 1 2]


In [13]:
from sklearn.utils.class_weight import compute_class_weight
# "balanced" class weights computed from the training labels, then stored as
# a {class id: weight} mapping.
unique_labels = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_labels,
    y=y_train,
)
class_weight_dict = {
    label: weight for label, weight in zip(unique_labels, class_weights)
}

In [14]:
# class_names = sentiment_label_encode.classes_
# for class_name, weight in zip(class_names, class_weights):
#     print(f"Class '{class_name}': Weight {weight}")
# class_weight_dict = {0: class_weights[0], 1: class_weights[1], 2:class_weights[2]}

# Modeling

## LSTM model

In [15]:
def create_lstm_model(dropout_rate=0.2, epochs=3, batch_size=64):
    """Build and compile the LSTM sentiment classifier.

    Architecture: Embedding(5000 -> 32, input length 214) -> LSTM(100)
    -> Dropout(dropout_rate) -> Dense(3, softmax). Compiled with
    sparse categorical cross-entropy, Adam and an accuracy metric.

    Parameters
    ----------
    dropout_rate : float or None, default=0.2
        Rate of the Dropout layer after the LSTM. ``None`` falls back to 0.2,
        because the scikeras wrapper in this notebook is constructed with
        ``dropout_rate=None``.
    epochs, batch_size : int
        Accepted so a wrapper may pass them; not used while building the model.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Use the requested rate so that searching over `dropout_rate` actually
    # changes the network.
    rate = 0.2 if dropout_rate is None else dropout_rate

    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim=5000, output_dim=32, input_length=214))
    lstm_model.add(LSTM(100))
    lstm_model.add(Dropout(rate))
    lstm_model.add(Dense(3, activation="softmax"))
    lstm_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return lstm_model

In [16]:
# # print(X_train.dtype, X_test.dtype)
# X_train_str= X_train.astype(str)
# X_test_str= X_test.astype(str)
# X_train_str_lower = [' '.join(str(x).lower().split()) for x in X_train]
# X_test_str_lower = [' '.join(str(x).lower().split()) for x in X_test]


In [17]:
# #trucate the review to the first 200 words
# # df['review_clean'] = df['review_clean'].apply(lambda x : ' '.join(x.split()[:200]))
# tokenizer = Tokenizer(num_words = 5000,lower = False)
# tokenizer.fit_on_texts(df['review_clean'])

In [18]:
# type(X_train)

In [19]:
# Plain nested-list copies of the train / test feature matrices.
X_train_list, X_test_list = X_train.tolist(), X_test.tolist()

In [20]:
# sequences_train = tokenizer.texts_to_sequences(X_train_list)
# sequences_test = tokenizer.texts_to_sequences(X_test_list)

In [21]:
# X_text_train = pad_sequences(sequences_train,maxlen=265)
# X_text_test = pad_sequences(sequences_test,maxlen=265)

In [22]:
# X_train / X_test are rows of X_transformed, which is already numeric, so
# they must not be passed through a text tokenizer again. The padded review
# tokens are the leading X_text.shape[1] columns of that matrix (X_text is
# the first block in the concatenation), so slice them out instead.
n_text_cols = X_text.shape[1]
# Cast back to integers: concatenating with float features upcast the token ids.
X_text_train = X_train[:, :n_text_cols].astype("int32")
X_text_test = X_test[:, :n_text_cols].astype("int32")

In [23]:
# Shuffled, seeded 5-fold stratified splitter for cross-validation.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)
# scikit-learn compatible wrapper around the Keras model factory. The tunable
# settings start as None here; values are supplied by the parameter grid.
lstm_model = KerasClassifier(
    model=create_lstm_model,
    dropout_rate=None,
    epochs=None,
    batch_size=None,
    verbose=0,
)

In [24]:
# Hyperparameter grid for the LSTM search: 3 x 2 x 2 = 12 combinations.
param_grids = dict(
    dropout_rate=[0.0, 0.2, 0.3],
    epochs=[3, 5],
    batch_size=[32, 64],
)

In [25]:
# Exhaustive, accuracy-scored search over `param_grids` using stratified
# 5-fold cross-validation. error_score="raise" makes a failing fit raise
# immediately instead of being recorded as a score.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)
lstm_grid = GridSearchCV(
    estimator=lstm_model,
    param_grid=param_grids,
    cv=kfold,
    scoring="accuracy",
    verbose=0,
    error_score="raise",
)

In [26]:
# Sanity check: shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(95990, 214)
(95990,)


In [28]:
# Suppress TensorFlow warnings
import tensorflow as tf

# Merely referencing the tf.compat.v1 names does not silence anything. Raising
# the TensorFlow logger threshold hides warning-level messages (including the
# deprecation notices) while still letting errors through.
tf.get_logger().setLevel("ERROR")

<function tensorflow.python.framework.ops.executing_eagerly_outside_functions()>

In [None]:
# Run the grid search. The extra `class_weight` keyword is handed on by
# GridSearchCV to the wrapped estimator's fit() call.
lstm_grid_result = lstm_grid.fit(
    X_train,
    y_train,
    class_weight=class_weight_dict,
)

In [None]:
best_lstm_params = lstm_grid_result.best_params_
# `lstm_model` is an estimator instance, not a factory, so it cannot be
# called with the best parameters. GridSearchCV (refit=True by default) has
# already refitted the winning configuration on the whole training set;
# reuse that fitted estimator instead of training once more.
best_lstm_model = lstm_grid_result.best_estimator_

In [None]:
# Evaluate the tuned LSTM on the held-out test set.
lstm_pred = best_lstm_model.predict(X_test)
# accuracy_score takes (y_true, y_pred).
lstm_accuracy = accuracy_score(y_test, lstm_pred)
print("\n LSTM model performance: ")
print(f'best hyperparameters:{best_lstm_params}')
print(f'accuracy:{lstm_accuracy}')

## SVC model

In [None]:
# Support-vector classifier with scikit-learn's default settings; its
# hyperparameters are searched in the cells that follow.
svc_model = SVC()

In [None]:
# Hyperparameter grid for the SVC search: 3 x 3 x 3 = 27 combinations.
svc_param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01],
    kernel=["rbf", "poly", "sigmoid"],
)

In [None]:
# Accuracy-scored grid search for the SVC with stratified 5-fold CV.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
svc_grid = GridSearchCV(
    estimator=svc_model,
    param_grid=svc_param_grid,
    scoring="accuracy",
    cv=kfold,
    verbose=0,
)
# y_train already holds 1-D integer class ids (it is not one-hot encoded), so
# it is passed as-is; argmax(axis=1) would fail on a 1-D array.
svc_grid_result = svc_grid.fit(X_train, y_train)

In [None]:
# Read the results from the fitted search object itself.
best_svc_params = svc_grid.best_params_
# `svc_model` is an estimator instance and cannot be called, and the LSTM
# parameters do not apply to an SVC. GridSearchCV (refit=True by default) has
# already refitted the best SVC on the whole training set, so reuse it.
best_svc_model = svc_grid.best_estimator_

In [None]:
# Evaluate the tuned SVC on the held-out test set.
svc_pred = best_svc_model.predict(X_test)
# y_test holds 1-D integer class ids, so it is compared directly with the
# predictions; argmax(axis=1) would fail on a 1-D array.
svc_accuracy = accuracy_score(y_test, svc_pred)
print("\n svc model performance: ")
print(f'best hyperparameters:{best_svc_params}')
print(f'accuracy:{svc_accuracy}')

## Multinomial Naive Bayes Model (MNB)

In [None]:
# Multinomial naive Bayes with scikit-learn's default settings; alpha and
# fit_prior are tuned by the grid search in the next cell.
mnb_model = MultinomialNB()


In [None]:
# Grid over the smoothing strength and whether class priors are learned:
# 5 x 2 = 10 combinations.
mnb_param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1.0, 10.0],
    fit_prior=[True, False],
)

# 5-fold search on all available cores with verbose progress output.
mnb_grid = GridSearchCV(
    estimator=mnb_model,
    param_grid=mnb_param_grid,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
mnb_grid.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print('Best Accuracy Through Grid Search : {:.3f}'.format(mnb_grid.best_score_))
print('Best Parameters : {}\n'.format(mnb_grid.best_params_))

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the refitted best naive Bayes model on both splits; comparing train
# and test accuracy shows how well it generalises.
best_mnb = mnb_grid.best_estimator_
y_preds = best_mnb.predict(X_test)
y_preds_train = best_mnb.predict(X_train)

test_accuracy = accuracy_score(y_test, y_preds)
train_accuracy = accuracy_score(y_train, y_preds_train)
print("Test Accuracy Score : {:.3f}".format(test_accuracy))
print("Train Accuracy Score : {:.3f}".format(train_accuracy))

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report :")
print(classification_report(y_test, y_preds))