# Import Related Modules

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from __future__ import print_function

import string
import spacy
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold,GridSearchCV

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,Dropout
from scikeras.wrappers import  KerasClassifier




# Load Data

In [2]:
df = pd.read_csv('../data/processed/drug_review_clean.csv', index_col= False)

In [3]:
df.columns

Index(['id', 'drugName', 'condition', 'rating', 'date', 'usefulCount',
       'rating_category', 'review_clean', 'review_len', 'mean_sentence_len',
       'word_count', 'mean_word_len', 'unique_word_count',
       'sentiment_subjectivity', 'sentiment_score', 'sentiment_label',
       'genuine_positive', 'genuine_negative', 'genuine_neutral'],
      dtype='object')

# Preprocess

The dataframe contains different types of features: numericals ('mean_word_len','word_count', etc), categorical(eg.'rating_category','condidition','drugName'), and datetime ('date'). Also, The target of 'sentiment_label' is categorimcal. The preprocess including the following steps:
1. tokenizer the'review_clean' using keras Tokenizer
2. encode the categorical features and target 'sentiment_label'
3. extract the 'date' to several new features 'year','month','day'.
4. scale the numerical features using MinMaxScaler.
5. train test split

## text feature preprocess

In [4]:
#trucate the review to the first 200 words
df['review_clean'] = df['review_clean'].apply(lambda x : ' '.join(x.split()[:200]))

tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(df['review_clean'])
sequences = tokenizer.texts_to_sequences(df['review_clean'])
# check the type of sequences
print(type(sequences))
#covert sequences to a numpy array using pad_sequence()
X_text = pad_sequences(sequences, maxlen = 250)
# Check the shape of the X_test
print(X_text.shape)

<class 'list'>
(127987, 250)


## Scale numerical features

In [5]:
numerical_cols = ['rating', 'usefulCount',
        'review_len', 'mean_sentence_len',
       'word_count', 'mean_word_len', 'unique_word_count',
       'sentiment_subjectivity', 'sentiment_score']

scaler = MinMaxScaler()
X_numerical = scaler.fit_transform(df[numerical_cols])

# check the shape of the feature
print(X_numerical.shape)

(127987, 9)


## Categorical Features Preprocess

In [6]:
# encode the categorica columns "drugName",  "condition", "rating_category", "sentiment_label"

# endcode drugName
drugName_encode = LabelEncoder()
X_drugName = drugName_encode.fit_transform(df['drugName'])

# encode condition
condition_encode = LabelEncoder()
X_condition = condition_encode.fit_transform(df['condition'])

# check the shape of features
print(X_drugName.shape)
print(X_condition.shape)

# Extract date features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# combine the extract features
X_comb1= np.concatenate([X_text,X_numerical, X_drugName.reshape(-1,1), X_condition.reshape(-1,1), df[['year','month','day']].values],axis = 1)
print(X_comb1.shape)

(127987,)
(127987,)
(127987, 264)


In [7]:
#encode rating_category
rating_category_encode = LabelEncoder()
X_rating_category = rating_category_encode.fit_transform(df['rating_category'])
# check the shape of features
print(X_rating_category.shape)

(127987,)


In [8]:
X_comb2= np.concatenate([X_comb1, X_rating_category.reshape(-1,1)],axis = 1)
print(X_comb2.shape)

(127987, 265)


## Preprocess of Target 'sentiment_label'

In [9]:
sentiment_label_encode = LabelEncoder()
y_encode = sentiment_label_encode.fit_transform(df['sentiment_label'])

In [10]:
from keras.utils import to_categorical
y = to_categorical(y_encode,3)

## Train Test Split

In [11]:
X_train,X_test,y_train, y_test = train_test_split(X_comb2, y, test_size = 0.25, random_state = 123)

In [12]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(95990, 265) (31997, 265) (95990, 3) (31997, 3)


## Compute Class Weights
the dataset is imbalanced, more postive sentiment label than negative label, here, the class weigth is used to treat the imblance data.

In [13]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train.argmax(axis =1)), y=y_train.argmax(1))

In [14]:
class_names = sentiment_label_encode.classes_
for class_name, weight in zip(class_names, class_weights):
    print(f"Class '{class_name}': Weight {weight}")

Class 'negative': Weight 1.261499237764811
Class 'neutral': Weight 3.1545565085937755
Class 'positive': Weight 0.529019173431653


In [15]:
class_weight_dict = {0: class_weights[0], 1: class_weights[1], 2:class_weights[2]}

# Modeling

## LSTM Model

In [16]:

def create_lstm_model(dropout_rate = 0.2, epochs = 3, batch_size = 64):
    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim = 5000, output_dim = 32,input_length= 265))
    lstm_model.add(LSTM(100))
    lstm_model.add(Dropout(0.2))
    lstm_model.add(Dense(3,activation='softmax'))
    lstm_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    lstm_model.summary()
    return lstm_model

In [17]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

In [18]:
from sklearn.pipeline import Pipeline

lstm_model = Pipeline([('keras_classifier', KerasClassifier(model=create_lstm_model,  dropout_rate=None, epochs=None, batch_size=None, verbose=0))])

In [19]:
lstm_model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'keras_classifier', 'keras_classifier__model', 'keras_classifier__build_fn', 'keras_classifier__warm_start', 'keras_classifier__random_state', 'keras_classifier__optimizer', 'keras_classifier__loss', 'keras_classifier__metrics', 'keras_classifier__batch_size', 'keras_classifier__validation_batch_size', 'keras_classifier__verbose', 'keras_classifier__callbacks', 'keras_classifier__validation_split', 'keras_classifier__shuffle', 'keras_classifier__run_eagerly', 'keras_classifier__epochs', 'keras_classifier__dropout_rate', 'keras_classifier__class_weight'])

In [20]:
y_train_label = np.argmax(y_train, axis = 1)


In [21]:
param_grids = {
    'keras_classifier__dropout_rate': [0.0, 0.2, 0.3],
    'keras_classifier__epochs': [3, 5],
    'keras_classifier__batch_size': [32, 64]
}

In [22]:
grid = GridSearchCV(estimator=lstm_model, param_grid=param_grids, scoring='accuracy', cv=kfold, verbose=0)

In [23]:
grid.fit(X_train,y_train_label, keras_classifier__class_weight=class_weight_dict)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 265, 32)           160000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 213503 (834.00 KB)
Trainable params: 213503 (834.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Par

ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\YANZHA~1\AppData\Local\Temp\__autograph_generated_fileb9ie4umd.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\Yan Zhang\anaconda3\envs\Capstone_Three_Project\lib\site-packages\keras\src\backend.py", line 5573, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 3) are incompatible



In [None]:
print("Best: %f using %s" %(grid.best_score_, grid.best_params_))