# Import Related Modules

In [10]:
from __future__ import print_function

import os
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Load Data

In [3]:
# Load the cleaned drug-review table from the processed-data folder.
# index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(
    '../data/processed/drug_review_clean.csv',
    index_col=False,
)

In [4]:
# Display the column names to see which features are available.
df.columns

Index(['id', 'drugName', 'condition', 'rating', 'date', 'usefulCount',
       'rating_category', 'review_clean', 'review_len', 'mean_sentence_len',
       'word_count', 'mean_word_len', 'unique_word_count',
       'sentiment_subjectivity', 'sentiment_score', 'sentiment_label',
       'genuine_positive', 'genuine_negative', 'genuine_neutral'],
      dtype='object')

# Preprocess

The dataframe contains several types of features: numerical ('mean_word_len', 'word_count', etc.), categorical (e.g. 'rating_category', 'condition', 'drugName'), and datetime ('date'). The target, 'sentiment_label', is also categorical. Preprocessing includes the following steps:
1. tokenize 'review_clean' using the Keras Tokenizer
2. encode the categorical features and the target 'sentiment_label'
3. extract the new features 'year', 'month' and 'day' from 'date'
4. scale the numerical features using MinMaxScaler
5. split the data into train and test sets

## text feature preprocess

In [50]:
# Build the vocabulary from every review; num_words=5000 restricts the
# generated sequences to the most frequent words.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split performed later in the notebook, so test-set text
# influences it.
review_texts = df['review_clean']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(review_texts)

# Map each review to a list of integer word indices.
sequences = tokenizer.texts_to_sequences(review_texts)
print(type(sequences))  # texts_to_sequences returns a plain Python list

# pad_sequences() turns the ragged lists into a single 2-D numpy array with
# one row per review; with no maxlen given, rows are padded to the length of
# the longest sequence.
X_text = pad_sequences(sequences)
print(X_text.shape)  # shape of X_text: (number of reviews, padded length)

<class 'list'>
(127987, 891)


## Scale numerical features

In [52]:
# Columns treated as continuous inputs.
numerical_cols = [
    'rating',
    'usefulCount',
    'review_len',
    'mean_sentence_len',
    'word_count',
    'mean_word_len',
    'unique_word_count',
    'sentiment_subjectivity',
    'sentiment_score',
]

# MinMaxScaler rescales every column independently to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so test rows influence the min/max used.
# NOTE(review): 'sentiment_score' may be what the target 'sentiment_label'
# was derived from -- confirm it is safe to keep as an input feature.
scaler = MinMaxScaler()
numerical_frame = df[numerical_cols]
X_numerical = scaler.fit_transform(numerical_frame)

# one row per review, one column per entry of numerical_cols
print(X_numerical.shape)

(127987, 9)


## categorical features preprocess

In [None]:
# Label-encode the categorical columns 'drugName' and 'condition': every
# distinct value is mapped to an integer id. A separate encoder is kept per
# column so each mapping stays available afterwards.

# encode drugName
drugName_encode = LabelEncoder()
drug_names = df['drugName']
X_drugName = drugName_encode.fit_transform(drug_names)

# encode condition
condition_encode = LabelEncoder()
conditions = df['condition']
X_condition = condition_encode.fit_transform(conditions)

# both encodings are 1-D arrays with one entry per row
print(X_drugName.shape)
print(X_condition.shape)

# Derive calendar features from the review date. The 'date' column is
# converted to datetime in place, then year / month / day are added to df.
df['date'] = pd.to_datetime(df['date'])
review_dates = df['date']
df['year'] = review_dates.dt.year
df['month'] = review_dates.dt.month
df['day'] = review_dates.dt.day

# Assemble the feature matrix column-wise: padded text sequences, scaled
# numerical features, the two encoded categoricals (reshaped to single
# columns so they can be concatenated), then year / month / day.
date_features = df[['year', 'month', 'day']].values
feature_blocks = [
    X_text,
    X_numerical,
    X_drugName.reshape(-1, 1),
    X_condition.reshape(-1, 1),
    date_features,
]
X_comb1 = np.concatenate(feature_blocks, axis=1)
print(X_comb1.shape)

In [44]:
# Label-encode 'rating_category' into integer ids; the fitted encoder is kept
# so ids can be mapped back to the original category values.
rating_category_encode = LabelEncoder()
rating_categories = df['rating_category']
X_rating_category = rating_category_encode.fit_transform(rating_categories)

# 1-D array with one encoded value per row
print(X_rating_category.shape)

(127987,)


In [47]:
# Append the encoded rating category as one extra trailing column of the
# combined feature matrix.
rating_category_column = X_rating_category.reshape(-1, 1)
X_comb2 = np.concatenate([X_comb1, rating_category_column], axis=1)
print(X_comb2.shape)

(127987, 906)


## Preprocess of Target 'sentiment_label'

In [55]:
# Turn the sentiment labels into integer class ids. The fitted encoder is
# kept so the ids can be mapped back to label names through `classes_`.
sentiment_label_encode = LabelEncoder()
sentiment_labels = df['sentiment_label']
y_encode = sentiment_label_encode.fit_transform(sentiment_labels)

In [57]:
# One-hot encode the integer class ids: each row of y has a single 1 in the
# column of its sentiment class.
y = to_categorical(y_encode)

## Train Test Split

In [60]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced -- consider whether a stratified split is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X_comb2,
    y,
    test_size=0.25,
    random_state=123,
)

In [61]:
# Sanity-check the split: feature matrices first, then the one-hot targets.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print(*split_shapes)

(95990, 906) (31997, 906) (95990, 3) (31997, 3)


## Compute Class Weights
The dataset is imbalanced: there are more positive sentiment labels than negative ones. Class weights are used here to compensate for the imbalance.

In [64]:
# Recover the integer class ids from the one-hot training targets once and
# reuse them for both arguments below.
y_train_labels = y_train.argmax(axis=1)

# "balanced" weights are inversely proportional to class frequency:
# n_samples / (n_classes * count(class)), so rarer classes get larger weights.
# The result is an array ordered like `classes` (ascending class id).
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_labels),
    y=y_train_labels,
)

In [66]:
# Report the weight assigned to each sentiment class. The encoder's classes_
# are paired positionally with the entries of class_weights.
class_names = sentiment_label_encode.classes_
name_weight_pairs = zip(class_names, class_weights)
for class_name, weight in name_weight_pairs:
    print(f"Class '{class_name}': Weight {weight}")

Class 'negative': Weight 1.261499237764811
Class 'neutral': Weight 3.1545565085937755
Class 'positive': Weight 0.529019173431653


# Modeling

## LSTM Model

In [68]:
# Instantiate an empty Sequential model to hold the LSTM network; no layers
# are added here.
lstm_model = Sequential()



