# Sentiment Product Reviews

Build a learning-based classifier that predicts the sentiment of product reviews.

In [10]:
import os

## Path compatibility: when this is not run from app.py, set the project root
## below so that it becomes the working directory (a Jupyter notebook has no
## __file__ from which the root could be derived).
_root_path_ = 'D:\\_work\\Bi-Senti-EE6483'

# Switch to the configured project root. Later cells read relative paths such
# as 'data/train.json', which are resolved against the working directory.
# NOTE(review): with the assignment above in place this check is always true;
# presumably the assignment is meant to be removed or commented out when the
# notebook is already started from the project root - confirm.
if '_root_path_' in locals():
    os.chdir(_root_path_)
# Stop early if the working directory is not the project root folder.
assert os.path.basename(os.getcwd()) == 'Bi-Senti-EE6483'

import configparser as cp

# storage for multiple data processing, for comparison of different methods
from enum import IntFlag, auto
class F(IntFlag):
    """Bit flags describing a data flow: single stages and preset combinations."""

    # One bit per individual stage.
    train = 1 << 0
    test = 1 << 1
    val = 1 << 2
    # Composite flag covering all three stages.
    preset1 = train | test | val
    # Additional presets take the next free bits.
    preset2 = 1 << 3
    preset3 = 1 << 4

#****************************************************************************************************
# USER DEFINED HERE
# One entry per data flow to compare. The DATA_CONTAINER created below
# allocates one storage slot per entry; only the number of entries is read
# there, the entries' contents are not.
# NOTE(review): presumably each entry is meant to hold F flags describing that
# flow - confirm, all entries are currently empty.
dataflows = [
    [],
    [],
    []
]
#****************************************************************************************************

class DATA_CONTAINER(list):
    """List with one independent sub-list ("slot") per configured data flow.

    The number of slots is fixed at construction time. Slots are meant to be
    replaced or mutated by index (``container[i] = ...``); ``append`` is
    disabled so the container does not grow by accident. Other list mutators
    (``extend``, ``insert``, ...) are not overridden.
    """

    def __init__(self, dataflows):
        """Create one empty slot per entry of *dataflows*.

        Only the number of entries matters; their contents are not read.
        Every slot is a distinct list object.
        """
        super().__init__([] for _ in dataflows)

    def append(self, data):
        """Reject appending.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('Append prohibited. Can only change sublists')
container = DATA_CONTAINER(dataflows)

## Reading in data

First we need to load the data into our program. The most commonly used library for reading tabular data is **pandas**.

In [None]:
import pandas as pd
from copy import deepcopy

# Load the labelled (train) and the unlabelled (test) review sets, in that order.
labeled_df, unlabeled_df = (
    pd.read_json(json_path) for json_path in ('data/train.json', 'data/test.json')
)

# Print a structural summary of both frames.
for title, frame in (
    ("\nTrain DataFrame info:", labeled_df),
    ("\nTest DataFrame info:", unlabeled_df),
):
    print(title)
    frame.info()

# Fill every container slot with its own deep copies of both frames, so that
# changes made through one slot do not show up in the others.
for slot, _ in enumerate(container):
    container[slot] = [deepcopy(labeled_df), deepcopy(unlabeled_df)]


## Data Processing

### Preprocessing

Raw data is rarely clean and ready to use, so we need to preprocess it first. The most common preprocessing steps include:

- Removing useless characters
- Tokenization
- Removing stopwords
- Lemmatization
- Stemming

In [None]:
# for all sentences, we first apply regular expression to remove all special characters
import re

# Compiled once at definition time. Raw strings are required here: in an
# ordinary string literal "\." is an invalid escape sequence, which recent
# Python versions warn about.
_HTML_TAG_RE = re.compile(r'(<.*?>)')
_PUNCTUATION_RE = re.compile(r'[,\.!?:()"]')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z"]')


def re_removal(text: str) -> str:
    """Strip markup, punctuation and non-letters from *text* and lowercase it.

    Steps, in order:
      1. every ``<...>`` tag is replaced by a single space;
      2. the characters ``, . ! ? : ( ) "`` are deleted (no space is inserted,
         so ``"good.bad"`` becomes ``"goodbad"``);
      3. every remaining character that is not an ASCII letter is replaced by
         a space (runs of such characters yield runs of spaces);
      4. the result is lowercased.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed or trimmed.
    """
    text = _HTML_TAG_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    return text.lower()

# tokenizer is a function that splits a text(very long str) into words(list of str)
from functools import lru_cache


@lru_cache(maxsize=None)
def _ensure_nltk_resource(resource: str) -> None:
    """Ask NLTK to download *resource* at most once per process."""
    import nltk
    nltk.download(resource)


@lru_cache(maxsize=None)
def _spacy_model():
    """Load the English spaCy pipeline once and reuse it on later calls."""
    import spacy
    return spacy.load("en_core_web_sm")


@lru_cache(maxsize=None)
def _bert_tokenizer():
    """Load the 'bert-base-uncased' tokenizer once and reuse it on later calls."""
    from transformers import BertTokenizer
    return BertTokenizer.from_pretrained('bert-base-uncased')


@lru_cache(maxsize=None)
def _nltk_stopwords() -> frozenset:
    """Return NLTK's English stop-word list as a set, built once."""
    _ensure_nltk_resource('stopwords')
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def _spacy_doc(text):
    """Run the spaCy pipeline on a raw string or on an already-tokenized list."""
    nlp = _spacy_model()
    if isinstance(text, str):
        return nlp(text)
    # A list of words cannot be passed to nlp() directly; wrap it in a Doc so
    # the existing token boundaries are kept.
    # NOTE(review): calling nlp() on a Doc relies on spaCy >= 3.2 - confirm
    # against the installed version.
    from spacy.tokens import Doc
    return nlp(Doc(nlp.vocab, words=list(text)))


# tokenizer is a function that splits a text(very long str) into words(list of str)
def tokenize(text: str, method: str) -> list:
    """Split *text* into a list of tokens.

    Args:
        text: The text to split.
        method: One of ``'split'`` (whitespace split), ``'nltk'``, ``'spacy'``,
            ``'gensim'`` or ``'bert'``.

    Returns:
        The tokens as a list of strings.

    Raises:
        ValueError: if *method* is not one of the supported names.

    Heavy resources (NLTK data, the spaCy pipeline, the BERT tokenizer) are
    loaded on first use and then reused, so calling this once per review does
    not reload them every time.
    """
    if method == 'split':
        return text.split()
    elif method == 'nltk':
        _ensure_nltk_resource('punkt')
        from nltk.tokenize import word_tokenize
        return word_tokenize(text)
    elif method == 'spacy':
        return [token.text for token in _spacy_model()(text)]
    elif method == 'gensim':
        from gensim.utils import simple_preprocess
        return simple_preprocess(text)
    elif method == 'bert':
        return _bert_tokenizer().tokenize(text)
    else:
        raise ValueError('method not supported')


def remove_stopwords(text: list, method: str) -> list:
    """Drop English stop words from a list of tokens.

    Args:
        text: The tokens to filter. With ``method='spacy'`` a raw string is
            also accepted; it is tokenized by spaCy first.
        method: ``'nltk'`` or ``'spacy'``.

    Returns:
        The tokens that are not stop words, in their original order.

    Raises:
        ValueError: if *method* is not one of the supported names.
    """
    if method == 'nltk':
        stop_words = _nltk_stopwords()
        return [word for word in text if word not in stop_words]
    elif method == 'spacy':
        nlp = _spacy_model()
        if isinstance(text, str):
            return [token.text for token in nlp(text) if not token.is_stop]
        # For a token list, look each word up in the vocabulary: nlp() does
        # not accept a list, and the stop-word flag needs no pipeline run.
        return [word for word in text if not nlp.vocab[word].is_stop]
    else:
        raise ValueError('method not supported')


def lematize(text: list, method: str) -> list:
    """Lemmatize a list of tokens.

    Args:
        text: The tokens to lemmatize. With ``method='spacy'`` a raw string is
            also accepted; it is tokenized by spaCy first.
        method: ``'nltk'`` (WordNet lemmatizer) or ``'spacy'``.

    Returns:
        One lemma per token.

    Raises:
        ValueError: if *method* is not one of the supported names.
    """
    if method == 'nltk':
        _ensure_nltk_resource('wordnet')
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in text]
    elif method == 'spacy':
        return [token.lemma_ for token in _spacy_doc(text)]
    else:
        raise ValueError('method not supported')


### Embedding

The data we receive is text rather than numbers, so it has to be converted into numeric vectors before a model can use it. This conversion is called embedding.

- Word2Vec
- GloVe
- FastText
- BERT
- etc.

In [None]:
from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

def get_embeddings(texts, method='word2vec'):
    """Turn a collection of texts into numeric embeddings.

    Args:
        texts: The texts to embed. For ``'word2vec'``, ``'glove'`` and
            ``'fasttext'`` each text is a list of tokens; for ``'bert'`` each
            text is passed to the BERT tokenizer as is.
        method: ``'word2vec'`` or ``'fasttext'`` (trained on *texts*),
            ``'glove'`` (loaded from ``data/glove.6B.100d.txt``) or ``'bert'``
            (``bert-base-uncased``).

    Returns:
        A list with one entry per text:
          * word2vec / fasttext: the array returned by the model's vector
            lookup for the text's tokens; an empty text gives a ``(0, dim)``
            array;
          * glove: a list with one vector per token found in the vocabulary;
          * bert: the last hidden state averaged over the sequence axis, as a
            NumPy array.

    Raises:
        ValueError: if *method* is not one of the supported names.
    """
    # The corpus is traversed more than once (training, then lookup), so a
    # one-shot iterable has to be materialised first.
    texts = list(texts)

    def lookup_all(keyed_vectors):
        # The vector lookup cannot stack an empty token list, so an empty text
        # is mapped to an empty array instead of raising.
        empty = np.empty((0, keyed_vectors.vector_size), dtype=np.float32)
        return [keyed_vectors[text] if len(text) else empty.copy() for text in texts]

    if method == 'word2vec':
        model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
        embeddings = lookup_all(model.wv)
    elif method == 'glove':
        glove_file = 'data/glove.6B.100d.txt'
        glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
        embeddings = [[glove_model[word] for word in text if word in glove_model] for text in texts]
    elif method == 'fasttext':
        model = FastText(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
        embeddings = lookup_all(model.wv)
    elif method == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        embeddings = []
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            for text in texts:
                inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
                outputs = model(**inputs)
                embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())
    else:
        raise ValueError('method not supported')

    return embeddings


#texts = [tokenize(re_removal(review), method='split') for review in labeled_df['reviews']]
#embeddings = get_embeddings(texts, method='word2vec')

## Model Selection



### Traditional Machine Learning Models

- SVM
- Extreme Learning Machine (single-hidden-layer neural network)
- Gaussian Process
- Random Forest/ XGBoost/ LightGBM (Not Included)
- Linear (Not Included)

### Deep Learning Models

- RNN
- UniRNN
- LSTM
- BiLSTM
- GRU
- Bert
- Roberta
- DistilBert
- Albert
- etc.

## Training and Evaluation

In [None]:

# Methods that classify TF-IDF features with a scikit-learn style estimator.
_TFIDF_METHODS = ('SVM', 'ELM', 'GaussianProcess')

# Methods that train a Keras recurrent network on padded token-id sequences.
_RECURRENT_METHODS = ('RNN', 'UniRNN', 'LSTM', 'BiLSTM', 'GRU')

# Methods that embed reviews with a pretrained transformer and classify the
# embeddings with an SVC: method -> (tokenizer class, model class, checkpoint).
_TRANSFORMER_METHODS = {
    'BERT': ('BertTokenizer', 'BertModel', 'bert-base-uncased'),
    'RoBERTa': ('RobertaTokenizer', 'RobertaModel', 'roberta-base'),
    'DistilBert': ('DistilBertTokenizer', 'DistilBertModel', 'distilbert-base-uncased'),
    # Historical (misspelt) key, kept so existing callers keep working.
    'DistillBert': ('DistilBertTokenizer', 'DistilBertModel', 'distilbert-base-uncased'),
}


def _to_dense(matrix):
    """Convert a SciPy sparse matrix into a dense NumPy array."""
    return matrix.toarray()


def _train_tfidf_model(method, X_train, X_test, y_train):
    """Fit a TF-IDF + classifier pipeline; return (pipeline, test predictions)."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline

    if method == 'SVM':
        from sklearn.svm import SVC
        steps = [SVC()]
    elif method == 'ELM':
        # skelm exposes scikit-learn style estimators; 100 sigmoid neurons
        # mirrors the configuration this branch was written for.
        # NOTE(review): constructor arguments follow the scikit-elm docs -
        # confirm against the installed version.
        from skelm import ELMClassifier
        steps = [ELMClassifier(n_neurons=100, ufunc='sigm')]
    else:  # 'GaussianProcess'
        from sklearn.gaussian_process import GaussianProcessClassifier
        from sklearn.gaussian_process.kernels import RBF
        from sklearn.preprocessing import FunctionTransformer
        # TF-IDF yields a sparse matrix; the Gaussian process is assumed to
        # need dense input, hence the conversion step (memory-hungry for
        # large vocabularies).
        steps = [
            FunctionTransformer(_to_dense, accept_sparse=True),
            GaussianProcessClassifier(kernel=RBF()),
        ]

    model = make_pipeline(TfidfVectorizer(), *steps)
    model.fit(X_train, y_train)
    return model, model.predict(X_test)


def _recurrent_layers(method):
    """Return the recurrent layer stack that *method* names."""
    from keras.layers import GRU, LSTM, Bidirectional

    if method == 'RNN':
        return [LSTM(100)]
    if method == 'UniRNN':
        return [LSTM(100, return_sequences=True), LSTM(100)]
    if method == 'LSTM':
        return [LSTM(100, return_sequences=True), LSTM(100, return_sequences=True), LSTM(100)]
    if method == 'BiLSTM':
        return [Bidirectional(LSTM(100))]
    return [GRU(100)]  # 'GRU'


def _train_recurrent_model(method, X_train, X_test, y_train):
    """Fit an embedding + recurrent + sigmoid network; return (model, 0/1 predictions)."""
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.models import Sequential
    from keras.layers import Embedding, Dense
    from keras.optimizers import Adam

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post')
    seq_len = train_seq.shape[1]
    # The test set must be padded/truncated to the training length; padding
    # it to its own longest review would give a different input width.
    test_seq = pad_sequences(
        tokenizer.texts_to_sequences(X_test), padding='post', truncating='post', maxlen=seq_len
    )

    model = Sequential()
    # +1 because Keras word indices start at 1; 0 is the padding id.
    model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=seq_len))
    for layer in _recurrent_layers(method):
        model.add(layer)
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    model.fit(train_seq, y_train, epochs=10, batch_size=64)

    # Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
    y_pred = [1 if pred > 0.5 else 0 for pred in model.predict(test_seq).ravel()]
    return model, y_pred


def _train_transformer_svc(method, X_train, X_test, y_train):
    """Embed reviews with a pretrained transformer, fit an SVC; return (svc, predictions)."""
    import numpy as np
    import torch
    import transformers
    from sklearn.svm import SVC

    tokenizer_name, model_name, checkpoint = _TRANSFORMER_METHODS[method]
    tokenizer = getattr(transformers, tokenizer_name).from_pretrained(checkpoint)
    encoder = getattr(transformers, model_name).from_pretrained(checkpoint)

    def embed(reviews):
        rows = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for review in reviews:
                inputs = tokenizer(review, return_tensors='pt', padding=True, truncation=True)
                rows.append(encoder(**inputs).last_hidden_state.mean(dim=1).numpy())
        # Each row is (1, hidden); stack them into the 2-D (n_reviews, hidden)
        # matrix that SVC expects.
        return np.vstack(rows)

    model = SVC()
    model.fit(embed(X_train), y_train)
    return model, model.predict(embed(X_test))


def model_train(method: str, train_test_data: pd.DataFrame):
    """Train one classifier on a labelled review frame and score it on a hold-out split.

    The frame is split 80/20 (``random_state=42``) on its ``'reviews'`` (text)
    and ``'sentiment'`` (label) columns; the model is fitted on the larger
    part and evaluated on the smaller one.

    Args:
        method: Model to train. One of ``'SVM'``, ``'ELM'``,
            ``'GaussianProcess'`` (TF-IDF features); ``'RNN'``, ``'UniRNN'``,
            ``'LSTM'``, ``'BiLSTM'``, ``'GRU'`` (Keras recurrent networks);
            ``'BERT'``, ``'RoBERTa'``, ``'DistilBert'`` (pretrained
            transformer embeddings + SVC). The earlier spelling
            ``'DistillBert'`` is still accepted.
        train_test_data: Frame with ``'reviews'`` and ``'sentiment'`` columns.

    Returns:
        ``(model, accuracy)``: the fitted model object and its accuracy on the
        hold-out split. For the TF-IDF methods the model is a complete
        pipeline that accepts raw text; for the other methods it is only the
        final estimator/network.

    Raises:
        ValueError: if *method* is not one of the supported names.
    """
    # Resolve the trainer first so an unsupported name fails before any data
    # handling or heavy imports happen.
    if method in _TFIDF_METHODS:
        trainer = _train_tfidf_model
    elif method in _RECURRENT_METHODS:
        trainer = _train_recurrent_model
    elif method in _TRANSFORMER_METHODS:
        trainer = _train_transformer_svc
    else:
        raise ValueError('method not supported')

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(
        train_test_data['reviews'], train_test_data['sentiment'], test_size=0.2, random_state=42
    )
    model, y_pred = trainer(method, X_train, X_test, y_train)
    accuracy = accuracy_score(y_test, y_pred)
    return model, accuracy

## Prediction