In [None]:
#Code from this notebook is referenced from https://www.kaggle.com/sanxuwen/shopee-sentiment-analysis-2nd-place-solution

#The training sample and parameters in this notebook are much smaller than in an actual competition, to save on training time. In addition,
#if an actual submission is desired, the classification results should be averaged across most/all of the models listed.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import TFAutoModel ,TFAutoModelForSequenceClassification, AutoTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
import os
import emoji
import re

In [None]:
#Switch the working directory to the competition data folder so the CSV files below can be opened by bare filename.
#NOTE(review): this changes the working directory for the whole kernel session; later relative paths resolve here.
os.chdir("/kaggle/input/shopee-code-league-20/_DS_Sentiment_Analysis/")

In [None]:
#Labelled training data; the 'review' and 'rating' columns are used below.
train_data = pd.read_csv("train.csv")

In [None]:
#Competition test file, resolved relative to the current working directory.
#NOTE(review): no later cell shown here reads test_data - confirm whether it is still needed.
test_data = pd.read_csv("test.csv")

In [None]:
#Features (raw review text) and targets (ratings) as pandas Series.
X = train_data.review
y = train_data.rating

In [None]:
#Preprocessor
#Data cleaning functions emoji_cleaning, review_cleaning, and encode function from 
    #https://www.kaggle.com/sanxuwen/shopee-sentiment-analysis-2nd-place-solution

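#Note: the targets ARE one-hot encoded (see one_hot_encode, which preprocess calls), matching the categorical_crossentropy loss used when the model is compiled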
class Sentence_Preprocessor():
    """Cleans raw review text and turns it into padded token ids.

    Parameters
    ----------
    tokenizer : object exposing ``encode`` and ``batch_encode_plus``
        (a Hugging Face tokenizer in this notebook).
    MAX_LEN : int
        Length every encoded review is padded to, used by both
        ``preprocess_sentence`` and ``preprocess``.
    """

    # Emoticons are matched as whole runs (":)))" is one smile) so a short
    # pattern can never eat the front of a longer emoticon and leave stray
    # parentheses behind.
    _SAD_EMOTICON = re.compile(r':\(+|: \(\(+|:, \(+')
    _HAPPY_EMOTICON = re.compile(r'[:;]\)+|=\){4,}')

    def __init__(self, tokenizer, MAX_LEN = 300):
        self._tokenizer = tokenizer
        self._MAX_LEN = MAX_LEN

    def _replace_emoticons(self, text):
        """Replace text emoticons with the words 'dislike' / 'smile'.

        The replacement is space-padded so the word becomes its own token
        even when the emoticon was glued to the previous word ("bad:(").
        """
        text = self._SAD_EMOTICON.sub(' dislike ', text)
        return self._HAPPY_EMOTICON.sub(' smile ', text)

    #Cleaning out all emojis
    def emoji_cleaning(self, text):
        """Convert emoji to their text names and drop repeated tokens.

        Note that *every* repeated whitespace-separated token is dropped
        (first occurrence kept), not only emoji names.
        """
        # Change emoji to text
        text = emoji.demojize(text).replace(":", " ")

        # Delete repeated tokens; dict.fromkeys keeps first-occurrence order
        # without the quadratic "x not in list" scan.
        unique_tokens = dict.fromkeys(text.split())

        text = ' '.join(unique_tokens)
        return text.replace("_", " ").replace("-", " ")

    #Lower-casing, emoticon replacement and punctuation spacing
    def review_cleaning(self, text):
        """Lower-case ``text``, replace emoticons and isolate punctuation.

        Emoticons are replaced BEFORE punctuation is space-padded; doing it
        afterwards (as before) meant ":)" had already become ": )" and no
        emoticon pattern could ever match.
        """
        text = text.lower()

        # change emoticon to text
        text = self._replace_emoticons(text)

        # surround punctuation with spaces so each mark is its own token
        text = re.sub(r'([.,!?()])', r' \1 ', text)

        # collapse every whitespace run (including newlines) to one space;
        # newlines become separators instead of gluing two words together
        return ' '.join(text.split())

    def clean_sentence(self, sentence):
        """Run the full cleaning chain on one raw review string."""
        # Emoticons must be handled first: emoji_cleaning strips every ':'
        # and would destroy ":)" / ":(" before they can be recognised.
        sentence = self._replace_emoticons(sentence)

        sentence = self.emoji_cleaning(sentence)

        sentence = self.review_cleaning(sentence)

        return sentence

    def preprocess_sentence(self, sentence):
        """Clean one review and return its token ids padded to ``MAX_LEN``."""
        sentence = self.clean_sentence(sentence)

        # NOTE(review): pad_to_max_length is the keyword used by older
        # transformers releases - confirm against the installed version.
        sentence = self._tokenizer.encode(sentence, pad_to_max_length = True, max_length = self._MAX_LEN)

        return sentence

    def one_hot_encode(self, y, num_classes = 5):
        """One-hot encode 1-based integer ratings.

        Parameters
        ----------
        y : 1-D array-like of ints in ``1..num_classes``.
        num_classes : int, number of rating categories (default 5).

        Returns
        -------
        np.ndarray of shape ``(len(y), num_classes)`` with a single 1 per row.

        Raises
        ------
        ValueError
            If a rating lies outside ``1..num_classes``; previously a rating
            of 0 silently wrapped around to the last class via index -1.
        """
        labels = np.asarray(y, dtype = int)

        if labels.size and (labels.min() < 1 or labels.max() > num_classes):
            raise ValueError("ratings must be in the range 1..%d" % num_classes)

        y_post = np.zeros((labels.shape[0], num_classes))

        # ratings are 1-based, columns are 0-based
        y_post[np.arange(labels.shape[0]), labels - 1] = 1

        return y_post

    def preprocess(self, X, y = None):
        """Clean and encode a Series of reviews (and optionally targets).

        Parameters
        ----------
        X : pd.Series of raw review strings.
        y : optional pd.Series / array-like of ratings.

        Returns
        -------
        ``input_ids`` as an np.ndarray, or ``(input_ids, one_hot_targets)``
        when ``y`` is given.
        """
        #This function assumes X and y are pd Series, since thats how the data will be read

        X = X.apply(self.clean_sentence)

        # Pad to the length given to the constructor (not a notebook-level
        # global) so this agrees with preprocess_sentence, and hand the
        # tokenizer a plain list rather than a Series.
        # NOTE(review): return_attention_masks / pad_to_max_length are the
        # keyword names of older transformers releases - confirm them against
        # the installed version.
        encoded = self._tokenizer.batch_encode_plus(
                 list(X),
                 return_attention_masks=True,
                 return_token_type_ids=False,
                 pad_to_max_length=True,
                 max_length=self._MAX_LEN)

        input_ids = np.array(encoded['input_ids'])

        if y is not None:
            return input_ids, self.one_hot_encode(y)
        return input_ids

In [None]:
#Pipeline
class Pipeline():
    """Ties a Sentence_Preprocessor to a compiled Keras model.

    Parameters
    ----------
    tokenizer : tokenizer handed to ``Sentence_Preprocessor``.
    model : compiled model exposing ``fit`` and ``predict``.
    max_len : int or None
        Padded sequence length. ``None`` (the default) resolves to the
        notebook-level ``PADDED_LEN`` when the pipeline is constructed.
        The lookup is deferred so this class can be defined before the
        hyper-parameter cell has run.
    """

    def __init__(self, tokenizer, model, max_len = None):

        if max_len is None:
            max_len = PADDED_LEN

        self._model = model
        self._preprocessor = Sentence_Preprocessor(tokenizer, max_len)


    def preprocess(self, X_train, X_test, y_train, y_test):
        """Clean/encode both splits; returns (X_train, X_test, y_train, y_test)."""

        print('preprocessing...')

        X_train, y_train = self._preprocessor.preprocess(X_train, y_train)

        X_test, y_test = self._preprocessor.preprocess(X_test, y_test)

        print('finished preprocessing!')

        return X_train, X_test, y_train, y_test


    def to_tensorflow_dataset(self, X_train, X_test, y_train, y_test, batch_size):
        """Wrap the arrays in batched tf.data Datasets.

        The training dataset repeats indefinitely, so the caller must bound
        each epoch with ``steps_per_epoch``. ``batch_size`` is honoured here
        (a notebook global was used before and the argument was ignored).
        """

        print('making Tensorflow Dataset...')

        train_dataset = (tf.data.Dataset
                        .from_tensor_slices((X_train, y_train))
                        .repeat()
                        .shuffle(20)
                        .batch(batch_size))

        test_dataset = (tf.data.Dataset
                        .from_tensor_slices((X_test, y_test))
                        .shuffle(20)
                        .batch(batch_size))

        print('completed')

        return train_dataset, test_dataset

    def fit(self, X_train, X_test, y_train, y_test, epochs = 2, batch_size = 100, auto_preprocess = True):
        """Train the wrapped model and return what ``model.fit`` returns.

        Parameters
        ----------
        X_train, X_test, y_train, y_test : raw Series when
            ``auto_preprocess`` is True, otherwise already-encoded arrays.
        epochs : number of passes over the training data.
        batch_size : examples per training/validation batch.
        auto_preprocess : run ``self.preprocess`` on the inputs first.
        """

        if auto_preprocess:
            X_train, X_test, y_train, y_test = self.preprocess(X_train, X_test, y_train, y_test)

        train_dataset, test_dataset = self.to_tensorflow_dataset(X_train, X_test, y_train, y_test, batch_size)

        # One epoch = one pass over the data = ceil(n / batch_size) batches.
        # Using n itself made every "epoch" batch_size passes long.
        steps_per_epoch = max(1, (X_train.shape[0] + batch_size - 1) // batch_size)

        print('training...')

        return self._model.fit(
            train_dataset,
            steps_per_epoch = steps_per_epoch,
            validation_data = test_dataset,
            epochs = epochs)


    def predict_single_review(self, X_pred, final_category_only = False):
        """Predict for one raw review string.

        Returns the model output of shape (1, n_classes), or, when
        ``final_category_only`` is True, a length-1 array holding the
        1-based rating.
        """

        X_pred = np.array(self._preprocessor.preprocess_sentence(X_pred))
        # add the batch dimension: (seq_len,) -> (1, seq_len)
        X_pred = X_pred.reshape((1, X_pred.shape[0]))

        # use the model owned by this pipeline, not a notebook global
        prediction = self._model.predict(X_pred)

        if final_category_only:
            return np.argmax(prediction, axis = 1) + 1 #add 1 because ratings are 1-5, not 0-4
        return prediction


    def predict_many_reviews(self, X_pred, final_category_only = False, batch_size = 20):
        """Predict for a Series of raw reviews.

        Returns the model output for every review, or the 1-based ratings
        when ``final_category_only`` is True. ``batch_size`` controls the
        prediction batches (default 20, the previously hard-coded value).
        """

        X_pred = self._preprocessor.preprocess(X_pred)

        X_pred = (tf.data.Dataset
                  .from_tensor_slices(X_pred)
                  .batch(batch_size))

        prediction = self._model.predict(X_pred)

        if final_category_only:
            return np.argmax(prediction, axis = 1) + 1
        return prediction


    def get_preprocessor(self):
        """Return the Sentence_Preprocessor used by this pipeline."""
        return self._preprocessor

In [None]:
# emoji_index = train_data.apply(lambda seq : any(char in emoji.UNICODE_EMOJI for char in seq))
# train_data[emoji_index] = train_data[emoji_index].apply(emoji_cleaning)
# train_data = train_data.apply(review_cleaning)

In [None]:
#Bert model from https://huggingface.co/transformers/model_doc/auto.html

#Candidate checkpoints; only the one assigned to base_model_name below is actually loaded.
base_model_names = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base', 'xlm-mlm-en-2048'] #'xlnet-base-cased'

# base_model_name = base_model_names[np.random.randint(len(base_model_names))]

base_model_name = 'bert-base-uncased'

#Tokenizer and model are loaded from the same checkpoint name so they stay matched.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

base_model = TFAutoModelForSequenceClassification.from_pretrained(base_model_name)

#Freeze the first layer of the loaded model so fit() does not update its weights.
#NOTE(review): presumably layers[0] is the pretrained transformer body - confirm with base_model.summary().
base_model.layers[0].trainable = False

In [None]:
#Visualize the distribution of review lengths (in characters) after cleaning
import seaborn as sns
import matplotlib.pyplot as plt

#the first 1000 reviews were used to save time
reviews = train_data['review'][:1000]

#create a quick preprocessor to clean the data
preprocessor = Sentence_Preprocessor(tokenizer)

reviews = reviews.apply(preprocessor.clean_sentence)

#.str.len() counts characters, not tokens
sns.distplot(reviews.str.len())
plt.xlabel('cleaned review length (characters)')
plt.title('Length of the first 1000 cleaned reviews')
#plt.show must be CALLED; the bare name was a no-op expression
plt.show()

In [None]:
#Padding reviews to a length of 300 should be safe
#Defining Hyper Parameters here
#PADDED_LEN is set to 5 rather than the 300 noted above (the original value is kept in the trailing comment);
#build_model reads it for the model's input shape.
PADDED_LEN = 5 #300
#Number of training epochs and examples per batch intended for this run.
EPOCHS = 5
BATCH_SIZE = 10

In [None]:
# code from https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a
def build_model(base_model, num_labels):
    """Stack a small convolutional head on ``base_model`` and compile it.

    Parameters
    ----------
    base_model : callable on a (batch, PADDED_LEN) int32 tensor whose result
        is indexable; element [0] is used and must hold ``num_labels`` values
        per example (the Reshape below requires it).
    num_labels : int, size of that per-example output.

    Returns
    -------
    A compiled tf.keras.Model with a 5-way softmax output, trained with
    categorical cross-entropy (so targets must be one-hot encoded).
    """
    token_ids = tf.keras.layers.Input(shape = (PADDED_LEN,), dtype=tf.int32)

    # TODO in the future: add attention masks

    # the base model wraps its results in a tuple, so [0] extracts them
    base_output = base_model(token_ids)[0] #[:,0,:]

    # give the output a length-1 "sequence" axis so Conv1D can be applied
    features = tf.keras.layers.Reshape((1,num_labels))(base_output)
    features = tf.keras.layers.Dropout(0.2)(features)

    # (filters, kernel_size) for the three causal convolutions, in order
    conv_specs = (
        (num_labels * 8, 1),
        (num_labels * 4, num_labels),
        (num_labels, num_labels),
    )
    for n_filters, kernel_width in conv_specs:
        features = tf.keras.layers.Conv1D(n_filters, kernel_width, padding = "causal", activation = 'relu')(features)

    # pooling removes the sequence axis again -> (batch, num_labels)
    features = tf.keras.layers.GlobalAveragePooling1D()(features)

    # Residual connection back to the base model output
    residual = tf.keras.layers.Add()([base_output, features])

    probabilities = tf.keras.layers.Dense(5, activation = 'softmax')(residual)

    classifier = tf.keras.Model(inputs = token_ids, outputs = probabilities)
    classifier.compile(optimizer = "adam", loss='categorical_crossentropy', metrics=['accuracy'])

    return classifier


In [None]:
#Build the classifier on top of the loaded base model; num_labels comes from the base model's own config.
model = build_model(base_model, base_model.config.num_labels)

In [None]:
#Print the layer-by-layer structure and parameter counts of the assembled model.
model.summary()

In [None]:
#Bundle the tokenizer-based preprocessor and the compiled model into one object.
pipeline = Pipeline(tokenizer, model)

In [None]:
#Hold out 20% of the labelled reviews for validation; random_state fixes the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Pass the notebook's hyper-parameters explicitly; otherwise fit() falls back to its own defaults and EPOCHS is never used.
pipeline.fit(X_train, X_test, y_train, y_test, epochs = EPOCHS, batch_size = BATCH_SIZE)

In [None]:
#When working with TensorFlow Datasets, using np is much easier than pd. Make sure X is a 2D array and y is a 1D array
# train_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices((X_train, y_train))
#     .repeat()
#     .shuffle(1024)
#     .batch(20)
# )

# !!! Note about this model and tensorflow Datasets:
# A single sentence currently has a shape of (PADDED_LEN,). Thus, when trying to fit/predict for a single sentence, the model reads the
    #shape of the sentence, sees shape[0] is PADDED_LEN, and assumes that there are PADDED_LEN sentences instead of just 1 sentence. 
    
    #Tensorflow Dataset batch method solves this problem since when batching, a batch dimension is added to the shape
    #of the sample: (BATCH_SIZE, _shape_). Hence, when we pass the tensorflow batches to the model, the model reads 
    #that there are BATCH_SIZE number of sentences (which is correct), and outputs the right number of predictions.
    
    #An alternative for using tensorflow dataset batch to predict a sentence is to reshape the sentence into 
    #shape (1, PADDED_LEN). That way, the model will see shape[0] is 1 and correctly think that there is only 1 sentence
    
    #In light of this, it is clear that for training/predicting on large datasets we should use TFDS batch, while if only predicting for a
    #single sentence, reshaping is more efficient
    
#TLDR: Problem: when predicting for a single sentence, model outputs PADDED_LEN different predictions
    #Reason: a single sentence has shape (PADDED_LEN,). Model misinterprets the input as having PADDED_LEN sentences instead.
    #Solution: reshape the sentence into shape (1,PADDED_LEN) or use Tensorflow Datasets batch method to batch many sentences together.

In [None]:
#Sanity check: predict the ratings for the first 13 training reviews.
pipeline.predict_many_reviews(train_data.review[:13], final_category_only=True)