# Download data from Kaggle

In [99]:
# Colab-only: opens an upload prompt so the Kaggle API token lands in the working directory.
from google.colab import files
files.upload() # upload kaggle.json

# Install the Kaggle CLI and copy the token into ~/.kaggle.
!pip install -q kaggle
!mkdir -p ~/.kaggle
# NOTE(review): if Colab stores the upload under another name (e.g. "kaggle (3).json"),
# this copies whatever older kaggle.json is already in the directory — confirm.
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
# The token is a credential: restrict it to owner read/write.
!chmod 600 /root/.kaggle/kaggle.json

# Download the review dataset archive into the current directory.
!kaggle datasets download -d hienbm/vietnamese-ecommerce-review

Saving kaggle.json to kaggle (3).json
kaggle.json
vietnamese-ecommerce-review.zip: Skipping, found more recently modified local copy (use --force to force download)


# Install libraries and prepare the dataset

In [100]:
# Third-party packages used by the preprocessing cells (emoji handling and Vietnamese tokenisation).
# NOTE(review): versions are not pinned, so a fresh runtime may install releases with a different API.
!pip install emot
!pip install emoji
# Alternative word segmenter (VnCoreNLP), currently disabled in favour of pyvi:
# !pip install vncorenlp
# !mkdir -p vncorenlp/models/wordsegmenter
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
# !mv VnCoreNLP-1.1.1.jar vncorenlp/ 
# !mv vi-vocab vncorenlp/models/wordsegmenter/
# !mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/
!pip install pyvi



In [101]:
import pandas as pd

# Typo table: pairs each 'word' entry with its 'typo_mistake' entry.
# NOTE(review): dict_typo_mistake is not referenced by any later cell in the visible notebook.
typo_mistake = pd.read_table('https://raw.githubusercontent.com/HienBM/text_normalization/main/correct_typo_mistake.txt', delimiter='\t')
dict_typo_mistake = dict(zip(typo_mistake['word'], typo_mistake['typo_mistake']))

# Vietnamese word list (Viet74K).
# NOTE(review): vietnamese_word_dict is not referenced by any later cell in the visible notebook.
vietnamese_word_dict = pd.read_table("https://raw.githubusercontent.com/undertheseanlp/dictionary/hongocduc/data/Viet74K.txt")

# English adjective ('Adj') -> Vietnamese meaning ('Mean'); used by preprocessing_vietnamese.
en2vi_path = pd.read_table('https://raw.githubusercontent.com/HienBM/text_normalization/main/top_500_adjective_eng.txt', delimiter='\t')
en2vi_dict = dict(zip(en2vi_path['Adj'], en2vi_path['Mean']))

# Character ('char') -> corrected character ('correct_char'); used by preprocessing_vietnamese.
font_correct_path = pd.read_table('https://raw.githubusercontent.com/HienBM/text_normalization/main/correct_char.txt', delimiter='\t')
font_correct_dict = dict(zip(font_correct_path['char'], font_correct_path['correct_char']))

In [102]:
import pandas as pd

# Read the review table straight from the downloaded Kaggle archive.
df = pd.read_csv('/content/vietnamese-ecommerce-review.zip', compression='zip')
df = df.dropna().drop_duplicates()

# Keep only rated, non-empty reviews.
# `len(df['content'])` is the number of rows (one scalar), so it never filtered anything;
# the per-row check has to go through the `.str` accessor.
has_text = df['content'].astype(str).str.strip().str.len() > 0
df = df[(df['score'] > 0) & has_text]

# Shuffle with a fixed seed so the 100k-row subset (and every downstream metric) is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df[0:100000]

# Preprocessing data

In [103]:
from unicodedata import digit
import re
import string
import emoji
import functools
import operator
# from vncorenlp import VnCoreNLP
from pyvi import ViTokenizer, ViPosTagger, ViUtils
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")


def remove_duplicate_emoji(orig_str):
    """Drop every emoji that equals the most recently kept emoji.

    Characters are scanned left to right. Non-emoji characters are always kept
    and do not reset the "last emoji" memory, so "X X" (same emoji separated by
    a space) keeps only the first X.

    Args:
        orig_str: text to scan, one character at a time.

    Returns:
        The text with repeated emoji removed.
    """
    kept_chars = []
    last_emoji = None
    for ch in orig_str:
        is_emoji = ch in UNICODE_EMOJI
        if is_emoji and ch == last_emoji:
            # Same emoji as the last one we kept: skip it.
            continue
        if is_emoji:
            last_emoji = ch
        kept_chars.append(ch)
    return "".join(kept_chars)


@functools.lru_cache(maxsize=1)
def _emoji_split_pattern():
    """Return a compiled regex with ONE capture group matching any emoji (built once).

    `emoji.get_emoji_regexp()` only exists in emoji < 2.0. On newer releases the
    equivalent pattern is rebuilt from `emoji.EMOJI_DATA`, longest sequences
    first so multi-codepoint emoji win over their prefixes.
    """
    legacy_factory = getattr(emoji, "get_emoji_regexp", None)
    if legacy_factory is not None:
        return legacy_factory()
    longest_first = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    return re.compile("(" + "|".join(re.escape(e) for e in longest_first) + ")")


def split_emoji(string):
    """Put a single space around every emoji and collapse all other whitespace.

    Args:
        string: text that may contain emoji glued to words or to each other.

    Returns:
        The whitespace-separated pieces of `string`, with each emoji as its own
        piece, joined by single spaces.
    """
    # The capture group makes `split` keep the emoji themselves in the result.
    pieces = _emoji_split_pattern().split(string)
    # Flatten in one pass (the previous reduce(operator.concat) was quadratic).
    return ' '.join(token for piece in pieces for token in piece.split())


@functools.lru_cache(maxsize=1)
def _accepted_characters():
    """Return the set of characters kept by `preprocessing_vietnamese` (built once)."""
    vietnamese_letters = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
    return (frozenset(vietnamese_letters)
            | frozenset(UNICODE_EMOJI)
            | frozenset(string.ascii_lowercase)
            | frozenset(' '))


def preprocessing_vietnamese(text):
    """Normalise one raw review into lower-case, space-separated Vietnamese text.

    Steps, in order: strip HTML tags, mentions, hashtags and URLs; lower-case;
    apply `font_correct_dict`; replace every character outside the accepted set
    (Vietnamese letters, a-z, emoji, space) with a space; shorten elongated
    a-z sequences; drop immediately repeated words; space out and de-duplicate
    emoji; translate common English adjectives via `en2vi_dict`; collapse
    whitespace. Digits and punctuation are not in the accepted set, so they
    become spaces. Tokenisation is not done here (see `autocorrect_and_tokenize`).

    Args:
        text: raw review string.

    Returns:
        The cleaned string (may be empty).
    """
    # Set membership is O(1); the previous per-call list made every character
    # lookup scan the whole emoji table.
    accept_char = _accepted_characters()

    # remove HTML tags, mentions, hashtags, URLs
    text = re.sub(r'<[^<]+?>', '', text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    text = re.sub(r'http[s]?://\S+', '', text)  # raw string: '\S' is an invalid escape otherwise

    # convert text to lowercase
    text = text.lower()

    # map mis-encoded characters to their correct form
    for k, v in font_correct_dict.items():
        text = text.replace(k, v)

    # blank out characters that are not in the accepted set
    text = ''.join(letter if letter in accept_char else ' ' for letter in text)

    # normalise elongated words: any a-z sequence repeated 2+ times is cut to two repeats
    text = re.sub(r'([a-z]+?)\1+', r'\1\1', text)

    # drop consecutive repetitions of the same word
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    # separate emoji from words, then drop duplicate emoji
    text = split_emoji(text)
    text = remove_duplicate_emoji(text)

    # pad both ends so words at the edges match the ' word ' lookup below
    text = text.center(len(text) + 4)

    # translate common English adjectives into Vietnamese
    for k, v in en2vi_dict.items():
        text = text.replace(' ' + k + ' ', ' ' + v + ' ')

    # collapse every run of whitespace (including tabs) and trim the ends
    return ' '.join(text.split())

In [104]:
# Clean every raw review; the result is stored next to the original text in 'preprocess'.
df['preprocess'] = df['content'].map(preprocessing_vietnamese)

## Autocorrect word with difflib library and tokenize with pyvi

In [None]:
import pandas as pd
import numpy as np
import difflib

def count_ngrams(series: pd.Series, n: int) -> pd.Series:
    """Count word n-grams over a Series of space-separated sentences.

    N-grams never cross sentence boundaries: tokens are grouped by the row
    they came from before being shifted.

    Args:
        series: one sentence per row, tokens separated by a single space.
        n: n-gram size (1 counts single tokens).

    Returns:
        A Series indexed by n-gram text with occurrence counts, most frequent first.
    """
    tokens = series.str.split(' ').explode()
    tokens_by_row = tokens.groupby(level=0)

    ngrams = tokens
    for offset in range(1, n):
        # Always append the ORIGINAL token `offset` places ahead. Shifting the
        # partially built n-grams instead (as before) yields 4-grams for n=3.
        # NaNs are kept until the end so the index stays identical to `tokens`.
        ngrams = ngrams + ' ' + tokens_by_row.shift(-offset)

    # Positions too close to the end of a sentence have no full n-gram.
    return ngrams.dropna().value_counts()

top_percentile = 95     # bi-grams at or above this frequency percentile are treated as "correct"
bi_grams = count_ngrams(df['preprocess'], n=2)
# Name the count column explicitly: the name of a value_counts() result differs
# between pandas 1.x ('preprocess') and pandas 2.x ('count').
df_bi_grams = pd.DataFrame({'preprocess': bi_grams})
top_bi_grams = np.floor(np.percentile(bi_grams.to_numpy(), top_percentile))

word_top_bi_grams = df_bi_grams[df_bi_grams['preprocess'] >= top_bi_grams] # frequent bi-grams, used as the reference spelling
word_bottom_bi_grams = df_bi_grams[df_bi_grams['preprocess'] < top_bi_grams] # remaining rare bi-grams, candidates for correction

list_word_top_bi_grams = word_top_bi_grams.index.tolist()
list_word_bottom_bi_grams = word_bottom_bi_grams.index.tolist()

# Map each rare bi-gram to its closest frequent bi-gram when the difflib
# similarity ratio is at least 0.95.
correct_word_dict = {}
for elem in list_word_bottom_bi_grams:
    closest = difflib.get_close_matches(elem, list_word_top_bi_grams, cutoff=0.95)
    if closest:
        correct_word_dict[elem] = closest[0]

# Show the size and a small sample instead of dumping the whole dictionary.
print(f"{len(correct_word_dict)} bi-grams will be auto-corrected")
print(dict(list(correct_word_dict.items())[:20]))

In [None]:
def autocorrect_and_tokenize(text):
    """Apply the bi-gram corrections from `correct_word_dict`, then tokenise with pyvi.

    Args:
        text: a cleaned, space-separated sentence.

    Returns:
        The corrected sentence after `ViTokenizer.tokenize`.
    """
    # Two spaces on each side so entries at either end match the ' key ' lookup.
    padded = text.center(len(text) + 4)

    # Replace each rare bi-gram with its frequent counterpart.
    for wrong, right in correct_word_dict.items():
        padded = padded.replace(' ' + wrong + ' ', ' ' + right + ' ')

    # Trim trailing whitespace, then collapse remaining whitespace runs (tabs included).
    trimmed = re.sub(r'^\s+$|\s+$', ' ', padded).strip()
    collapsed = re.sub(r'^\s*|\s\s*', ' ', trimmed).strip()

    return ViTokenizer.tokenize(collapsed)

In [None]:
# Overwrite 'preprocess' with the auto-corrected, pyvi-tokenised text.
# NOTE(review): this cell rewrites its own input, so running it twice applies the step twice.
df['preprocess'] = df['preprocess'].apply(autocorrect_and_tokenize)
df

In [None]:
# Label sentiment from the star rating: 1-3 stars -> negative (0), 4-5 stars -> positive (1)
sentiment = {1: 0,
            2: 0,
            3: 0,
            4: 1,
            5: 1}

# Build a fresh frame in one chain so later column assignments do not write
# into a filtered view of the previous frame.
df = (
    df.assign(target=df["score"].map(sentiment))
      .loc[:, ['preprocess', 'target']]
      .dropna()                       # drops ratings that are not in the mapping
      .query('preprocess != ""')      # drops reviews that became empty after cleaning
      .reset_index(drop=True)
)

# An unmapped rating turns the column into float; tf.one_hot needs integer labels.
df["target"] = df["target"].astype(int)

# Number of whitespace-separated tokens per review.
df['num_word'] = df['preprocess'].str.split().str.len()
df.head()

# Preparing train, test, val data

In [None]:
from sklearn.model_selection import train_test_split

# 80% train; the split is stratified on the label and seeded.
train_df , remaining = train_test_split(df,
                                        train_size=0.8,
                                        random_state=42,
                                        stratify=df.target.values)

# Split the remaining 20% in half: 10% validation, 10% test (also stratified).
val_df, test_df = train_test_split(remaining,
                                   train_size=0.5,
                                   random_state=42,
                                   stratify=remaining.target.values)

len(train_df), len(val_df), len(test_df)

In [None]:
# Plain Python lists of the preprocessed text, one entry per review, for each split.
train_sentences = train_df["preprocess"].tolist()
val_sentences = val_df["preprocess"].tolist()
test_sentences = test_df["preprocess"].tolist()

# Getting a base model

## Base model Naive Bayes

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarise classification quality as accuracy, precision, recall and F1.

  Precision, recall and F1 use the "weighted" average. Accuracy is reported
  on a 0-100 scale, the other three on a 0-1 scale.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall", "f1".
  """
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {"accuracy": accuracy_score(y_true, y_pred) * 100,
          "precision": precision,
          "recall": recall,
          "f1": f1}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features (default vectorizer settings) fed into multinomial Naive Bayes.
model_0 = Pipeline([
  ("tf-idf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data (trailing ';' hides the estimator repr)
model_0.fit(X=train_sentences, 
            y=train_df['target']);

In [None]:
# Score the baseline on the held-out TEST split, like every other model in the
# final comparison table. The baseline has no hyper-parameters tuned on
# validation data, so the validation split is not needed here.
model_0_predict = model_0.predict(test_sentences)

model_0_results = calculate_results(y_true=test_df['target'],
                                     y_pred=model_0_predict)
model_0_results

# Preparing data for deep learning model


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

## Preparing text vectorizer and embedding for sentence level

In [None]:
# Upper bound for the vectorizer vocabulary: the number of distinct
# whitespace-separated tokens in the TRAINING sentences.
# (Iterating df['preprocess'] directly yields whole sentences, so the previous
# value was the number of unique sentences, not words.)
words = []

for sentence in train_sentences:
    words.extend(sentence.split())

# +2 because TextVectorization reserves one slot for padding and one for out-of-vocabulary tokens.
max_tokens = len(set(words)) + 2

In [None]:
# Create text vectorizer
# TextVectorization is a stable layer in recent TensorFlow releases; the
# `experimental.preprocessing` path only exists in older ones, so try the
# stable location first and fall back.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens=max_tokens, # cap on vocabulary size
                                    standardize="lower",
                                    split="whitespace",
                                    output_sequence_length=128) # every sentence is padded/truncated to 128 token ids

In [None]:
# Build the vocabulary from the training sentences only (validation/test text is not seen here)
text_vectorizer.adapt(train_sentences)

In [None]:
# Learned vocabulary; its length sizes the token embedding tables below.
vocab = text_vectorizer.get_vocabulary()
len(vocab)

In [None]:
# Create token embedding layer
# NOTE(review): this single layer instance is reused by model_1, model_2, model_5 and
# model_6, so those models share (and keep training) the same embedding weights.
token_embed = layers.Embedding(input_dim=len(vocab), # length of vocabulary
                               output_dim=128, # Note: different embedding sizes result in drastically different numbers of parameters to train
                               # Use masking to handle variable sequence lengths (save space)
                               mask_zero=True,
                               name="token_embedding") 

### Preparing sentence

In [None]:
# How long is each training sentence on average (in whitespace-separated tokens)?
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
avg_sent_len # return average sentence length (in tokens)

In [None]:
# Histogram of training sentence lengths (trailing ';' hides the matplotlib return value)
import matplotlib.pyplot as plt
plt.hist(sent_lens, bins=10);

In [None]:
# How long of a sentence covers 95% of the lengths?
# NOTE(review): output_seq_len is informational only in the visible notebook;
# text_vectorizer was created with a fixed output_sequence_length of 128.
output_seq_len = int(np.percentile(sent_lens, 95))
output_seq_len

### Preparing labels one hot

In [None]:
import tensorflow as tf

# One-hot encode the labels; the depth is the number of distinct target values in df.
train_labels_one_hot = tf.one_hot(train_df['target'].to_numpy(), depth=len(set(df['target'])))
val_labels_one_hot = tf.one_hot(val_df['target'].to_numpy(), depth=len(set(df['target'])))
test_labels_one_hot = tf.one_hot(test_df['target'].to_numpy(), depth=len(set(df['target'])))

train_labels_one_hot

In [None]:
# Turn our data into TensorFlow Datasets of (sentence string, one-hot label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset

In [None]:
# Batch (32 per step) and prefetch each dataset; order is preserved because nothing is shuffled here.
# NOTE(review): this cell reassigns its own inputs, so re-running it batches the data a second time.
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset

## Model_1: Sentence level Conv1D

In [None]:
# Create 1D convolutional model to process sequences
# Pipeline: raw string -> token ids -> token embeddings -> 2 x (Conv1D, max-pool, dropout)
# -> global average pool -> softmax over the target classes.
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs) # vectorize text inputs
token_embeddings = token_embed(text_vectors) # create embedding
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
x = layers.MaxPooling1D(pool_size=2, strides=2, padding='same')(x)
x = layers.Dropout(0.2)(x)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(x)
x = layers.MaxPooling1D(pool_size=2, strides=2, padding='same')(x)
x = layers.Dropout(0.2)(x)
x = layers.GlobalAveragePooling1D()(x) # condense the output of our feature vector
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(len(set(df['target'])), activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs)

# Compile
model_1.compile(loss="categorical_crossentropy", # if your labels are integer form (not one hot) use sparse_categorical_crossentropy
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_1.summary()

In [None]:
# Fit the model; training stops once val_accuracy has not improved for 5 consecutive epochs.
model_1_history = model_1.fit(train_dataset,
                              epochs=100,
                              validation_data=valid_dataset,
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)]) # validation runs on the whole validation dataset

# NOTE(review): restore_best_weights is not set, so this scores the last-epoch weights, not the best ones.
model_1.evaluate(valid_dataset)

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_1_pred_probs = model_1.predict(test_dataset)

# Convert pred probs to classes
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)

# Calculate model_1 results on the test split
model_1_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_1_preds)
model_1_results

## Model_2: Sentence level with LSTM model

In [None]:
# Pipeline: raw string -> token ids -> token embeddings -> bidirectional LSTM (full sequence)
# -> LSTM (last state) -> dropout -> softmax over the target classes.
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs) # vectorize text inputs
token_embeddings = token_embed(text_vectors) # create embedding (same layer instance as model_1)
x = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(token_embeddings)
x = layers.LSTM(32)(x)
x = layers.Dropout(0.4)(x)
outputs = layers.Dense(len(set(df['target'])), activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs)

# Compile
model_2.compile(loss="categorical_crossentropy", # if your labels are integer form (not one hot) use sparse_categorical_crossentropy
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_2.summary()

In [None]:
# Fit the model; training stops once val_accuracy has not improved for 5 consecutive epochs.
model_2_history = model_2.fit(train_dataset,
                              epochs=100,
                              validation_data=valid_dataset,
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)])
model_2.evaluate(valid_dataset)

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_2_pred_probs = model_2.predict(test_dataset)

# Convert pred probs to classes
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)

# Calculate model_2 results on the test split
model_2_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_2_preds)
model_2_results

## Model_3: Sentence level with transformer

In [None]:
class TransformerBlock(layers.Layer):
    """Single transformer encoder block: self-attention + feed-forward, each with
    dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: size of the incoming embeddings; also the attention key size
            and the output size of the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate applied after attention and after the feed-forward network.
        **kwargs: forwarded to `layers.Layer` (e.g. `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        `training` defaults to None so the layer can be called as `block(x)`
        without relying on Keras to inject the argument.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of positions in the position embedding table.
        vocab_size: number of rows in the token embedding table.
        embed_dim: size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Positions 0..L-1, where L is the size of the last axis of x.
        position_vectors = self.pos_emb(
            tf.range(start=0, limit=tf.shape(x)[-1], delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
vocab_size = len(vocab)  # Size of the vocabulary learned by text_vectorizer
maxlen = 128  # Sequence length; equals the output_sequence_length text_vectorizer was created with
embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 16  # Hidden layer size in feed forward network inside transformer

# This model takes already-vectorized token ids (not raw strings) as input.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.4)(x)
outputs = layers.Dense(len(set(df['target'])), activation="softmax")(x)

model_3 = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile
model_3.compile(loss="categorical_crossentropy", # if your labels are integer form (not one hot) use sparse_categorical_crossentropy
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_3.summary()

In [None]:
# Fit the model on token ids produced up front by text_vectorizer
# (no batch_size is passed, so Keras uses its default).
model_3_history = model_3.fit(text_vectorizer(train_sentences),
                              train_labels_one_hot,
                              epochs=100,
                              validation_data=(text_vectorizer(val_sentences), val_labels_one_hot),
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)])
model_3.evaluate(text_vectorizer(val_sentences), val_labels_one_hot)

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_3_pred_probs = model_3.predict(text_vectorizer(test_sentences))


# Convert pred probs to classes
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)

# Calculate model_3 results on the test split
model_3_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_3_preds)
model_3_results

## Model_4: Chars level with Conv1D

In [None]:
# Make function to split sentences into characters
def split_chars(text):
  """Return `text` with a single space inserted between every pair of characters."""
  # A str is already an iterable of its characters, so it can be joined directly.
  return " ".join(text)

In [None]:
# Split sequence-level data splits into character-level data splits
# (each sentence becomes its characters separated by spaces)
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]
print(train_chars[0])

In [None]:
# What's the average character length?
# len() of the un-split sentence counts every character, spaces included.
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

In [None]:
# Check the distribution of our sequences at character-level (trailing ';' hides the matplotlib return value)
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7);

In [None]:
# Find what character length covers 95% of sequences; used as the char vectorizer's output length
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

In [None]:
# Create char-level token vectorizer instance
# The vocabulary is capped at 10000 entries; punctuation is stripped before splitting.
char_vectorizer = TextVectorization(max_tokens=10000,  
                                    output_sequence_length=output_seq_char_len,
                                    standardize="lower_and_strip_punctuation",
                                    name="char_vectorizer")

# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [None]:
# Check character vocabulary characteristics
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
# The slices below take the first and the last five vocabulary entries.
print(f"5 most common characters: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

In [None]:
import random

# Test out character vectorizer
random_train_chars = random.choice(train_chars)
print(f"Charified text:\n{random_train_chars}")
print(f"\nLength of chars: {len(random_train_chars.split())}")
vectorized_chars = char_vectorizer([random_train_chars])
print(f"\nVectorized chars:\n{vectorized_chars}")
print(f"\nLength of vectorized chars: {len(vectorized_chars[0])}")

# Create char embedding layer
# Size the table from the learned vocabulary: char_vectorizer may emit ids up to
# len(char_vocab) - 1 (its cap is 10000), so a fixed 1000 rows can be too few.
char_embed = layers.Embedding(input_dim=len(char_vocab), # number of different characters
                              output_dim=25, # embedding dimension of each character (same as Figure 1 in https://arxiv.org/pdf/1612.05251.pdf)
                              mask_zero=False, # don't use masks (this messes up model_5 if set to True)
                              name="char_embed")

# Test out character embedding layer
print(f"Charified text (before vectorization and embedding):\n{random_train_chars}\n")
char_embed_example = char_embed(char_vectorizer([random_train_chars]))
print(f"Embedded chars (after vectorization and embedding):\n{char_embed_example}\n")
print(f"Character embedding shape: {char_embed_example.shape}")

In [None]:
# Make Conv1D on chars only
# Pipeline: space-separated characters -> char ids -> char embeddings -> Conv1D
# -> global max pool -> softmax over the target classes.
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(len(set(df['target'])), activation="softmax")(x)
model_4 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name="model_4_conv1D_char_embedding")  # name now matches the variable (was "model_3_...")

# Compile model
model_4.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# summary() takes no required arguments; the stray positional 0 was dropped.
model_4.summary()

In [None]:
# Create char datasets: (space-separated characters, one-hot label), batched by 32 and prefetched.
# Only train and validation datasets are built in this cell.
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

train_char_dataset

In [None]:
# Fit the character-level model on the CHARACTER datasets.
# (It was previously fed train_dataset/valid_dataset, which hold whole-word
# sentences, so the char vectorizer never saw single characters.)
model_4_history = model_4.fit(train_char_dataset,
                              epochs=100,
                              validation_data=val_char_dataset,
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)])
model_4.evaluate(val_char_dataset)

In [None]:
# The character model must be scored on character-split test text, not on the
# token-level test_dataset; build that dataset here.
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

# Make predictions (our model outputs prediction probabilities for each class)
model_4_pred_probs = model_4.predict(test_char_dataset)

# Convert pred probs to classes
model_4_preds = tf.argmax(model_4_pred_probs, axis=1)

# Calculate model_4 results on the test split
model_4_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_4_preds)
model_4_results

## Model_5: Combined sentences model and chars level

In [None]:
# 1. Setup token inputs/model (reuses the shared text_vectorizer and token_embed layers)
token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_input")
token_embeddings = token_embed(text_vectorizer(token_inputs))
x = layers.GlobalAveragePooling1D()(token_embeddings)
x = layers.Dense(128, activation="relu")(x)
token_output = layers.Dropout(0.4)(x)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_output)
token_model.summary()
# 2. Setup char inputs/model (reuses the shared char_vectorizer and char_embed layers)
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(char_embeddings)
x = layers.LSTM(16)(x)
char_bi_lstm = layers.Dropout(0.4)(x) # bi-LSTM shown in Figure 1 of https://arxiv.org/pdf/1612.05251.pdf
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)
print('\n')
char_model.summary()
# 3. Concatenate token and char inputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, 
                                                                  char_model.output])

# 4. Create output layers - addition of dropout discussed in 4.2 of https://arxiv.org/pdf/1612.05251.pdf
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(200, activation="relu")(combined_dropout) # slightly different to Figure 1 due to different shapes of token/char embedding layers
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(len(set(df['target'])), activation="softmax")(final_dropout)

# 5. Construct model with char and token inputs; input order is (tokens, chars)
# NOTE(review): the Keras name below says "model_4" although the variable is model_5.
model_5 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer,
                         name="model_4_token_and_char_embeddings")
print('\n')
model_5.summary()

In [None]:
# Plot hybrid token and character model as a layer graph
from tensorflow.keras.utils import plot_model
plot_model(model_5)

In [None]:
# Compile token char model (labels are one-hot, hence categorical_crossentropy)
model_5.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(), # section 4.2 of https://arxiv.org/pdf/1612.05251.pdf mentions using SGD but we'll stick with Adam
                metrics=["accuracy"])

In [None]:
# Combine tokens and chars into a dataset; the tuple order (sentences, chars) matches model_5's inputs
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # make data
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # make labels
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # combine data and labels

# Batch and prefetch train data
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) 

# Repeat same steps validation data
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))

# Batch and prefetch validation data
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Repeat same steps test data
test_char_token_data = tf.data.Dataset.from_tensor_slices((test_sentences, test_chars))
test_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_char_token_dataset = tf.data.Dataset.zip((test_char_token_data, test_char_token_labels))

# Batch and prefetch test data
test_char_token_dataset = test_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Fit the model on tokens and chars; training stops once val_accuracy has not improved for 5 consecutive epochs
model_5_history = model_5.fit(train_char_token_dataset, # train on dataset of token and characters
                              epochs=100,
                              validation_data=val_char_token_dataset,
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)])
model_5.evaluate(val_char_token_dataset)

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_5_pred_probs = model_5.predict(test_char_token_dataset)

# Convert pred probs to classes
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)

# Calculate model_5 results on the test split
model_5_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_5_preds)
model_5_results

## Model_6: Combined token model + char model + one-hot encoded token count

In [None]:
# 98th percentile of the per-review token count in the training split
# NOTE(review): len_num is not used by any later cell in the visible notebook.
len_num = np.percentile(train_df.num_word, 98)
len_num

In [None]:
# One-hot depth for the token count. tf.one_hot only encodes indices 0..depth-1,
# so the depth must be the largest count + 1; with depth == max(num_word) the
# longest review was encoded as an all-zero vector.
max_num = max(df['num_word']) + 1

In [None]:
# Use TensorFlow to create one-hot-encoded tensors of the "num_word" column (tokens per review)
train_num_word_one_hot = tf.one_hot(train_df["num_word"].to_numpy(), depth=max_num)
val_num_word_one_hot = tf.one_hot(val_df["num_word"].to_numpy(), depth=max_num)
test_num_word_one_hot = tf.one_hot(test_df["num_word"].to_numpy(), depth=max_num)

# Check shape and first samples of the token-count one-hot tensor
train_num_word_one_hot.shape, train_num_word_one_hot[:10]

In [None]:
# 1. Setup token inputs/model (reuses the shared text_vectorizer and token_embed layers)
token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_input")
token_embeddings = token_embed(text_vectorizer(token_inputs))
x = layers.Conv1D(64, kernel_size=2, padding="same", activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(128, activation="relu")(x)
token_output = layers.Dropout(0.2)(x)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_output)

# 2. Setup char inputs/model (reuses the shared char_vectorizer and char_embed layers)
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(char_embeddings)
x = layers.LSTM(16)(x)
char_bi_lstm = layers.Dropout(0.2)(x) # bi-LSTM shown in Figure 1 of https://arxiv.org/pdf/1612.05251.pdf
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# 3. Setup num word inputs/model: the one-hot token count goes through one dense layer
# NOTE(review): the Keras input name is still "total_lines_input" although it carries the token count.
num_word_inputs = layers.Input(shape=(max_num,), dtype=tf.int32, name="total_lines_input")
y = layers.Dense(128, activation="relu")(num_word_inputs)
y = layers.Dropout(0.2)(y)
num_word_model = tf.keras.Model(inputs=num_word_inputs,
                                  outputs=y)



# 4. Concatenate token and char outputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, 
                                                                  char_model.output])

z = layers.Dense(128, activation="relu")(token_char_concat)
z = layers.Dropout(0.2)(z)


# 5. Concatenate the token-count branch with the token/char hybrid
z = layers.Concatenate(name="token_char_positional_embedding")([num_word_model.output,
                                                                z])

# 6. Create output layer
output_layer = layers.Dense(len(set(df['target'])), activation="softmax", name="output_layer")(z)

# 7. Put together model; input order is (token count, tokens, chars)
model_6 = tf.keras.Model(inputs=[num_word_model.input,
                                 token_model.input, 
                                 char_model.input],
                         outputs=output_layer)

model_6.summary()

In [None]:
# # Plot hybrid token and character model
# import matplotlib.pyplot as plt
# from tensorflow.keras.utils import plot_model

# plot_model(model_6)

In [None]:
# Compile the token + char + token-count model (labels are one-hot, hence categorical_crossentropy)
model_6.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(), # section 4.2 of https://arxiv.org/pdf/1612.05251.pdf mentions using SGD but we'll stick with Adam
                metrics=["accuracy"])

In [None]:
# Create training, validation and test datasets with all three inputs;
# the tuple order (token count, tokens, chars) matches model_6's inputs.
train_pos_char_token_data = tf.data.Dataset.from_tensor_slices((train_num_word_one_hot, # one-hot token count
                                                                train_sentences, # train tokens
                                                                train_chars)) # train chars
train_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # train labels
train_pos_char_token_dataset = tf.data.Dataset.zip((train_pos_char_token_data, train_pos_char_token_labels)) # combine data and labels
train_pos_char_token_dataset = train_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # turn into batches and prefetch appropriately

# Validation dataset
val_pos_char_token_data = tf.data.Dataset.from_tensor_slices((val_num_word_one_hot,
                                                              val_sentences,
                                                              val_chars))
val_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_pos_char_token_dataset = tf.data.Dataset.zip((val_pos_char_token_data, val_pos_char_token_labels))
val_pos_char_token_dataset = val_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # turn into batches and prefetch appropriately

# Test dataset
test_pos_char_token_data = tf.data.Dataset.from_tensor_slices((test_num_word_one_hot,
                                                              test_sentences,
                                                              test_chars))
test_pos_char_token_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_pos_char_token_dataset = tf.data.Dataset.zip((test_pos_char_token_data, test_pos_char_token_labels))
test_pos_char_token_dataset = test_pos_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # turn into batches and prefetch appropriately

# Check input shapes
train_pos_char_token_dataset, val_pos_char_token_dataset, test_pos_char_token_dataset

In [None]:
# Fit the token, char and token-count model; training stops once val_accuracy has not improved for 5 consecutive epochs
history_model_6 = model_6.fit(train_pos_char_token_dataset,
                              epochs=100,
                              validation_data=val_pos_char_token_dataset,
                              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)])
# NOTE(review): this evaluates on the TEST dataset, whereas model_1, model_2, model_3 and model_5
# evaluate on their validation data right after fitting.
model_6.evaluate(test_pos_char_token_dataset)

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_6_pred_probs = model_6.predict(test_pos_char_token_dataset)

# Convert pred probs to classes
model_6_preds = tf.argmax(model_6_pred_probs, axis=1)

# Calculate model_6 results on the test split
model_6_results = calculate_results(y_true=test_df['target'],
                                    y_pred=model_6_preds)
model_6_results

# Compare results

In [None]:
# Combine model results into a DataFrame (one column per model, then transposed to one row per model)
# Note: calculate_results reports accuracy on a 0-100 scale and precision/recall/f1 on a 0-1 scale.
all_model_results = pd.DataFrame({"baseline": model_0_results,
                                  "token level Conv1D": model_1_results,
                                  "token level with LSTM": model_2_results,
                                  "token level with transformer": model_3_results,
                                  "chars level with Conv1D": model_4_results,
                                  "combined token level and chars level": model_5_results,
                                  "combined token level, chars level and number token in one_hot": model_6_results})
all_model_results = all_model_results.transpose()
all_model_results