In [None]:
# Import relevant libraries 

import re
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import top_k_accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix, mean_absolute_error
#from scikitplot.metrics import plot_roc

In [None]:
# Suppress every warning category so library notices do not clutter cell outputs.
warnings.filterwarnings(action="ignore")

# Use matplotlib's stock style sheet. The other available sheets are listed at
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
plt.style.use('default')

In [None]:
#Define configs

class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.SEED``)."""
    # Seed handed to the random number generators, the data split and the
    # weight initializer.
    SEED = 768
    # Number of samples per batch in the tf.data pipelines.
    BATCH_SIZE = 32
    # Epoch count (not referenced in the visible part of the notebook).
    EPOCHS = 10

In [None]:
def seed_everything(seed=CFG.SEED):
    """Seed Python's ``random``, NumPy and TensorFlow with one shared value.

    @params
        - seed: (int) -> value passed to each seeding function (default=CFG.SEED)
    """
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Seed once, right after the configuration is defined.
seed_everything(CFG.SEED)

In [None]:
# Define paths
# Every input file sits directly inside the dataset folder, so the file paths
# are derived from DATASET_PATH instead of repeating the prefix.
DATASET_PATH = "/kaggle/input/nlp-sentiments-analysis"
TRAIN_CSV = f'{DATASET_PATH}/train.csv'
TEST_CSV = f'{DATASET_PATH}/test.csv'
SAMPLE_SUB_CSV = f'{DATASET_PATH}/sample_submission.csv'

# Dataset Exploration

In [None]:
# Load the csv files (train, test and sample submission, in that order)
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_CSV, TEST_CSV, SAMPLE_SUB_CSV)
)

In [None]:
train_df.info()

In [None]:
train_df.head(5)

In [None]:
# view random selected data

def view_samples(df, count=5):
    """Print randomly chosen rows of ``df`` with their id, review and rating.

    @params
        - df: (pd.DataFrame) -> frame with 'Id', 'Review' and 'Rating' columns
        - count: (int) -> number of rows to print; capped at len(df) (default=5)
    """
    # Sample from the frame that was passed in rather than from the notebook's
    # global train_df, so the helper works for any frame with these columns.
    n_rows = min(count, len(df))
    sampled_labels = random.sample(df.index.to_list(), n_rows)

    separator = '=========================================\n'
    print(separator)
    for label in sampled_labels:
        print(f'id:\t{df.loc[label, "Id"]}\n')
        print(f'Review:\n{df.loc[label, "Review"]}\n')
        print(f'Rating:\n{df.loc[label, "Rating"]}')
        print(separator)

In [None]:
# view randomly selected data

view_samples(train_df, count=5)

In [None]:
train_distribution = train_df['Rating'].value_counts().sort_values()
train_distribution

In [None]:
# View Train Rating Distribution
# Number of reviews per rating, ordered from the rarest to the most common.
train_distribution = train_df['Rating'].value_counts().sort_values()

fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('Train Rating Distribution', fontsize=20)
sns.barplot(x=list(train_distribution.index),
            y=train_distribution.values,
            ax=ax);

sns.despine();

In [None]:
# Character count of each review. The vectorised .str accessor replaces the
# Python-level loop and yields NaN (instead of raising) for a missing review.
train_df['review_length'] = train_df['Review'].str.len()

# Number of whitespace-separated tokens per review
train_df['token_count'] = train_df['Review'].str.split().str.len()

In [None]:
train_df.head(5)

In [None]:
print(f"{train_df['review_length'].describe()}")

In [None]:
print(f"{train_df['token_count'].describe()}")

In [None]:
# Spread of review size per rating: characters (top panel) and tokens (bottom panel)

fig, (ax1, ax2) = plt.subplots(2, figsize=(14, 18))

# Set the spacing between subplots
fig.tight_layout(pad=6.0)

# One entry per panel: target axes, plotted column, panel title
boxplot_panels = (
    (ax1, 'review_length', 'Review Lengths per Rating'),
    (ax2, 'token_count', 'Token Counts per Rating'),
)
for panel_ax, column, panel_title in boxplot_panels:
    panel_ax.set_title(panel_title, fontsize=20)
    sns.boxplot(data=train_df, x='Rating', y=column, ax=panel_ax)
    panel_ax.set_xlabel('Rating', fontsize=14)
    panel_ax.set_ylabel(column, fontsize=14)
    sns.despine();

In [None]:
fig, (ax1, ax2) = plt.subplots(2, figsize=(14, 10))

# Set the spacing between subplots
fig.tight_layout(pad=6.0)

# One entry per panel: target axes, plotted column, panel title
# (review length in characters on top, token count below)
histogram_panels = (
    (ax1, 'review_length', 'Train Review Length Histogram'),
    (ax2, 'token_count', 'Train Token Count Histogram'),
)
for panel_ax, column, panel_title in histogram_panels:
    panel_ax.set_title(panel_title, fontsize=20)
    sns.histplot(data=train_df, x=column, bins=50, ax=panel_ax)
    panel_ax.set_xlabel(column, fontsize=14)
    panel_ax.set_ylabel('Count', fontsize=14)
    sns.despine();

# Data Preprocessing

In [None]:
# Label encode ratings
train_df["rating_encoded"] = train_df['Rating'] - 1

In [None]:
train_df.head(5)

In [None]:
# Stratified 80/20 split of the row index into training and validation parts.
# Only the index is split, so there are no label outputs to unpack and discard;
# the ratings are still used to stratify.
train_idx, val_idx = train_test_split(
    train_df.index,
    test_size=0.2,
    stratify=train_df.Rating,
    random_state=CFG.SEED,
)

In [None]:
val_idx

In [None]:
# train_idx / val_idx hold index *labels* taken from train_df.index, so rows are
# selected with label-based .loc. Positional .iloc only returns the same rows
# while the labels happen to equal the row positions.
train_new_df = train_df.loc[train_idx].reset_index(drop=True)
val_df = train_df.loc[val_idx].reset_index(drop=True)

In [None]:
train_new_df

In [None]:
# Train and Validation Rating Distribution
fig, (ax1, ax2) = plt.subplots(2, figsize=(14, 10))

# Set the spacing between subplots
fig.tight_layout(pad=6.0)

# Reviews per rating in each split, ordered from the rarest to the most common
train_new_distribution = train_new_df['Rating'].value_counts().sort_values()
val_distribution = val_df['Rating'].value_counts().sort_values()

# One entry per panel: target axes, rating counts, panel title
distribution_panels = (
    (ax1, train_new_distribution, 'New Train Ratings Distribution'),
    (ax2, val_distribution, 'Validation Ratings Distribution'),
)
for panel_ax, rating_counts, panel_title in distribution_panels:
    panel_ax.set_title(panel_title, fontsize=20)
    sns.barplot(x=rating_counts.values,
                y=list(rating_counts.index),
                orient="h",
                ax=panel_ax)
    sns.despine();

------------------------------------------------------------------------

# Build Input Data Pipeline with tf.data API

We'll use the tf.data API to build input data pipelines for training a model and conducting model inference. To do this, we'll first preprocess the reviews by removing artifacts such as emojis, non-ASCII characters and web links, and by replacing every digit with a `#` character. The preprocessed texts, together with the one-hot encoded ratings, are then used to construct the pipelines.

For more information on the tf.data API and on loading data from generators, follow these links:


* tf.data: Build TensorFlow input pipelines - https://www.tensorflow.org/guide/data
* Better performance with the tf.data API - https://www.tensorflow.org/guide/data_performance
* Using generators with the tf.data API - https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator

### Define Text Preprocessor

In [None]:
# The patterns are compiled once, when the cell runs, instead of on every call.

# -----------------------------------------------------
# Source: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+")
# -----------------------------------------------------
_NON_ASCII_PATTERN = re.compile(r"[^\x00-\x7F]+")
_DIGIT_PATTERN = re.compile(r"[0-9]")

# A link must start with an explicit scheme or "www." and runs up to the next
# whitespace. The previous pattern made the scheme optional and allowed spaces
# in the path, so ordinary prose such as "place.we loved it" was deleted as if
# it were a link.
_LINK_PATTERN = re.compile(r"(?:https?://|www\.)\S+", flags=re.IGNORECASE)


def text_preprocessor(text):
    """Strip links, emojis and non-ASCII characters and mask digits in a review.

    @params
        - text: (str) -> raw review text

    @returns
        - (str) -> cleaned text in which every ASCII digit is replaced by '#'

    Note: only links that start with http://, https:// or www. are removed;
    bare "name.tld" fragments are left untouched.
    """
    # Links are removed first: once digits have been masked with '#', a URL
    # containing digits no longer looks like a URL and would be left behind.
    cleaned = _LINK_PATTERN.sub('', text)
    # Remove emojis
    cleaned = _EMOJI_PATTERN.sub('', cleaned)
    # Remove the remaining non-ascii characters
    cleaned = _NON_ASCII_PATTERN.sub('', cleaned)
    # Replace every digit with a '#' sign
    cleaned = _DIGIT_PATTERN.sub('#', cleaned)

    return cleaned

In [None]:
# Generate Input Data pipelines

def encode_labels(labels, label_depth=5):
    """One-hot encode class ids.

    @params
        - labels: (array-like) -> class ids in the range [0, label_depth)
        - label_depth: (int) -> number of classes, i.e. width of each one-hot row (default=5)

    @returns
        - (np.ndarray) -> array with one one-hot row per label
    """
    # tf.one_hot is specified for integer index types, while callers in this
    # notebook pass float-typed ids (e.g. 3.0), so cast before encoding.
    class_ids = tf.cast(labels, tf.int32)
    return tf.one_hot(class_ids, depth=label_depth).numpy()

def create_pipeline(df, preprocessor, batch_size=32, shuffle=False, cache=None, prefetch=False):
    '''
    Generates an input pipeline using the tf.data API given a Pandas DataFrame and a text preprocessing function.
    
    @params
        - df: (pd.DataFrame) -> DataFrame with a 'Review' text column and a 'rating_encoded' label column
        - preprocessor (function) -> function applied to every review text before it enters the pipeline
        - batch_size: (int) -> number of samples per batch (default=32) 
        - shuffle: (bool) -> condition for data shuffling, data is shuffled when True (default=False)
        - cache: (str) -> cache path for caching data, data is not cached when None (default=None)
        - prefetch: (bool) -> condition for prefetching data, data is prefetched when True (default=False)
        
    @returns
        - dataset: (tf.data.Dataset) -> batched (review, one-hot rating) dataset used to train a TensorFlow model
    '''
    # Get preprocessed review texts and one-hot encoded ratings from the DataFrame.
    # Class ids are cast to integers because they are used as one-hot indices.
    reviews = df['Review'].apply(preprocessor).to_numpy().astype(str)
    ratings = encode_labels(df['rating_encoded'].to_numpy().astype(np.int32))
    
    # Create dataset with raw data from DataFrame
    ds = tf.data.Dataset.from_tensor_slices((reviews, ratings))
    
    # Apply caching based on condition, before shuffling and batching: caching
    # after them would store one fixed shuffle order and replay it every epoch.
    # Note: Use cache in memory (cache='') if the data is small enough to fit in memory!!!
    if cache is not None:
        ds = ds.cache(cache)
    
    # Apply shuffling based on condition. The buffer spans the whole dataset so
    # the shuffle is uniform (a shuffle buffer must hold at least one element).
    if shuffle:
        ds = ds.shuffle(buffer_size=max(len(reviews), 1))
        
    # Apply batching
    ds = ds.batch(batch_size)
    
    # Apply prefetching based on condition
    # Note: This will result in memory trade-offs
    if prefetch:
        ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    
    # Return the dataset
    return ds

In [None]:
# Create train input data pipeline
# Training examples are shuffled so that batches differ between epochs.
# NOTE: build a separate unshuffled pipeline if predictions ever need to be
# lined up with the rows of train_new_df.
train_ds = create_pipeline(
    train_new_df, text_preprocessor, 
    batch_size=CFG.BATCH_SIZE, 
    shuffle=True, prefetch=True
)

# Create validation input data pipeline
# The order stays fixed for evaluation; prefetching does not change the
# order of the batches.
val_ds = create_pipeline(
    val_df, text_preprocessor,
    batch_size=CFG.BATCH_SIZE, 
    shuffle=False, prefetch=True
)

In [None]:
# View string representation of datasets
print('========================================')
print('Train Input Data Pipeline:\n\n', train_ds)
print('========================================')
print('Validation Input Data Pipeline:\n\n', val_ds)
print('========================================')

# Baseline Model : Universal Sentence Encoder Model

In [None]:
# Here's a function to get any model/preprocessor from tensorflow hub
def get_tfhub_model(model_link, model_name, model_trainable=False):
    """Wrap a TensorFlow Hub model handle in a named ``hub.KerasLayer``.

    @params
        - model_link: (str) -> handle/URL passed to ``hub.KerasLayer``
        - model_name: (str) -> name given to the resulting layer
        - model_trainable: (bool) -> value of the layer's ``trainable`` flag (default=False)

    @returns
        - (hub.KerasLayer) -> the constructed layer
    """
    layer_options = {'trainable': model_trainable, 'name': model_name}
    return hub.KerasLayer(model_link, **layer_options)

### Get Universal Sentence Encoder

In [None]:
# Get Universal Sentence Encoder here
# -----------------------------------
# Note: the handle below points at the copy published on Kaggle's Models page:
#       https://www.kaggle.com/models/google/universal-sentence-encoder
#       The tfhub.dev handle is kept (commented out) as an alternative.
# -----------------------------------
encoder_link = 'https://kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2'
# encoder_link = 'https://tfhub.dev/google/universal-sentence-encoder/4'

encoder_name = 'universal_sentence_encoder'
encoder_trainable = False  # the layer is created with trainable=False

encoder = get_tfhub_model(
    model_link=encoder_link,
    model_name=encoder_name,
    model_trainable=encoder_trainable,
)

#### Build Model

In [None]:
def build_baseline_model(num_classes=5):
    """Assemble the baseline classifier on top of the notebook-level ``encoder`` layer.

    Layer order: string input -> encoder -> add an axis at position 1 ->
    bidirectional LSTM (128 units) -> dropout (0.25) -> dense (64, relu) ->
    dense softmax over ``num_classes``.

    @params
        - num_classes: (int) -> number of units in the softmax output layer (default=5)

    @returns
        - (tf.keras.Model) -> uncompiled model named 'use_model'
    """
    # A single seeded He-normal initializer is passed to the LSTM and both dense layers.
    he_init = tf.keras.initializers.HeNormal(seed=CFG.SEED)

    # Scalar string input: one raw review text per sample
    text_in = layers.Input(shape=[], dtype=tf.string, name='review_text_input')

    # Sentence embedding produced by the pretrained encoder
    features = encoder(text_in)

    # The LSTM consumes sequences, so the embedding gets an extra axis at
    # position 1 and is fed in as a one-step sequence.
    features = layers.Lambda(lambda embed: tf.expand_dims(embed, axis=1))(features)
    recurrent = layers.LSTM(128, kernel_initializer=he_init)
    features = layers.Bidirectional(recurrent, name='bidirection_lstm')(features)

    # Classification head
    features = layers.Dropout(0.25)(features)
    features = layers.Dense(64, activation='relu', kernel_initializer=he_init)(features)
    class_probs = layers.Dense(num_classes, activation='softmax',
                               kernel_initializer=he_init,
                               name='output_layer')(features)

    return tf.keras.Model(inputs=[text_in],
                          outputs=[class_probs],
                          name='use_model')

In [None]:
# Build model
model = build_baseline_model()

# View summary of model
model.summary()

In [None]:
# Sanity check: run the Universal Sentence Encoder directly on raw strings.
# tensorflow (tf) and tensorflow_hub (hub) are already imported in the
# notebook's first cell, so they are not imported a second time here.

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Ensure input is a list of strings
sentences = ["This is a test sentence.", "Another example sentence."]
embeddings = embed(sentences)  # Correct input format

print(embeddings.shape)  # Should output: (2, 512)