# Sarcasm classification [tensorflow]
* Binary classification of news headlines as sarcastic or not sarcastic.
* Reference notebook: <https://www.kaggle.com/code/madz2000/sarcasm-detection-with-glove-word2vec-83-accuracy/notebook#Introduction-to-GloVe>
* Dataset: <https://www.kaggle.com/code/madz2000/sarcasm-detection-with-glove-word2vec-83-accuracy/input>

## Install necessary libraries

In [None]:
!pip install nltk

## Import libraries

In [None]:
import os
import string
from typing import Tuple

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import pad_sequences
from keras.preprocessing import text
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, LSTM, Bidirectional

import giskard
from giskard import Dataset, Model

## Define constants

In [None]:
# Constants.
# Names of the text feature column and of the binary target column in the dataset.
TEXT_COLUMN_NAME = "headline"
TARGET_COLUMN_NAME = "is_sarcastic"

# English stop words removed during text cleaning.
# A fresh environment (only `pip install nltk`) has no corpora on disk, in which case
# NLTK raises LookupError: fetch the stopwords corpus once and retry.
try:
    STOPWORDS = stopwords.words('english')
except LookupError:
    import nltk

    nltk.download('stopwords')
    STOPWORDS = stopwords.words('english')

MAX_TOKENS = 25000  # Vocabulary cap passed to the tokenizer as `num_words`.
EMBEDDING_DIM = 200  # Width of one embedding vector (`output_dim` of the Embedding layer).
MAX_SEQUENCE_LENGTH = 200  # `maxlen` used when padding the tokenized headlines.

# Paths.
DATA_DIR = os.path.join(".", "datasets", "sarcasm_classification_dataset")
EMBEDDING_FILE = os.path.join(DATA_DIR, "glove_200d.txt")

## Load data

In [None]:
def load_data(**kwargs) -> pd.DataFrame:
    """Read the sarcasm headlines dataset into a DataFrame.

    The file is parsed as line-delimited JSON; any keyword arguments are forwarded
    to `pd.read_json` (e.g. `nrows`). The `article_link` column is dropped from
    the returned frame.
    """
    print("Loading data...")

    dataset_path = os.path.join(DATA_DIR, "Sarcasm_Headlines_Dataset_v2.json")
    headlines = pd.read_json(dataset_path, lines=True, **kwargs).drop(columns="article_link")

    print(f"Finished loading data! \nShape: {headlines.shape} \nColumns: {headlines.columns.tolist()}")

    return headlines

# Load the dataset, limited to 2000 rows via `nrows` (forwarded to `pd.read_json`).
sarcasm_df = load_data(nrows=2000)

## Define text preprocessing logic

In [None]:
def clean_text(df: pd.DataFrame) -> pd.Series:
    """Clean the text column: strip punctuation, then drop English stop words.

    The input frame is left untouched; a new Series is built and returned.

    Args:
        df: Frame containing the `TEXT_COLUMN_NAME` column of strings.

    Returns:
        Series (same index and name as the text column) holding the cleaned text.
    """
    # Build the lookup structures once per call instead of once per row:
    # a translation table that deletes punctuation, and a set for O(1) stop-word checks.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(STOPWORDS)

    def _clean(sentence: str) -> str:
        """Remove punctuation first, then filter out stop words (case-insensitively)."""
        without_punctuation = sentence.translate(punctuation_table)
        return ' '.join(word for word in without_punctuation.split() if word.lower() not in stop_words)

    # `Series.apply` produces a new Series, so the caller's DataFrame is never modified.
    return df[TEXT_COLUMN_NAME].apply(_clean)

## Train-test split

In [None]:
# Split into train and test sets with a fixed seed (scikit-learn's default split proportions).
# The features are kept as a one-column DataFrame because clean_text indexes the frame by column name.
X_train, X_test, Y_train, Y_test = train_test_split(sarcasm_df[[TEXT_COLUMN_NAME]], sarcasm_df[TARGET_COLUMN_NAME], random_state=0)

## Wrap data with giskard

In [None]:
# Snapshot the raw (uncleaned) test split together with its target column.
# Copies are taken so the later preprocessing of X_test / Y_test cannot affect the wrapped data.
raw_data = pd.concat([X_test.copy(), Y_test.copy()], axis=1)
# Wrap the frame as a giskard Dataset, declaring the headline column as a text feature.
wrapped_data = Dataset(raw_data, name="sarcasm", target=TARGET_COLUMN_NAME, column_types={TEXT_COLUMN_NAME: "text"})

## Preprocess text

In [None]:
# Run the text cleaning on both splits; each name is rebound to the cleaned text Series.
print("Cleaning train data...")
X_train = clean_text(X_train)

print("Cleaning test data...")
X_test = clean_text(X_test)

print("Finished cleaning!")

In [None]:
# Fit tokenizer on the training text only; `num_words` caps the vocabulary at MAX_TOKENS.
tokenizer = text.Tokenizer(num_words=MAX_TOKENS)
tokenizer.fit_on_texts(X_train)

# Number of rows to allocate in the embeddings matrix.
# +1 because token indexing starts at 1, so the largest index equals the vocabulary size.
num_tokens = min(MAX_TOKENS, len(tokenizer.word_index)) + 1

# Tokenize train text: words -> integer indices, then pad to MAX_SEQUENCE_LENGTH.
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=MAX_SEQUENCE_LENGTH)

# Tokenize test text with the tokenizer fitted on the train split.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=MAX_SEQUENCE_LENGTH)

## Define preprocessing function

In [None]:
def preprocessing_function(df: pd.DataFrame) -> np.ndarray:
    """Turn raw text rows into padded token sequences (used by giskard).

    Applies `clean_text`, encodes the result with the fitted `tokenizer`, and pads
    every sequence to `MAX_SEQUENCE_LENGTH`.
    """
    sequences = tokenizer.texts_to_sequences(clean_text(df))
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Create embeddings matrix

In [None]:
def parse_line(word: str, *arr: str) -> Tuple[str, np.ndarray]:
    """Parse one already-split line of the embeddings file.

    The first field is the word, the remaining fields are the components of its
    GloVe vector: (<word>, 0.66, 0.23, ...).

    Args:
        word: The token the embedding belongs to.
        *arr: Vector components, still as text fields taken from the file.

    Returns:
        A `(word, vector)` pair, where `vector` is a 1-D float32 array.
    """
    return word, np.asarray(arr, dtype='float32')

def init_embeddings_matrix(embeddings_dict: dict) -> np.ndarray:
    """Create a randomly initialised matrix with one row per token index.

    Entries are drawn from a normal distribution whose mean and standard deviation
    are computed over all values of the provided embedding vectors. The result has
    `num_tokens` rows and as many columns as one embedding vector has components.
    """
    known_vectors = np.stack(list(embeddings_dict.values()))
    vector_size = known_vectors.shape[1]

    return np.random.normal(
        loc=known_vectors.mean(),
        scale=known_vectors.std(),
        size=(num_tokens, vector_size),
    )

def get_embeddings_matrix() -> np.ndarray:
    """Create matrix, where each row is an embedding of a specific word.

    Row `i` holds the GloVe vector of the word the tokenizer mapped to index `i`.
    Rows of words that are missing from the GloVe file keep their random initialisation.

    Returns:
        Array with `num_tokens` rows, one per token index.
    """
    print("Building embeddings matrix...")

    # Load glove embeddings. The `with` block closes the file handle deterministically,
    # and the explicit encoding avoids depending on the platform default for non-ASCII tokens.
    with open(EMBEDDING_FILE, encoding="utf-8") as embeddings_file:
        embeddings_dict = dict(parse_line(*line.rstrip().split(' ')) for line in embeddings_file)

    # Initialization of embeddings matrix.
    embeddings_matrix = init_embeddings_matrix(embeddings_dict)

    # Fill-in embeddings matrix with glove word vectors.
    for word, idx in tokenizer.word_index.items():
        # Indices beyond the matrix size have no row to fill.
        if idx >= num_tokens:
            continue

        embedding_vector = embeddings_dict.get(word)

        if embedding_vector is not None:
            embeddings_matrix[idx] = embedding_vector

    print("Finished building embedding matrix!")

    return embeddings_matrix

# Build the embeddings matrix once; it is used as the initial weights of the model's Embedding layer.
embed_matrix = get_embeddings_matrix()

In [None]:
def init_model(weights) -> Sequential:
    """Build and compile the bidirectional-LSTM sarcasm classifier.

    Args:
        weights: Initial embeddings matrix for the Embedding layer
            (one row per token index, `EMBEDDING_DIM` columns).

    Returns:
        Compiled Keras `Sequential` model with a single sigmoid output.
    """
    # Define model container.
    _model = Sequential()

    # Embedding layer. `input_length` is the length of the padded input sequences,
    # so it must follow MAX_SEQUENCE_LENGTH (not the embedding width).
    _model.add(Embedding(num_tokens, output_dim=EMBEDDING_DIM, weights=[weights], input_length=MAX_SEQUENCE_LENGTH, trainable=True))

    # LSTM stage.
    _model.add(Bidirectional(LSTM(units=128, recurrent_dropout=0.5, dropout=0.5)))

    # Dense stage: one sigmoid unit for the binary target.
    _model.add(Dense(1, activation='sigmoid'))

    # Build model.
    _model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['acc'])
    return _model

In [None]:
# Instantiate the model with the prepared embeddings matrix and print its layer summary.
model = init_model(embed_matrix)
model.summary()

In [None]:
# Define hyperparameters.
epochs = 5
batch_size = 128

# Fit model.
_ = model.fit(X_train, Y_train, batch_size=batch_size , validation_data=(X_test, Y_test), epochs=epochs)

In [None]:
# `evaluate` returns the loss followed by the compiled metrics; index 1 is the accuracy ('acc').
train_metric = model.evaluate(X_train, Y_train)[1]
test_metric = model.evaluate(X_test, Y_test)[1]
print(f"Train accuracy: {train_metric}\n"
      f"Test accuracy: {test_metric}")

## Wrap model

In [None]:
# Wrap the trained Keras model for giskard; raw frames are passed through preprocessing_function first.
# NOTE(review): the model emits a single sigmoid value and was trained against `is_sarcastic`,
# i.e. it outputs the probability of label 1. Confirm which position of `classification_labels`
# the installed giskard version maps a single-column output to, so that [1, 0] is the right order.
wrapped_model = Model(model,
                      model_type="classification",
                      data_preprocessing_function=preprocessing_function,
                      name="sarcasm_classification",
                      feature_names=[TEXT_COLUMN_NAME],
                      classification_threshold=0.5,
                      classification_labels=[1, 0])

## Scan model

In [None]:
# Sanity check: run the wrapped model on the wrapped dataset before scanning.
wrapped_model.predict(wrapped_data)

In [None]:
# Run the giskard scan of the wrapped model against the wrapped dataset.
scanning_result = giskard.scan(wrapped_model, wrapped_data)

In [None]:
# Render the scan result in the notebook output (`display` is provided by the IPython/Jupyter session).
display(scanning_result)