## Import libraries

In [55]:
import os
import string
from typing import Tuple

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import pad_sequences
from keras.preprocessing import text
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, LSTM, Bidirectional

import giskard
from giskard import Dataset, Model

## Define constants

In [56]:
# Constants.
# Names of the text feature column and of the binary target column in the dataset.
TEXT_COLUMN_NAME = "headline"
TARGET_COLUMN_NAME = "is_sarcastic"

# English stop words from NLTK; clean_text() drops them from every headline.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
STOPWORDS = stopwords.words('english')

# Vocabulary cap handed to the Keras tokenizer (num_words).
MAX_TOKENS = 25000
# Size of each word vector (output_dim of the Embedding layer).
EMBEDDING_DIM = 200
# Target length of every token sequence (maxlen of pad_sequences).
MAX_SEQUENCE_LENGTH = 200

# Paths.
DATA_DIR = os.path.join(".", "datasets", "sarcasm_classification_dataset")
# Text file with pre-trained word vectors, read by get_embeddings_matrix().
EMBEDDING_FILE = os.path.join(DATA_DIR, "glove_200d.txt")

## Load data

In [57]:
def load_data(**kwargs) -> pd.DataFrame:
    """Read the sarcasm headlines dataset from DATA_DIR.

    Any keyword arguments are forwarded to ``pd.read_json`` (the file is read
    in JSON-lines mode). The ``article_link`` column is dropped from the result.
    """
    print(f"Loading data...")

    dataset_path = os.path.join(DATA_DIR, "Sarcasm_Headlines_Dataset_v2.json")
    headlines = pd.read_json(dataset_path, lines=True, **kwargs).drop(columns="article_link")

    print(f"Finished loading data! \nShape: {headlines.shape} \nColumns: {headlines.columns.tolist()}")

    return headlines

# Read only the first 2000 lines of the file; nrows is forwarded to pd.read_json.
sarcasm_df = load_data(nrows=2000)

Loading data...
Finished loading data! 
Shape: (2000, 2) 
Columns: ['is_sarcastic', 'headline']


## Define text preprocessing logic

In [58]:
def clean_text(df: pd.DataFrame) -> pd.Series:
    """Perform text-data cleaning: punctuation and stop words removal.

    The input frame is left untouched: the cleaned text is returned as a new
    Series instead of being written back into ``df``. This matters because the
    function is called both on slices produced by the train-test split and on
    frames handed in by giskard through ``preprocessing_function``.

    Args:
        df: Frame containing the ``TEXT_COLUMN_NAME`` column with raw sentences.

    Returns:
        Series of cleaned sentences with the same index and name as the
        ``TEXT_COLUMN_NAME`` column of ``df``.
    """
    # Build the lookup structures once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(STOPWORDS)

    def _clean_sentence(sentence: str) -> str:
        # Remove punctuation first, then drop stop words (case-insensitive match).
        without_punctuation = sentence.translate(punctuation_table)
        return ' '.join(_word for _word in without_punctuation.split() if _word.lower() not in stop_words)

    return df[TEXT_COLUMN_NAME].apply(_clean_sentence)

## Train-test split

In [70]:
# Split the text feature (kept as a one-column DataFrame) and the target into train and test parts.
# random_state=0 fixes the shuffle; no test_size is given, so scikit-learn's default applies.
X_train, X_test, Y_train, Y_test = train_test_split(sarcasm_df[[TEXT_COLUMN_NAME]], sarcasm_df[TARGET_COLUMN_NAME], random_state=0)

## Wrap data with giskard

In [71]:
# Wrap the test split while it still holds the raw, uncleaned headlines. Copies are taken so that
# the cleaning applied to X_test later in the notebook cannot alter the wrapped data.
raw_data = pd.concat([X_test.copy(), Y_test.copy()], axis=1)
wrapped_data = Dataset(raw_data, name="sarcasm", target=TARGET_COLUMN_NAME, column_types={TEXT_COLUMN_NAME: "text"})

Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."


## Preprocess text

In [72]:
# NOTE(review): X_train / X_test are rebound from one-column DataFrames to the cleaned text Series,
# so this cell cannot be re-run on its own; re-run the train-test split first.
print(f"Cleaning train data...")
X_train = clean_text(X_train)

print(f"Cleaning test data...")
X_test = clean_text(X_test)

print(f"Finished cleaning!")

Cleaning train data...
Cleaning test data...
Finished cleaning!


In [73]:
# Fit tokenizer.
# The vocabulary is learned from the cleaned training headlines only.
tokenizer = text.Tokenizer(num_words=MAX_TOKENS)
tokenizer.fit_on_texts(X_train)

# Tokenize train text.
# NOTE(review): X_train is rebound again here, from cleaned text to the padded integer matrix
# that is later passed to model.fit().
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=MAX_SEQUENCE_LENGTH)

# Tokenize test text.
# The tokenizer fitted on the train split is reused; X_test is rebound in the same way.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=MAX_SEQUENCE_LENGTH)

In [81]:
# Number of rows of the embedding matrix and input_dim of the Embedding layer.
# The +1 presumably leaves room for index 0, which the Keras tokenizer does not assign to any word - TODO confirm.
num_embeddings = min(MAX_TOKENS, len(tokenizer.word_index)) + 1

## Define preprocessing function

In [74]:
def preprocessing_function(df: pd.DataFrame) -> np.ndarray:
    """Turn a frame of raw headlines into padded token sequences for the model.

    Used by giskard as the model's data preprocessing step: the text is cleaned,
    mapped to token ids with the fitted tokenizer and padded to MAX_SEQUENCE_LENGTH.
    """
    token_sequences = tokenizer.texts_to_sequences(clean_text(df))
    return pad_sequences(token_sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Create embeddings matrix

In [76]:
def parse_line(word: str, *arr: list) -> Tuple[str, np.ndarray]:
    """Split one unpacked embeddings-file line into its word and its vector.

    The first positional value is the word; every remaining value is one
    component of its glove embedding: (<word>, 0.66, 0.23, ...).
    The components are converted to a float32 array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

def init_embeddings_matrix(embeddings_dict: dict) -> np.ndarray:
    """Create the starting embedding matrix, one row per embedding vector.

    Rows are drawn from a normal distribution whose mean and standard deviation
    are taken over all values of the given embeddings. The matrix has
    ``num_embeddings`` rows and as many columns as the given vectors have components.
    """
    known_vectors = np.stack(list(embeddings_dict.values()))
    matrix_shape = (num_embeddings, known_vectors.shape[1])

    return np.random.normal(known_vectors.mean(), known_vectors.std(), matrix_shape)

def get_embeddings_matrix() -> np.ndarray:
    """Create matrix, where each row is an embedding of a specific word.

    Reads the word vectors from EMBEDDING_FILE, initialises a random matrix via
    ``init_embeddings_matrix`` and overwrites the row of every tokenizer word
    that has a vector in the file. Words without a vector keep their random row.

    Returns:
        Matrix whose row ``idx`` belongs to the tokenizer word with index ``idx``.
    """
    print(f"Building embeddings matrix...")

    # Load glove embeddings. The file is opened in a context manager so the handle is
    # always closed, and decoded as UTF-8 explicitly so reading does not depend on the
    # platform's default encoding.
    with open(EMBEDDING_FILE, encoding="utf-8") as embeddings_file:
        embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in embeddings_file)

    # Initialization of embeddings matrix.
    embeddings_matrix = init_embeddings_matrix(embeddings_dict)

    # Fill-in embeddings matrix with glove word vectors.
    for word, idx in tokenizer.word_index.items():
        # Indices at or above the vocabulary cap are skipped.
        if idx >= MAX_TOKENS:
            continue

        embedding_vector = embeddings_dict.get(word)

        if embedding_vector is not None:
            embeddings_matrix[idx] = embedding_vector

    print(f"Finished building embedding matrix!")

    return embeddings_matrix

# Build the matrix once; it is later passed to init_model() as the initial Embedding weights.
embed_matrix = get_embeddings_matrix()

Building embeddings matrix...
Finished building embedding matrix!


In [86]:
def init_model(weights) -> Sequential:
    """Build and compile the bidirectional-LSTM sarcasm classifier.

    Args:
        weights: Initial weights of the embedding layer; expected to be a matrix
            with ``num_embeddings`` rows and ``EMBEDDING_DIM`` columns.

    Returns:
        Compiled Sequential model with a single sigmoid output unit.
    """
    # Define model container.
    _model = Sequential()

    # Embedding layer. input_length is the length of the padded token sequences
    # (MAX_SEQUENCE_LENGTH), not the size of the embedding vectors.
    _model.add(Embedding(num_embeddings, output_dim=EMBEDDING_DIM, weights=[weights],
                         input_length=MAX_SEQUENCE_LENGTH, trainable=True))

    # LSTM stage.
    _model.add(Bidirectional(LSTM(units=128, recurrent_dropout=0.5, dropout=0.5)))

    # Dense stage.
    _model.add(Dense(1, activation='sigmoid'))

    # Build model.
    _model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['acc'])
    return _model

In [87]:
# Build the network with the prepared embedding matrix as initial weights and print its layer summary.
model = init_model(embed_matrix)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 200)          1106400   
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              336896    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1,443,553
Trainable params: 1,443,553
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Define hyperparameters.
epochs = 5
batch_size = 128

# Fit model.
# NOTE(review): the test split doubles as validation data here, so the validation metrics shown
# during training are computed on the same rows as the test accuracy reported afterwards.
_ = model.fit(X_train, Y_train, batch_size=batch_size , validation_data=(X_test, Y_test), epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the 'acc' metric.
test_metric = model.evaluate(X_test, Y_test)[1]
print(f"Test accuracy: {test_metric}")

Test accuracy: 0.6980000138282776


## Wrap model with giskard

In [90]:
# Wrap the trained Keras model for giskard. Raw headlines go in; preprocessing_function turns them
# into padded token sequences before they reach the network.
# classification_labels lists 1 first because the single sigmoid output is the probability of
# is_sarcastic == 1, and giskard attributes a one-column prediction to the first label.
wrapped_model = Model(model,
                      model_type="classification",
                      data_preprocessing_function=preprocessing_function,
                      name="sarcasm_classification",
                      feature_names=[TEXT_COLUMN_NAME],
                      classification_threshold=0.5,
                      classification_labels=[1, 0])

## Scan model

In [95]:
# Check that the wrapped model can predict on the wrapped raw test data before scanning.
wrapped_model.predict(wrapped_data)

Your binary classification model prediction is of the shape (500, 1). 
In Giskard we expect the shape (500, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None


ModelPredictionResults(raw=array([[6.67162687e-02, 9.33283746e-01],
       [3.59704299e-03, 9.96402979e-01],
       [8.12155426e-01, 1.87844574e-01],
       [5.31209679e-03, 9.94687915e-01],
       [7.03685954e-02, 9.29631412e-01],
       [1.60563260e-03, 9.98394370e-01],
       [4.92867175e-03, 9.95071352e-01],
       [9.99187648e-01, 8.12351704e-04],
       [1.76677329e-03, 9.98233199e-01],
       [1.99560076e-02, 9.80044007e-01],
       [1.26275659e-01, 8.73724341e-01],
       [4.45018262e-02, 9.55498159e-01],
       [5.09051792e-02, 9.49094832e-01],
       [5.42985741e-03, 9.94570136e-01],
       [9.88711836e-04, 9.99011278e-01],
       [8.21457896e-03, 9.91785407e-01],
       [3.33183892e-02, 9.66681600e-01],
       [9.97920930e-01, 2.07906961e-03],
       [9.96558070e-01, 3.44192982e-03],
       [1.88112468e-03, 9.98118877e-01],
       [5.53899584e-03, 9.94461000e-01],
       [1.01795723e-03, 9.98982072e-01],
       [1.10409048e-03, 9.98895884e-01],
       [9.41927910e-01, 5.8072

In [96]:
# Run the giskard scan of the wrapped model against the wrapped test data.
scanning_result = giskard.scan(wrapped_model, wrapped_data)

2023-05-16 19:01:48.222054: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,128]
	 [[{{node while/Placeholder_2}}]]
2023-05-16 19:01:48.377449: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,128]
	 [[{{node while/Placeholder_2}}]]
2023-05-16 19:01:48.431992: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and 

INFO:tensorflow:Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-fr_6he4o/data/model/assets
2023-05-16 19:02:03,485 pid:10647 MainThread tensorflow   INFO     Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-fr_6he4o/data/model/assets




Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your model is successfully validated.
Running scan…
Your binary classification model prediction is of the shape (500, 1). 
In Giskard we expect the shape (500, 2) for binary classification models. 
We automatically inferred the second class prediction but please

In [97]:
# Show the scan result in the notebook output.
display(scanning_result)