# Movie Review Sentiment Classification with DISTILL-BERT [sklearn + torch preprocessing]
Binary sentiment classification of movies' reviews.
Reference notebook: <https://www.kaggle.com/code/atulanandjha/distillbert-extensive-tutorial-starter-kernel>
Dataset: <https://huggingface.co/datasets/SetFit/sst2/blob/main/train.jsonl>

## Import libraries

In [1]:
import os
import warnings

import torch
import numpy as np
import pandas as pd
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import giskard
from giskard import Model, Dataset

## Notebook-level settings

In [2]:
warnings.filterwarnings('ignore')

## Define constants

In [3]:
# Constants.
TARGET_NAME = "label"
TEXT_FEATURE_NAME = "text"

PRETRAINED_WEIGHTS_NAME = "distilbert-base-uncased"

# Paths.
TRAIN_FILENAME = "train.jsonl"
DATA_DIRECTORY = os.path.join(".", "datasets", "movie_review_sentiment_classification_dataset")

## Load data

In [4]:
def load_data(**kwargs) -> pd.DataFrame:
    """Load data."""
    print(f"Loading data...")
    df = pd.read_json(os.path.join(DATA_DIRECTORY, TRAIN_FILENAME), lines=True, **kwargs)
    df = df.drop("label_text", axis=1)
    print(f"Data of shape: {df.shape} is loaded!")
    return df

reviews_df = load_data(nrows=2000)

Loading data...
Data of shape: (2000, 2) is loaded!


## Train-Test split

In [5]:
train_df, test_df = train_test_split(reviews_df, random_state=0)
print(f"Train size: {len(train_df)}\n"
      f"Test size: {len(test_df)}")

Train size: 1500
Test size: 500


## Load BERT model and Tokenizer

In [6]:
bert_model = ppb.DistilBertModel.from_pretrained(PRETRAINED_WEIGHTS_NAME)
tokenizer = ppb.DistilBertTokenizer.from_pretrained(PRETRAINED_WEIGHTS_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Define preprocessing function

In [7]:
def get_longest_sequence_length(df: pd.DataFrame) -> int:
    """Define a length of the longest tokenized sentence."""
    max_length = max(len(tokenizer.encode(row, add_special_tokens=True)) for row in df.text)
    return max_length

MAX_SEQUENCE_LENGTH = get_longest_sequence_length(reviews_df)
print(f"Longest sequence length: {MAX_SEQUENCE_LENGTH}")

Longest sequence length: 64


## Train-test split

In [8]:
def preprocessing_function(df: pd.DataFrame) -> np.ndarray:
    """Preprocessing function to be also used in 'giskard.Model'."""
    def _tokenize(_df: pd.DataFrame) -> np.ndarray:
        """Tokenization step."""
        _tokens_matrix = _df.text.apply(lambda x: tokenizer.encode(x, add_special_tokens=True)).values
        return _tokens_matrix

    def _pad(_tokens_matrix: np.ndarray) -> np.ndarray:
        """Padding with zeros step."""
        _padded_tokens_matrix = np.array([_tokens_sequence + [0] * (MAX_SEQUENCE_LENGTH - len(_tokens_sequence))
                                          for _tokens_sequence in _tokens_matrix])
        return _padded_tokens_matrix

    def _get_attention_matrix(_padded_tokens_matrix: np.ndarray) -> np.ndarray:
        """Calculate attention mask to ignore zero-padded tokens."""
        _attention_mask = np.where(_padded_tokens_matrix != 0, 1, 0)
        return _attention_mask

    def _get_bert_embeddings(_padded_tokens_matrix: np.ndarray, _attention_mask: np.ndarray) -> np.ndarray:
        """Calculate sentence embeddings using distill-BERT model."""
        # Prepare inputs for the torch model.
        _padded_tokens_matrix = torch.tensor(_padded_tokens_matrix)
        _attention_mask = torch.tensor(_attention_mask)

        # Perform BERT inference to get embeddings.
        with torch.no_grad():
            _last_hidden_states = bert_model(_padded_tokens_matrix, attention_mask=_attention_mask)

        # Take just 'cls token' embeddings, which represent whole sentence embedding for further classification.
        _embeddings = _last_hidden_states[0][:, 0, :].numpy()
        return _embeddings

    tokens_matrix = _tokenize(df)
    padded_tokens_matrix = _pad(tokens_matrix)
    attention_matrix = _get_attention_matrix(padded_tokens_matrix)
    embeddings = _get_bert_embeddings(padded_tokens_matrix, attention_matrix)

    return embeddings

In [9]:
print(f"Preprocessing train set...")
train_X, train_Y = preprocessing_function(train_df), train_df.label
print(f"Finished preprocessing train set!")

print(f"Preprocessing test set...")
test_X, test_Y = preprocessing_function(test_df), test_df.label
print(f"Finished preprocessing test set!")

Preprocessing train set...
Finished preprocessing train set!
Preprocessing test set...
Finished preprocessing test set!


## Build Logistic Regression estimator

In [10]:
print(f"Model training...")
classifier = LogisticRegression()
classifier.fit(train_X, train_Y)
print(f"Finished training!")

metric = classifier.score(test_X, test_Y)
print(f"Test accuracy: {metric}")

Model training...
Finished training!
Test accuracy: 0.82


## Wrap data and model and perform scanning

In [11]:
wrapped_data = Dataset(test_df, name="movie_reviews_sentiment", target=TARGET_NAME)

Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.


In [12]:
wrapped_model = Model(classifier,
                      model_type="classification",
                      data_preprocessing_function=preprocessing_function,
                      name="movie_review_sentiment_classifier",
                      feature_names=[TEXT_FEATURE_NAME],
                      classification_threshold=0.5)

Your 'model' is successfully wrapped by Giskard's 'SKLearnModel' wrapper class.


In [13]:
scanning_results = giskard.scan(wrapped_model, wrapped_data)

Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your model is successfully validated.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your 'pandas.DataFram

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.