# Movie Review Sentiment Classification with DistilBERT [sklearn + torch preprocessing]
* Binary sentiment classification of movie reviews.  
* Reference notebook: <https://www.kaggle.com/code/atulanandjha/distillbert-extensive-tutorial-starter-kernel>  
* Dataset: <https://huggingface.co/datasets/SetFit/sst2/blob/main/train.jsonl>  

## Import libraries

In [128]:
import os
from pathlib import Path
from urllib.request import urlretrieve

import torch
import numpy as np
import pandas as pd
import transformers as ppb
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from giskard import Model, Dataset, scan, testing
from giskard.client.giskard_client import GiskardClient

## Define constants

In [129]:
# Constants.
TARGET_COLUMN = "label"
TEXT_COLUMN = "text"

PRETRAINED_WEIGHTS_NAME = "distilbert-base-uncased"

RANDOM_STATE = 0

# Paths.
# URL segments are always separated by "/". os.path.join uses the OS path separator
# (a backslash on Windows), which would yield an invalid URL, so join the parts explicitly.
DATA_URL = "/".join(
    (
        "ftp://sys.giskard.ai",
        "pub",
        "unit_test_resources",
        "movie_review_sentiment_classification_dataset",
        "train.jsonl",
    )
)
# Local cache location of the downloaded dataset (inside the user's home directory).
DATA_PATH = Path.home() / ".giskard" / "movie_review_sentiment_classification_dataset" / "train.jsonl"

## Dataset preparation

### Load data

In [130]:
def fetch_from_ftp(url: str, file: Path) -> None:
    """Download ``url`` to ``file`` unless ``file`` already exists.

    The payload is first written to a sibling ``<name>.part`` file and only moved to
    ``file`` once the download has finished. An interrupted download therefore never
    leaves a truncated ``file`` behind that a later run would mistake for valid data.

    Args:
        url: Location of the remote resource.
        file: Local destination; missing parent directories are created.

    Raises:
        Whatever ``urlretrieve`` raises when the download fails; the partial file is
        removed before the exception propagates.
    """
    # exist_ok=True already covers the "directory exists" case, no pre-check needed.
    file.parent.mkdir(parents=True, exist_ok=True)

    if not file.exists():
        print(f"Downloading data from {url}")
        partial_file = file.with_name(file.name + ".part")
        try:
            urlretrieve(url, partial_file)
            # Publish the complete download under its final name.
            partial_file.replace(file)
        finally:
            # Only still present when the download or the move failed.
            if partial_file.exists():
                partial_file.unlink()

    print("Data was loaded!")
    

def load_data(**kwargs) -> pd.DataFrame:
    """Fetch the dataset if needed and return it without the 'label_text' column.

    Any keyword arguments are forwarded to ``pd.read_json``.
    """
    fetch_from_ftp(DATA_URL, DATA_PATH)

    # The file is in JSON Lines format: one JSON record per line.
    raw_reviews = pd.read_json(DATA_PATH, lines=True, **kwargs)
    return raw_reviews.drop(columns="label_text")


# Full dataset (text + label); downloaded on first run, read from DATA_PATH afterwards.
reviews_df = load_data()

Data was loaded!


### Train-Test split

In [131]:
# Only the seed is fixed here; no test_size is passed, so scikit-learn's default split proportion applies.
train_df, test_df = train_test_split(reviews_df, random_state=RANDOM_STATE)

### Wrap dataset with Giskard

In [132]:
# Giskard wrapper around the held-out split.
wrapped_data = Dataset(
    # Raw pandas.DataFrame (before any pre-processing step) that also holds the ground truth column.
    df=test_df,
    # Optional display name.
    name="Movie reviews dataset",
    # Name of the ground truth column.
    target=TARGET_COLUMN,
)

## Model training

### Define preprocessing steps

In [None]:
# Load the pretrained DistilBERT encoder and its matching tokenizer; both are
# resolved from the same checkpoint name (PRETRAINED_WEIGHTS_NAME).
embedder = ppb.DistilBertModel.from_pretrained(PRETRAINED_WEIGHTS_NAME)
tokenizer = ppb.DistilBertTokenizer.from_pretrained(PRETRAINED_WEIGHTS_NAME)


def get_max_sequence_length(corpus: pd.Series) -> int:
    """Return the token count of the longest document in ``corpus``.

    Each document is tokenized with special tokens included, so the count
    covers them as well.
    """
    token_counts = [len(tokenizer.encode(text, add_special_tokens=True)) for text in corpus]
    return max(token_counts)


# Computed over the whole dataset (train and test rows alike); used as the fixed padded width of every tokenized batch.
max_sequence_length = get_max_sequence_length(reviews_df[TEXT_COLUMN])


def tokenize_documents(corpus: pd.Series) -> torch.Tensor:
    """Tokenize every document and pad/truncate it to ``max_sequence_length`` ids.

    Documents longer than ``max_sequence_length`` (for instance texts perturbed after
    the length was measured) are truncated by the tokenizer. Without truncation the
    padding length would be negative, the rows would have different lengths and the
    tensor could not be built.

    Args:
        corpus: Raw text documents.

    Returns:
        An integer tensor with one row per document and ``max_sequence_length``
        columns; id 0 is used as padding.
    """
    padded_rows = []
    for document in corpus:
        token_ids = tokenizer.encode(
            document,
            add_special_tokens=True,
            truncation=True,
            max_length=max_sequence_length,
        )
        padding = [0] * (max_sequence_length - len(token_ids))
        padded_rows.append(token_ids + padding)

    # Explicit dtype and shape keep an empty corpus a well-formed (0, width) integer matrix.
    return torch.tensor(padded_rows, dtype=torch.long).reshape(len(padded_rows), max_sequence_length)


def get_documents_embeddings(tokens_matrix: torch.Tensor, batch_size: int = 128) -> np.ndarray:
    """Calculate sentence embeddings using the DistilBERT model.

    Documents are sent through the embedder in chunks of ``batch_size`` rows, so peak
    memory is bounded by the batch size instead of growing with the dataset size.

    Args:
        tokens_matrix: Token ids, one row per document; id 0 is treated as padding.
        batch_size: Number of documents per forward pass. Must be positive.

    Returns:
        One row per document: the representation found at the first token position
        (the 'cls token'), which stands for the whole sentence.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    embedder.eval()
    cls_embeddings = []
    with torch.no_grad():
        # max(..., 1) keeps exactly one forward pass for an empty matrix.
        for start in range(0, max(len(tokens_matrix), 1), batch_size):
            batch = tokens_matrix[start:start + batch_size]
            # Padding positions (id 0) must not be attended to.
            attention_mask = torch.where(batch != 0, 1, 0)
            tokens_representations = embedder(batch, attention_mask=attention_mask)
            # Take just 'cls token' embeddings, which represent whole sentence embedding.
            cls_embeddings.append(tokens_representations[0][:, 0, :])

    return torch.cat(cls_embeddings).numpy()


def preprocess_text(df: pd.DataFrame) -> np.ndarray:
    """Turn the text column of ``df`` into document embeddings.

    The same function is reused inside the 'giskard.Model' prediction function.
    """
    documents = df[TEXT_COLUMN]
    tokens_matrix = tokenize_documents(documents)
    return get_documents_embeddings(tokens_matrix)


# Embed both splits once. The target is selected through TARGET_COLUMN rather than a
# hard-coded attribute, so the constant stays the single place naming the label column.
X_train, Y_train = preprocess_text(train_df), train_df[TARGET_COLUMN]
X_test, Y_test = preprocess_text(test_df), test_df[TARGET_COLUMN]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Build estimator

In [None]:
# Logistic regression (default settings) trained on the document embeddings.
classifier = LogisticRegression().fit(X_train, Y_train)

# Validate model: mean accuracy on the data it was fitted on and on the held-out split.
train_score = classifier.score(X_train, Y_train)
test_score = classifier.score(X_test, Y_test)

print(f"Train accuracy: {train_score: .2f}")
print(f"Test accuracy: {test_score: .2f}")

### Wrap model with Giskard

In [None]:
def prediction_function(df: pd.DataFrame) -> np.ndarray:
    """Return the classifier's class probabilities for the raw rows of ``df``.

    Pre-processing (tokenization + embedding) happens inside, so callers pass
    the untouched dataframe.
    """
    return classifier.predict_proba(preprocess_text(df))


# Giskard wrapper around the prediction function.
# classification_threshold is not passed and therefore keeps its default (0.5).
wrapped_model = Model(
    # Prediction function that encapsulates all the data pre-processing steps, so it
    # can be executed directly on the dataset used by the scan.
    model=prediction_function,
    # Either regression, classification or text_generation.
    model_type="classification",
    # Optional.
    name="movie_review_sentiment_classifier",
    # Default would be all columns of the dataset.
    feature_names=[TEXT_COLUMN],
    # Order MUST be identical to the prediction_function's output order.
    classification_labels=classifier.classes_.tolist(),
)

# Sanity check of the wrapper: wrapped_data is built from test_df, so its predictions
# are compared against Y_test.
Y_test_pred_wrapped = wrapped_model.predict(wrapped_data).prediction
wrapped_test_score = accuracy_score(Y_test, Y_test_pred_wrapped)
print(f"Wrapped test accuracy: {wrapped_test_score: .2f}")

In [None]:
# Run the Giskard scan on the wrapped model, using the wrapped test dataset.
results = scan(wrapped_model, wrapped_data)

In [None]:
# Show the scan results as the cell output.
display(results)

## Generate a test suite from the Scan
The objects produced by the scan can be used as fixtures to generate a test suite that integrates domain-specific issues. To create custom tests, refer to the Test your ML Model page.

In [None]:
# Build a test suite named "My first test suite" from the scan results, then execute it.
test_suite = results.generate_test_suite("My first test suite")
test_suite.run()

## Customize your suite by loading objects from the Giskard catalog

The Giskard open source catalog enables you to load:

* Tests such as metamorphic, performance, prediction & data drift, statistical tests, etc
* Slicing functions such as detectors of toxicity, hate, emotion, etc
* Transformation functions such as generators of typos, paraphrase, style tune, etc

For demo purposes, we will load a simple unit test (test_f1) that checks if the test F1 score is above the given threshold. For more examples of tests and functions, refer to the Giskard catalog.

In [None]:
# Register an F1 test (threshold 0.7) for the wrapped model and dataset, then call run() on what add_test returns.
test_suite.add_test(testing.test_f1(model=wrapped_model, dataset=wrapped_data, threshold=0.7)).run()

## Upload your suite to the Giskard server

Upload your suite to the Giskard server to:

* Compare models to decide which model to promote
* Debug your tests to diagnose the issues
* Create more domain-specific tests that integrate business feedback
* Share your results

In [None]:
# Uploading the test suite will automatically save the model, dataset, tests, slicing & transformation functions inside the Giskard UI server.
# Create a Giskard client after having installed the Giskard server (see documentation).

# Never store an API token in the notebook itself: anyone who can read the file could
# reuse it. The token (find it in Settings in the Giskard server) is read from the
# GISKARD_API_TOKEN environment variable instead.
token = os.environ.get("GISKARD_API_TOKEN")
if not token:
    raise RuntimeError(
        "Set the GISKARD_API_TOKEN environment variable to your Giskard API token "
        "(find it in Settings in the Giskard server)."
    )

# The same identifier is passed for all three create_project arguments and for the upload.
PROJECT_KEY = "movie_review_sentiment_classification"

client = GiskardClient(
    url="http://localhost:9000",  # URL of your Giskard instance
    token=token
)

my_project = client.create_project(PROJECT_KEY, PROJECT_KEY, PROJECT_KEY)

# Upload to the current project ✉️
test_suite.upload(client, PROJECT_KEY)

<div class="alert alert-info">
Connecting Google Colab with the Giskard server

If you are using Google Colab and you want to install the Giskard server **locally**, you can run the Giskard server by executing this line in the terminal of your **local** machine (see the [documentation](https://docs.giskard.ai/en/latest/guides/installation_app/index.html)):

> giskard server start

Once the Giskard server is running, from the same terminal on your **local** machine, you can run:

> giskard server expose --token \<ngrok_API_token\>

Read the following [instructions](https://docs.giskard.ai/en/latest/cli/ngrok/index.html) in order to get the `ngrok_API_token`. This will provide you with the code snippets that you can copy and paste into your Colab notebook to establish a connection with your locally installed Giskard server.
</div>