# 02-01 : Zero-Shot Text Classification

## References

- [Unlocking Zero-Shot Text Classification with Hugging Face’s Transformers](https://medium.com/@s.sadathosseini/unlocking-zero-shot-text-classification-with-hugging-faces-transformers-9e30de5c8455)
- [Aspect Mining Using Zero-Shot Classification](https://aiswaryaramachandran.medium.com/aspect-mining-using-zero-shot-classification-3190e8a89d68)
- [Exploring Hugging Face: Zero-Shot Classification](https://pub.aimind.so/exploring-hugging-face-zero-shot-classification-781ef3a18510)
- [Zero Shot Classification with Huggingface 🤗 + Sentence Transformers](https://sachin-abeywardana.medium.com/zero-shot-classification-with-huggingface-sentence-transformers-c6cd732de0e0)
- [Analyzing QAnon on Twitter with Zero-Shot Classification](https://towardsdatascience.com/analyzing-qanon-on-twitter-with-zero-shot-classification-13ad73d324fc)
- [MoritzLaurer/deberta-v3-large-zeroshot-v2.0](https://huggingface.co/MoritzLaurer/deberta-v3-large-zeroshot-v2.0)
- [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli)

### Interesting Models

- [FacebookAI/roberta-large-mnli](https://huggingface.co/FacebookAI/roberta-large-mnli) - fine-tuned on the Multi-Genre Natural Language Inference (MNLI) corpus.
- [MoritzLaurer/deberta-v3-large-zeroshot-v2.0](https://huggingface.co/MoritzLaurer/deberta-v3-large-zeroshot-v2.0)
- [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli)

In [None]:
import os
# NOTE(review): presumably switches TensorFlow to the legacy Keras
# implementation so the libraries imported in the following cells stay
# compatible — confirm. Keep this cell before those imports so the variable
# is already set when they are loaded.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [None]:
import pandas as pd
import numpy as np
from functools import partial
from typing import Dict, List
from pprint import pprint
from pqdm.threads import pqdm
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, jaccard_score, accuracy_score, f1_score
from transformers import pipeline

In [None]:
# root folder that holds all project data
data_path = '../../data'

# folder that contains the labelled tweet splits as CSV files
input_path = f'{data_path}/input/labelled_tweets/csv_labels'

# one CSV file per split, all located directly in `input_path`
train_input_file, test_input_file, val_input_file = (
    f'{input_path}/{split}.csv' for split in ('train', 'test', 'val')
)

## 1. Load Data

In [None]:
# read the train / validation / test splits into data frames
df_train, df_val, df_test = (
    pd.read_csv(csv_file)
    for csv_file in (train_input_file, val_input_file, test_input_file)
)

# report the (rows, columns) shape of every split
print(f'Train shape: {df_train.shape}')
print(f'Val shape: {df_val.shape}')
print(f'Test shape: {df_test.shape}')

In [None]:
# preview the first rows of the training split as loaded from the CSV file
df_train.head()

## 2. Preprocessing

### 2.1. Labels to List

In [None]:
# split the space-separated `labels` string of every row into a list of
# labels and keep it in a new `labels_list` column (same for all splits)
for split_frame in (df_train, df_test, df_val):
    split_frame['labels_list'] = split_frame['labels'].str.split(' ')

### 2.2. Multi-label Binarization

In [None]:
# pool the label lists of all splits so that every label that occurs
# anywhere ends up in the vocabulary
labels = pd.concat([split.labels_list for split in (df_train, df_val, df_test)])

# create the MultiLabelBinarizer and learn the vocabulary in one step
labels_lookup = MultiLabelBinarizer().fit(labels)

# show the learned vocabulary
vocab = labels_lookup.classes_
print(f'Vocabulary size: {len(vocab)}')
print(f'Vocabulary: {vocab}')


In [None]:
# add a `labels_encoded` column to every split: one list of 0/1 indicators
# per row, ordered like the vocabulary of `labels_lookup`
for split_frame in (df_train, df_val, df_test):
    encoded = labels_lookup.transform(split_frame.labels_list)
    split_frame['labels_encoded'] = encoded.tolist()

In [None]:
def _with_label_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return `frame` joined with one 0/1 indicator column per label."""
    indicators = pd.DataFrame(
        labels_lookup.transform(frame.labels_list),
        columns=labels_lookup.classes_,
        index=frame.index,
    )
    return frame.join(indicators)


# add the one-hot encoded labels as columns to the data frames
df_train = _with_label_columns(df_train)
df_val = _with_label_columns(df_val)
df_test = _with_label_columns(df_test)

In [None]:
# preview the training split again, now including the `labels_list`,
# `labels_encoded` and per-label indicator columns added above
df_train.head()

## 3. Classification

### 3.1. Create Classifier

In [None]:
# checkpoint that backs the zero-shot classifier
model_name = 'facebook/bart-large-mnli'

# build the Hugging Face zero-shot classification pipeline
classifier = pipeline(task="zero-shot-classification", model=model_name)


### 3.2. Test Classifier 

In [None]:
# pick one training row (position 146) and keep only the columns that are
# needed to sanity-check the classifier
sample_columns = ['text', 'labels_list', 'labels_encoded']
sample_row = df_train.iloc[146][sample_columns]

pprint(sample_row.to_dict())

In [None]:
# perform classification on the sample row
# `vocab` is a numpy array; pass it as a plain list of strings, the same way
# the prediction cell further below calls the classifier
result = classifier(
    sequences=sample_row.text,
    candidate_labels=vocab.tolist(),
    hypothesis_template='This concern with the vaccine is about {}.',
    multi_label=True)

# the result holds the input sequence, the candidate labels and their scores
pprint(result)

### 3.3. Get Standardized Predictions

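The pipeline returns the labels sorted by score, so the scores are reordered to match the fixed order of the label vocabulary (`labels_lookup.classes_`). This makes them directly comparable with the one-hot encoded ground truth.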

In [None]:
def standardize_prediction(prediction: Dict, vocabulary:List[str]) -> List[float]:
    """
    Standardize the prediction output to a fixed length list.

    Args:
        prediction: Mapping with parallel `labels` and `scores` sequences,
            i.e. `scores[i]` is the score of `labels[i]`.
        vocabulary: Labels in the order the scores should be returned in.

    Returns:
        One score per entry of `vocabulary`, in vocabulary order.

    Raises:
        ValueError: If a vocabulary label is not part of the prediction.
    """
    # build the label -> score table once instead of scanning the label list
    # for every vocabulary entry; `setdefault` keeps the first occurrence of
    # a label, which is what `list.index` would have found
    score_by_label = {}
    for label, score in zip(prediction['labels'], prediction['scores']):
        score_by_label.setdefault(label, score)

    missing = [label for label in vocabulary if label not in score_by_label]
    if missing:
        raise ValueError(f'Labels missing from prediction: {missing}')

    return [score_by_label[label] for label in vocabulary]

## test the function
#standardize_prediction(result, vocab.tolist())

### 3.4. Get Predictions

In [None]:
def get_prediction(text:str,
                   classifier,
                   vocabulary:List[str],
                   hypothesis_template:str='This concern with the vaccine is about {}.') -> List[float]:
    """
    Get the prediction for a given text.

    Args:
        text: The text to classify.
        classifier: Callable zero-shot classifier that accepts `sequences`,
            `candidate_labels`, `hypothesis_template` and `multi_label`
            keyword arguments and returns a dict with `labels` and `scores`.
        vocabulary: Candidate labels; also defines the order of the result.
        hypothesis_template: Template with a `{}` placeholder that is filled
            with each candidate label. Defaults to the template used
            throughout this notebook.

    Returns:
        One score per vocabulary entry, in vocabulary order.
    """
    # materialize once so array-like vocabularies are handed to the
    # classifier as a plain list
    candidate_labels = list(vocabulary)

    result = classifier(
        sequences=text,
        candidate_labels=candidate_labels,
        hypothesis_template=hypothesis_template,
        multi_label=True)

    return standardize_prediction(result, candidate_labels)

## test the function
#get_prediction(sample_row.text, classifier, vocab.tolist())

In [None]:
def predict(X: List[str], vocabulary:List[str], classifier, n_jobs:int=1) -> List[List[float]]:
    """
    Predict the labels for a list of texts.

    Args:
        X: The texts to classify.
        vocabulary: Candidate labels; defines the order of each score list.
        classifier: Zero-shot classifier that is passed on to `get_prediction`.
        n_jobs: Number of parallel jobs. `1` runs sequentially with a
            progress bar, any other value is handed to `pqdm`.

    Returns:
        One list of scores per text, in the order of `X`.
    """
    if n_jobs == 1:
        return [get_prediction(text, classifier, vocabulary) for text in tqdm(X)]

    # bind the arguments that are the same for every text
    get_prediction_partial = partial(get_prediction, classifier=classifier, vocabulary=vocabulary)

    # honour the requested number of jobs (this was hard-coded to 5 before)
    # NOTE(review): check how pqdm reports exceptions raised by a worker; if
    # they are returned as result items they would end up in the predictions
    return pqdm(X, get_prediction_partial, n_jobs=n_jobs)
        
## test the function
# predict(
#     X=df_train[:5].text.tolist(), 
#     vocabulary=vocab.tolist(),
#     classifier=classifier,
#     n_jobs=2)

## 4. Evaluating the model

In [None]:
class Evaluation:
    """
    Collection of multi-label evaluation metrics.

    All methods are static; `y_true` and the binarized predictions are
    expected as 0/1 indicator rows with one column per label.
    """

    @staticmethod
    def _binarize(y_pred, threshold:float=0.5):
        """Convert scores to 0/1 indicators; a score must exceed `threshold` to count as 1."""
        return [[int(prob > threshold) for prob in pred] for pred in y_pred]

    @staticmethod
    def f1_score_macro(y_true, y_pred):
        """Calculate F1-score (Macro-Average)."""
        return f1_score(y_true, y_pred, average='macro', zero_division=0)

    @staticmethod
    def f1_score_weighted(y_true, y_pred):
        """Calculate F1-score (Weighted-Average)."""
        return f1_score(y_true, y_pred, average='weighted', zero_division=0)

    @staticmethod
    def jaccard_similarity(y_true, y_pred):
        """Calculate average Jaccard Similarity."""
        return jaccard_score(y_true, y_pred, average='samples')

    @staticmethod
    def subset_accuracy(y_true, y_pred):
        """Calculate Subset Accuracy (Exact Match Accuracy)."""
        return accuracy_score(y_true, y_pred)

    @staticmethod
    def evaluate_all(y_true,
                     y_pred,
                     threshold:float=0.5):
        """
        Evaluate all metrics and display a summary.

        Args:
            y_true: True 0/1 indicator rows.
            y_pred: Predicted scores, one row per sample.
            threshold: Scores strictly above this value count as predicted.
        """
        # Convert predictions to binary
        y_pred_bin = Evaluation._binarize(y_pred, threshold)

        f1_macro = Evaluation.f1_score_macro(y_true, y_pred_bin)
        f1_weighted = Evaluation.f1_score_weighted(y_true, y_pred_bin)
        jaccard_similarity = Evaluation.jaccard_similarity(y_true, y_pred_bin)
        subset_accuracy = Evaluation.subset_accuracy(y_true, y_pred_bin)

        # Display a summary of the evaluation
        print(f"F1 Score (Macro-Average)   \t{f1_macro:.3f}")
        print(f"F1 Score (Weighted-Average)\t{f1_weighted:.3f}")
        print(f"Average Jaccard Similarity \t{jaccard_similarity:.3f}")
        print(f"Subset Accuracy            \t{subset_accuracy:.3f}")

### 4.1. Perform Predictions

In [None]:
# the split that is evaluated
data = df_train

# ground truth: one row of label indicators per text, columns ordered like `vocab`
y_true = data[vocab].to_numpy().tolist()

# predicted scores: one list per text, ordered like `vocab`
y_pred = predict(
    X=data.text.tolist(),
    vocabulary=vocab.tolist(),
    classifier=classifier,
    n_jobs=2,
)

### 4.2. Classification Report

In [None]:
def show_classification_report(data:pd.DataFrame,
                               y_pred:np.ndarray,
                               threshold:float=0.5):
    """
    Print the per-label classification report for `data`.

    Args:
        data: Data frame that contains one indicator column per label of the
            module-level `vocab`.
        y_pred: Predicted scores, one row per row of `data`, with the columns
            ordered like `vocab`.
        threshold: Scores strictly above this value count as predicted.
    """
    # get the true labels
    y_true = data[vocab].values

    # Convert predictions to binary
    y_pred_bin = [[int(prob > threshold) for prob in pred] for pred in y_pred]

    # show the classification report; labels without any predicted or true
    # sample score 0 without a warning, matching the F1 metrics in `Evaluation`
    print(classification_report(y_true, y_pred_bin, target_names=vocab, zero_division=0))

In [None]:
# show the classification report for the split selected in `data`
# (`df_train` in the prediction cell above), using the default 0.5 threshold
show_classification_report(data, y_pred)

### 4.3. Full Report

In [None]:
# print macro / weighted F1, average Jaccard similarity and subset accuracy
# for the predictions, binarized at the default threshold of 0.5
Evaluation.evaluate_all(y_true, y_pred)