# 02-02 : Zero Shot on Train Data

In [1]:
# Set the flag in its own cell so it is in place before the library imports
# of the next cell run.
# NOTE(review): presumably this makes TensorFlow fall back to the legacy
# Keras 2 implementation — confirm against the installed TF / transformers versions.
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [2]:
import pandas as pd
import numpy as np
from functools import partial
from typing import Dict, List
from pprint import pprint
from pqdm.threads import pqdm
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, jaccard_score, accuracy_score, f1_score
from transformers import pipeline

2024-05-20 08:36:55.506454: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 08:36:55.506482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 08:36:55.507390: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-20 08:36:55.511542: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# root data folder, given relative to the notebook's working directory
data_path = '../../data'
# folder holding the labelled tweet splits as CSV files
input_path = f'{data_path}/input/labelled_tweets/csv_labels'
# one CSV file per split
train_input_file = f'{input_path}/train.csv'
test_input_file = f'{input_path}/test.csv'
val_input_file = f'{input_path}/val.csv'
# folder where this notebook writes its true labels and predictions
output_path = f'{data_path}/output/02_zero_shot'

## 1. Load Data

In [4]:
# read the three splits from disk (train, validation, test — in that order)
df_train, df_val, df_test = map(
    pd.read_csv,
    (train_input_file, val_input_file, test_input_file),
)

# report the (rows, columns) shape of every split
print(f'Train shape: {df_train.shape}')
print(f'Val shape: {df_val.shape}')
print(f'Test shape: {df_test.shape}')

Train shape: (6957, 3)
Val shape: (987, 3)
Test shape: (1977, 3)


## 2. Preprocessing

### 2.1. Labels to List

In [5]:
# turn the space-separated `labels` string of each split into a list of label names
for frame in (df_train, df_test, df_val):
    frame['labels_list'] = frame['labels'].str.split(' ')

### 2.2. Multi-label Binarization

In [6]:
# gather the label lists of all three splits so the vocabulary covers every label
labels = pd.concat([df_train.labels_list, df_val.labels_list, df_test.labels_list])

# create the binarizer and learn the vocabulary in one step
# (`fit` returns the fitted binarizer itself)
labels_lookup = MultiLabelBinarizer().fit(labels)

# show the vocabulary
vocab = labels_lookup.classes_
print(f'Vocabulary size: {len(vocab)}')
print(f'Vocabulary: {vocab}')


Vocabulary size: 12
Vocabulary: ['conspiracy' 'country' 'ineffective' 'ingredients' 'mandatory' 'none'
 'pharma' 'political' 'religious' 'rushed' 'side-effect' 'unnecessary']


In [7]:
# store the multi-hot vector of every row in a `labels_encoded` column, split by split
for frame in (df_train, df_val, df_test):
    frame['labels_encoded'] = labels_lookup.transform(frame.labels_list).tolist()

In [8]:
def _with_one_hot_labels(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `frame` with one 0/1 column per label appended.

    The columns are named after `labels_lookup.classes_` and are computed from
    the `labels_list` column. Label columns left over from a previous run of
    this cell are dropped first, so re-running the cell does not fail with a
    "columns overlap" error from `DataFrame.join`.
    """
    one_hot = pd.DataFrame(labels_lookup.transform(frame.labels_list),
                           columns=labels_lookup.classes_,
                           index=frame.index)

    # drop stale label columns (if any) so the join below is idempotent
    return frame.drop(columns=one_hot.columns, errors='ignore').join(one_hot)


# add the one-hot encoded labels as columns to the data frames
df_train = _with_one_hot_labels(df_train)
df_val = _with_one_hot_labels(df_val)
df_test = _with_one_hot_labels(df_test)

## 3. Classification

### 3.1. Create Classifier

In [9]:
# the model that will be used for classification
model_name = 'facebook/bart-large-mnli'

# create the classifier
# NOTE(review): no `device` is passed, so the pipeline picks its default
# device — confirm whether GPU inference was intended here.
classifier = pipeline("zero-shot-classification",
                      model=model_name)

2024-05-20 08:36:58.367765: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-20 08:36:58.396199: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-20 08:36:58.396406: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

### 3.2. Get Standardized Predictions

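Reorder the scores returned by the classifier so that they follow the order of the label vocabulary (`labels_lookup.classes_`) and can be compared position by position with the one-hot encoded true labels.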

In [10]:
def standardize_prediction(prediction: Dict, vocabulary:List[str]) -> List[float]:
    """
    Reorder the scores of a prediction so they follow `vocabulary`.

    `prediction` must provide two parallel lists under the keys 'labels' and
    'scores'. The result holds, for every label of `vocabulary` (in that
    order), the score found at that label's position in `prediction['labels']`.

    Raises ValueError if a vocabulary label is missing from the prediction.
    """
    ordered_scores = []
    for label in vocabulary:
        scores = prediction['scores']
        position = prediction['labels'].index(label)
        ordered_scores.append(scores[position])
    return ordered_scores

### 3.3. Get Predictions

In [11]:
def get_prediction(text:str, classifier, vocabulary:List[str]) -> List[float]:
    """
    Classify a single text and return one score per vocabulary label.

    `classifier` is called with `vocabulary` as the candidate labels in
    multi-label mode; its result is then reordered with
    `standardize_prediction` so the scores follow the order of `vocabulary`.
    """
    call_arguments = dict(
        sequences=text,
        candidate_labels=vocabulary,
        hypothesis_template='This concern with the vaccine is about {}.',
        multi_label=True,
    )
    raw_prediction = classifier(**call_arguments)

    return standardize_prediction(raw_prediction, vocabulary)

In [12]:
def predict(X: List[str], vocabulary:List[str], classifier, n_jobs:int=1) -> List[List[float]]:
    """
    Predict the label scores for a list of texts.

    Parameters
    ----------
    X : the texts to classify.
    vocabulary : the candidate labels; every returned score list follows this order.
    classifier : the callable handed to `get_prediction` for every text.
    n_jobs : number of parallel jobs. With 1 the texts are processed
        sequentially with a progress bar; any other value is passed on to `pqdm`.

    Returns
    -------
    One list of scores per input text, in the order of `X`.
    """
    if n_jobs == 1:
        # sequential processing with a progress bar
        return [get_prediction(text, classifier, vocabulary) for text in tqdm(X)]

    # create the partial function for parallel processing
    get_prediction_partial = partial(get_prediction, classifier=classifier, vocabulary=vocabulary)

    # perform parallel processing with the number of jobs the caller asked for
    # (this used to be hard-coded to 5, silently ignoring `n_jobs`)
    # NOTE(review): pqdm may return raised exceptions as list items instead of
    # re-raising them, depending on its `exception_behaviour` setting — confirm.
    return pqdm(X, get_prediction_partial, n_jobs=n_jobs)

## 4. Evaluating the model

In [13]:
class Evaluation:
    """Collection of multi-label evaluation metrics, grouped as static methods."""

    @staticmethod
    def f1_score_macro(y_true, y_pred):
        """Calculate F1-score (Macro-Average)."""
        return f1_score(y_true, y_pred, average='macro', zero_division=0)

    @staticmethod
    def f1_score_weighted(y_true, y_pred):
        """Calculate F1-score (Weighted-Average)."""
        return f1_score(y_true, y_pred, average='weighted', zero_division=0)

    @staticmethod
    def jaccard_similarity(y_true, y_pred):
        """Calculate average Jaccard Similarity."""
        return jaccard_score(y_true, y_pred, average='samples')

    @staticmethod
    def subset_accuracy(y_true, y_pred):
        """Calculate Subset Accuracy (Exact Match Accuracy)."""
        return accuracy_score(y_true, y_pred)

    @staticmethod
    def evaluate_all(y_true,
                     y_pred,
                     threshold:float=0.5):
        """
        Evaluate all metrics and display a summary.

        `y_pred` holds one list of scores per sample. A score strictly greater
        than `threshold` counts as a predicted label (1), anything else as 0.
        The four metrics are printed; nothing is returned.
        """
        # Convert predictions to binary
        y_pred_bin = [[int(prob > threshold) for prob in pred] for pred in y_pred]

        f1_macro = Evaluation.f1_score_macro(y_true, y_pred_bin)
        f1_weighted = Evaluation.f1_score_weighted(y_true, y_pred_bin)
        jaccard_similarity = Evaluation.jaccard_similarity(y_true, y_pred_bin)
        subset_accuracy = Evaluation.subset_accuracy(y_true, y_pred_bin)

        # Display a summary of the evaluation
        print(f"F1 Score (Macro-Average)   \t{f1_macro:.3f}")
        print(f"F1 Score (Weighted-Average)\t{f1_weighted:.3f}")
        print(f"Average Jaccard Similarity \t{jaccard_similarity:.3f}")
        print(f"Subset Accuracy            \t{subset_accuracy:.3f}")

### 4.1. Perform Predictions

In [14]:
# split that is evaluated in the rest of this section
# NOTE(review): the notebook title says "Train Data" but the test split is
# used here — confirm which of the two is intended.
data = df_test

# get the true values
y_true = data[vocab].values.tolist()

# get the predictions
# (n_jobs != 1 selects the parallel `pqdm` branch of `predict`)
y_pred = predict(X=data.text.tolist(),
        vocabulary=vocab.tolist(),
        classifier=classifier,
        n_jobs=2)

QUEUEING TASKS | :   0%|          | 0/1977 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1977 [00:00<?, ?it/s]

In [None]:
# make sure the output folder exists; `np.save` does not create missing
# directories and would otherwise fail after the long prediction run
os.makedirs(output_path, exist_ok=True)

# NOTE(review): the file names carry the prefix `02-01` although this notebook
# is numbered 02-02 — confirm they do not overwrite another notebook's output.

# save y_true to a file
np.save(f'{output_path}/02-01_bart-large-mnli_test_true.npy', y_true)

# save the predictions to a file
np.save(f'{output_path}/02-01_bart-large-mnli_test.npy', y_pred)

### 4.2. Classification Report

In [None]:
def show_classification_report(data:pd.DataFrame,
                               y_pred:np.ndarray,
                               threshold:float=0.5):
    """
    Print the per-label classification report for a set of predictions.

    Parameters
    ----------
    data : data frame holding one 0/1 column per label of the module-level
        `vocab`; these columns are used as the true labels.
    y_pred : one sequence of scores per row of `data`, ordered like `vocab`.
    threshold : a score strictly greater than this value counts as a
        predicted label.
    """
    # get the true labels
    y_true = data[vocab].values

    # Convert predictions to binary
    y_pred_bin = [[int(prob > threshold) for prob in pred] for pred in y_pred]

    # show the classification report; `zero_division=0` matches the F1 metrics
    # of `Evaluation` and avoids a warning for labels that are never predicted
    print(classification_report(y_true, y_pred_bin, target_names=vocab, zero_division=0))

In [None]:
# show the test classification report
# (uses the default threshold of 0.5 to binarize the scores)
show_classification_report(data, y_pred)

### 4.3. Full Report

In [None]:
# print macro / weighted F1, average Jaccard similarity and subset accuracy
# at the default threshold of 0.5
Evaluation.evaluate_all(y_true, y_pred)