<a href="https://www.kaggle.com/code/lonnieqin/natural-language-inference-with-distilbert?scriptVersionId=113811132" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Natural Language Inference with DistilBert

In [None]:
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import gc

## Distribution Strategy

In [None]:
# Pick the distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to the default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy takes the same resolver argument.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the TPU setup is treated as "no TPU available".
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU
print('Number of replicas:', strategy.num_replicas_in_sync)

## Configuration

In [None]:
class CFG:
    """Notebook-wide settings shared by the data pipeline and the model."""

    # Whether MNLI rows are appended to the training split.
    add_external_dataset = True
    # Length every tokenized example is padded or truncated to.
    sequence_length = 128
    # Global batch size: 16 examples for each replica of the strategy.
    batch_size = 16 * strategy.num_replicas_in_sync

## Load data

In [None]:
# Read the training CSV from the notebook's input directory and preview
# its first rows.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
train.head()

## Load Pretrained model

In [None]:
# Load the pretrained DistilBERT tokenizer and encoder under the active
# distribution strategy; both are reused by the cells below.
with strategy.scope():
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    encoder = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    # Run a garbage-collection pass once loading is done.
    gc.collect()

In [None]:
# Print the layer/parameter summary of the pretrained encoder.
encoder.summary()

Let's look at this pretrained model's input and output. When we pass a tokenized batch of texts to the pretrained model, it returns a dictionary-like object with the key `last_hidden_state`.

In [None]:
# Small demo: tokenize two sentences exactly like the real pipeline does
# (fixed length, special tokens, TF tensors) and run them through the encoder.
texts = ["hello world.", "how are you doing?"]
text_preprocessed = tokenizer(
    texts,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=CFG.sequence_length,
    return_tensors='tf',
)
output = encoder(text_preprocessed)
# Leave the encoder output as the last expression so the notebook shows it.
output

## Text Preprocessing

In [None]:
def preprocess_data(df):
    """Add a ``text`` column joining each premise with its hypothesis.

    The two sentences are separated by a single ``[SEP]`` marker. The leading
    ``[CLS]`` and the trailing ``[SEP]`` are deliberately not written here:
    the tokenizer calls in this notebook pass ``add_special_tokens=True`` and
    insert them, so spelling them out as well would duplicate both tokens in
    every encoded sequence.

    Args:
        df: DataFrame with string columns ``premise`` and ``hypothesis``.

    Returns:
        The same DataFrame object, modified in place with the new ``text``
        column.
    """
    df["text"] = df["premise"] + " [SEP] " + df["hypothesis"]
    return df

In [None]:
%%time
# Add the combined premise/hypothesis "text" column to the training frame
# and preview the result.
train = preprocess_data(train)
train.head()

## Load MNLI dataset
You can learn more about this dataset [here](https://huggingface.co/datasets/multi_nli).

In [None]:
def load_mnli(use_validation=True):
    """Load the ``train`` split of ``multi_nli`` into a DataFrame.

    A record is kept only when its premise and hypothesis are both truthy
    (non-empty) and its label is 0, 1 or 2. Every kept row is tagged with
    the language code ``'en'``.

    Args:
        use_validation: Accepted for compatibility; not used by this
            implementation.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv``.
    """
    valid_labels = {0, 1, 2}
    dataset = load_dataset('multi_nli')
    print(dataset)
    rows = [
        (record['premise'], record['hypothesis'], record['label'], 'en')
        for record in dataset['train']
        if record['premise']
        and record['hypothesis']
        and record['label'] in valid_labels
    ]
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

In [None]:
# Load the MNLI rows into a DataFrame and preview them.
mnli = load_mnli()
mnli.head()

In [None]:
# Give the MNLI frame the same combined "text" column as the training data.
mnli = preprocess_data(mnli)
mnli.head()

## Create TensorFlow dataset

In [None]:
def bert_encode(df, tokenizer):
    """Tokenize the ``text`` column of ``df`` into fixed-length TF tensors.

    Every text is padded or truncated to ``CFG.sequence_length`` tokens.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a ``label``
            column.
        tokenizer: Callable tokenizer applied to the list of texts.

    Returns:
        The tokenizer output, with ``df["label"]`` stored under the
        ``"label"`` key when ``df`` has that column.
    """
    encoded = tokenizer(
        df['text'].tolist(),
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=CFG.sequence_length,
        return_tensors='tf',
    )
    if "label" not in df.keys():
        return encoded
    encoded["label"] = df["label"]
    return encoded

def preprocess(features):
    """Split a feature mapping into an ``(input_ids, label)`` pair.

    Both keys are removed from ``features``; any other entries stay in the
    mapping and are not returned.
    """
    return features.pop("input_ids"), features.pop("label")
def make_dataset(df, mode="train", shuffle_buffer=256):
    """Build an endlessly repeating, batched ``tf.data`` pipeline.

    Stage order matters here:

    * ``cache()`` comes before ``shuffle()``. A cache replays exactly the
      elements it recorded, so caching after shuffling would freeze the
      first epoch's order for every later epoch.
    * ``prefetch()`` is the final stage so it overlaps the whole pipeline
      with training.

    Args:
        df: Mapping of encoded features containing at least ``input_ids``
            and ``label`` (as produced by ``bert_encode``).
        mode: ``"train"`` enables shuffling; any other value keeps the
            original order.
        shuffle_buffer: Number of examples held in the shuffle buffer when
            ``mode == "train"``.

    Returns:
        A dataset yielding ``(input_ids, labels)`` batches of
        ``CFG.batch_size`` examples. It repeats indefinitely, so callers
        must bound iteration (e.g. ``steps_per_epoch``).
    """
    ds = tf.data.Dataset.from_tensor_slices(df)
    # Reduce each example to (input_ids, label) once, then cache that result.
    ds = ds.map(preprocess).cache()
    if mode == "train":
        # Shuffling after the cache gives a fresh order on every pass.
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(CFG.batch_size).repeat()
    return ds.prefetch(tf.data.AUTOTUNE)

In [None]:
%%time
# Hold out 20% of the competition data for validation (fixed seed).
train_data, valid_data = train_test_split(train, test_size=0.2, random_state=2)
if CFG.add_external_dataset:
    # .iloc is positional and end-exclusive, so exactly the first 100000 MNLI
    # rows are appended; the label-based .loc[:100000] is end-inclusive.
    # ignore_index avoids duplicate index labels between the two frames.
    train_data = pd.concat(
        [train_data, mnli.iloc[:100000]], axis=0, ignore_index=True
    )
# Kept at top level so the notebook actually renders the preview; inside the
# `if` body the expression's value was discarded.
train_data.head()

In [None]:
%%time
# Training split: tokenize, then build the shuffled training pipeline.
train_input = bert_encode(train_data, tokenizer)
train_ds = make_dataset(train_input, mode="train")

# Validation split: same tokenization, pipeline built in non-training mode.
valid_input = bert_encode(valid_data, tokenizer)
valid_ds = make_dataset(valid_input, mode="valid")

Take a look at what a batch of training data looks like.

In [None]:
# Pull a single element from the training pipeline and print it for inspection.
for item in train_ds.take(1):
    print(item)

## Calculate class weights

The dataset is relatively balanced. However, I would still like to pass the `class_weight` parameter to the Keras training method, since this can often improve the score a little.

In [None]:
# Bar chart of how many training rows carry each label.
train_data["label"].value_counts().plot(kind="bar")

In [None]:
# Inverse-frequency class weights: total rows divided by the per-label
# count, then rescaled so the weights sum to 1.
class_weight = train_data["label"].value_counts()
class_weight = len(train_data["label"]) / class_weight
class_weight = dict(class_weight / class_weight.sum())
class_weight

## Building Model

In [None]:
def build_model():
    """Build and compile the 3-class classifier on top of the shared encoder.

    The model takes ``input_ids`` of length ``CFG.sequence_length``, averages
    the encoder's ``last_hidden_state`` over the sequence axis, applies
    dropout and ends in a 3-way softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` using Adam with learning rate 1e-5,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    inputs = tf.keras.Input(shape=(CFG.sequence_length,), dtype=tf.int32, name="input_ids")
    # NOTE(review): only input_ids are passed to the encoder (no
    # attention_mask) and the pooling below averages every position, so for
    # inputs padded to a fixed length the padding positions take part too.
    embedding = encoder(inputs)["last_hidden_state"]
    vector = tf.keras.layers.GlobalAveragePooling1D()(embedding)
    vector = tf.keras.layers.Dropout(0.3)(vector)
    output = tf.keras.layers.Dense(3, activation='softmax')(vector)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(
        # `lr` is a deprecated alias that newer Keras optimizers reject;
        # `learning_rate` is the supported keyword.
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Create the model under the distribution strategy's scope.
with strategy.scope():
    model = build_model()

# Printing the summary does not need the scope.
model.summary()

In [None]:
# Render the model graph, including the tensor shapes of each layer.
tf.keras.utils.plot_model(model, show_shapes=True)

## Training the model
When I train the model on a TPU and try to save the whole model, an error occurs. I work around this by saving only the weights, in h5 format. Training the model on a GPU doesn't have this issue.

In [None]:
with strategy.scope():
    # The datasets repeat indefinitely, so the number of batches per epoch
    # and per validation pass has to be given explicitly.
    steps_per_epoch = train_data.shape[0] // CFG.batch_size
    validation_steps = valid_data.shape[0] // CFG.batch_size
    # restore_best_weights is an EarlyStopping option (ModelCheckpoint has no
    # such argument). Setting it here leaves the model with its best
    # val_accuracy weights when training stops early, instead of the weights
    # from the last of the `patience` non-improving epochs.
    es = tf.keras.callbacks.EarlyStopping(
        patience=5,
        monitor="val_accuracy",
        restore_best_weights=True,
    )
    # Keep only the best weights on disk; weights-only h5 is used because
    # saving the whole model failed on TPU.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    )
    history = model.fit(
        train_ds,
        epochs=20,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        validation_data=valid_ds,
        class_weight=class_weight,
        callbacks=[es, checkpoint],
    )
    # Plot the per-epoch training and validation metrics.
    pd.DataFrame(history.history).plot()

## Create Submission file

In [None]:
# Load the test file and give it the same "text" column as the training data.
test = preprocess_data(pd.read_csv("../input/contradictory-my-dear-watson/test.csv"))
test_input = bert_encode(test, tokenizer)

# Only the token ids are fed to the model at inference time.
test_ds = tf.data.Dataset.from_tensor_slices(test_input["input_ids"]).batch(CFG.batch_size)

# The predicted class is the index of the highest softmax output per row.
predictions = model.predict(test_ds).argmax(axis=1)

# Write one predicted label per test id.
submission = test[["id"]].copy()
submission["prediction"] = predictions
submission.to_csv("submission.csv", index=False)
submission.head()