<a href="https://www.kaggle.com/code/lonnieqin/natural-language-inference-with-bert?scriptVersionId=113732726" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Natural Language Inference with BERT

In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel, AutoTokenizer, TFAutoModel
import matplotlib.pyplot as plt
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import gc

## Distribution Strategy

In [None]:
# Choose the tf.distribute strategy used by the rest of the notebook: connect to
# a TPU when one can be resolved, otherwise fall back to the default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy takes the same cluster resolver.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError during TPU setup is treated as "no TPU available".
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU
print('Number of replicas:', strategy.num_replicas_in_sync)

## Configuration

In [None]:
class CFG:
    """Notebook-wide hyperparameters shared by the data pipeline and the model."""

    # Token length every premise/hypothesis pair is padded or truncated to;
    # also the length of the model's input layers.
    sequence_length = 64
    # Global batch size: 16 examples for each replica of the active strategy.
    batch_size = 16 * strategy.num_replicas_in_sync

## Load data

In [None]:
# Load the competition training set from the Kaggle input directory.
# Later cells read its 'premise', 'hypothesis' and 'label' columns.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
# Display the first rows as the cell output.
train.head()

## Load Pretrained model

In [None]:
# Instantiate the tokenizer and the pretrained encoder inside the distribution
# strategy scope, so the encoder is created under that strategy.
with strategy.scope():
    # Hugging Face Hub checkpoint id used for both the tokenizer and the encoder.
    model_roBerta ='joeddav/xlm-roberta-large-xnli'
    tokenizer = AutoTokenizer.from_pretrained(model_roBerta)
    encoder = TFAutoModel.from_pretrained(model_roBerta)
    # Run a garbage-collection pass once loading has finished.
    gc.collect()

## Load MNLI dataset
You can learn more about this dataset [here](https://huggingface.co/datasets/multi_nli).

In [None]:
def load_mnli(use_validation=True):
    """Load the MultiNLI training split as a DataFrame shaped like the competition data.

    Records are kept only when both the premise and the hypothesis are
    non-empty and the label is one of 0, 1 or 2. Every kept row is tagged
    with the language abbreviation 'en'.

    Args:
        use_validation: Currently unused; only the 'train' split is read.

    Returns:
        A pandas DataFrame with the columns 'premise', 'hypothesis', 'label'
        and 'lang_abv'.
    """
    dataset = load_dataset('multi_nli')
    print(dataset)
    valid_labels = {0, 1, 2}
    rows = [
        (record['premise'], record['hypothesis'], record['label'], 'en')
        for record in dataset['train']
        if record['premise'] and record['hypothesis'] and record['label'] in valid_labels
    ]
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

In [None]:
# Load the filtered MNLI training rows via load_mnli (which also prints the
# raw dataset summary) and display the resulting DataFrame as the cell output.
mnli = load_mnli()
mnli

In [None]:
# Append MNLI rows to the competition data. `.loc` slices by index label and
# includes the end label, so this takes the rows labelled 0 through 100000.
# The index is not reset, so index labels repeat in the combined frame.
train = pd.concat((train, mnli.loc[:100000]))
train.head()

## Create tensorflow dataset

In [None]:
def bert_encode(df, tokenizer):
    """Tokenize premise/hypothesis pairs into fixed-length model inputs.

    Args:
        df: Table with 'premise' and 'hypothesis' columns and, optionally,
            a 'label' column.
        tokenizer: Callable tokenizer invoked on the two lists of sentences.

    Returns:
        A dict with the tokenizer's 'input_ids' and 'attention_mask' outputs
        (requested as TensorFlow tensors, padded/truncated to
        CFG.sequence_length). When `df` has a 'label' column it is added
        unchanged under the 'label' key.
    """
    encoded = tokenizer(
        df['premise'].tolist(),
        df['hypothesis'].tolist(),
        max_length=CFG.sequence_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    # Keep only the two tensors the model consumes.
    inputs = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    if "label" in df:
        inputs["label"] = df["label"]
    return inputs

def preprocess(features):
    """Split a feature mapping into the (inputs, labels) pair Keras expects.

    The "label" entry is removed from `features` in place; the same mapping
    object, now without that key, is returned as the first tuple element.
    """
    target = features["label"]
    del features["label"]
    return features, target
def make_dataset(df, mode="train", shuffle_buffer=256):
    """Build the batched, endlessly repeating tf.data pipeline passed to Keras.

    Args:
        df: Mapping of feature name to array-like, as returned by
            bert_encode. It must contain a "label" entry, which preprocess
            splits off as the target.
        mode: "train" enables shuffling; any other value keeps input order.
        shuffle_buffer: Number of examples held in the shuffle buffer.

    Returns:
        A tf.data.Dataset yielding (features, labels) batches of
        CFG.batch_size examples. It repeats indefinitely, so the caller must
        bound iteration (e.g. with steps_per_epoch / validation_steps).
    """
    ds = tf.data.Dataset.from_tensor_slices(df)
    ds = ds.map(preprocess)
    # Cache BEFORE shuffling: caching after shuffle/batch would record the
    # first epoch's order and replay that same order on every later epoch.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(CFG.batch_size)
    # Prefetch last so input preparation overlaps with the training step.
    return ds.repeat().prefetch(tf.data.AUTOTUNE)

In [None]:
%%time
# Hold out 20% of the combined data for validation (fixed seed for repeatability).
train_data, valid_data = train_test_split(train, test_size=0.2, random_state=2)
# Tokenize each split once, up front.
train_input = bert_encode(train_data, tokenizer)
valid_input = bert_encode(valid_data, tokenizer)
# Only the training pipeline shuffles; mode="valid" keeps the input order.
train_ds = make_dataset(train_input)
valid_ds = make_dataset(valid_input, mode="valid")

Take a look at this dataset.

In [None]:
# Print a single (features, labels) batch to inspect the pipeline output.
for item in train_ds.take(1):
    print(item)

The dataset is relatively balanced. However, I would like to pass the `class_weight` parameter to the Keras training method, since this can still improve the score a little bit.

In [None]:
# Bar chart of how many training rows carry each label value.
train_data["label"].value_counts().plot(kind="bar")

In [None]:
# Inverse-frequency class weights (total rows / rows per label), rescaled so
# the weights sum to 1, then converted to the {label: weight} dict form that
# is passed to model.fit(class_weight=...).
class_weight = (
    len(train_data["label"]) / train_data["label"].value_counts()
).pipe(lambda inverse_freq: dict(inverse_freq / inverse_freq.sum()))
class_weight

## Calculate class weight

## Building Model

In [None]:
from tensorflow.keras import regularizers

def build_model():
    """Build and compile the 3-class classifier on top of the pretrained encoder.

    The model takes "input_ids" and "attention_mask" inputs of length
    CFG.sequence_length, runs them through the module-level `encoder`,
    average-pools the encoder's first output over axis 1, applies dropout and
    finishes with a 3-way softmax layer.

    Returns:
        A compiled tf.keras.Model (Adam, sparse categorical cross-entropy,
        accuracy metric).
    """
    input_ids = tf.keras.Input(shape=(CFG.sequence_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(CFG.sequence_length,), dtype=tf.int32, name="attention_mask")
    # [0] picks the encoder's first output — presumably the per-token hidden
    # states (batch, sequence, hidden); confirm against the encoder's docs.
    embedding = encoder([input_ids, attention_mask])[0]
    # NOTE(review): the pooling receives no mask, so padded positions are
    # averaged in together with real tokens.
    vector = tf.keras.layers.GlobalAveragePooling1D()(embedding)
    vector = tf.keras.layers.Dropout(0.3)(vector)
    output = tf.keras.layers.Dense(3, activation='softmax')(vector)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    # `lr` is a deprecated alias that newer Keras versions warn about or
    # reject; `learning_rate` is the supported argument name.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build and compile the classifier inside the distribution strategy scope,
# then print its layer summary.
with strategy.scope():
    model = build_model()
    model.summary() 

In [None]:
# Both datasets repeat forever, so Keras needs explicit step counts: the
# number of full batches contained in each split.
steps_per_epoch, validation_steps = (
    len(split) // CFG.batch_size for split in (train_data, valid_data)
)

In [None]:
# Train with early stopping on validation accuracy, then plot the metrics
# recorded for each epoch.
with strategy.scope():
    # Stop after 5 epochs without val_accuracy improvement and restore the
    # weights from the best epoch.
    es = tf.keras.callbacks.EarlyStopping(
        monitor="val_accuracy",
        patience=5,
        restore_best_weights=True,
    )
    history = model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=20,
        # Explicit step counts are required because both datasets repeat.
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        class_weight=class_weight,
        callbacks=[es],
    )
    pd.DataFrame(history.history).plot()

## Create Submission file

In [None]:
# Predict a label for every test row and write the submission file.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
# The test frame is encoded the same way as the training data; bert_encode
# adds a "label" entry only when the frame has that column.
test_input = bert_encode(test, tokenizer)
test_ds = tf.data.Dataset.from_tensor_slices(test_input).batch(CFG.batch_size)
# The predicted class is the index of the largest softmax output per row.
predictions = model.predict(test_ds).argmax(axis=1)
submission = test[["id"]].assign(prediction=predictions)
submission.to_csv("submission.csv", index=False)
submission.head()