In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.models import Model
from transformers import TFAutoModel, AutoTokenizer

# Load the train and test datasets
train_df = pd.read_csv('/kaggle/input/watson/train.csv')
test_df = pd.read_csv('/kaggle/input/watson/test.csv')

# Define the mapping of language abbreviations to full names
language_mapping = {
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'hi': 'Hindi',
    'ru': 'Russian',
    'sw': 'Swahili',
    'th': 'Thai',
    'tr': 'Turkish',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh': 'Chinese',
}

# Convert language abbreviations to full names in the train and test datasets.
# Values that are not keys of the mapping are left unchanged.
train_df['language'] = train_df['language'].apply(lambda x: language_mapping.get(x, x))
test_df['language'] = test_df['language'].apply(lambda x: language_mapping.get(x, x))

# Define the maximum sequence length for the inputs
MAX_LEN = 128

# Load the pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the train and test data using the BERT tokenizer.
# Each (premise, hypothesis) pair is encoded as one sequence, padded or
# truncated to MAX_LEN tokens.
train_encoded = tokenizer(
    list(train_df['premise']),
    list(train_df['hypothesis']),
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='tf',
)
test_encoded = tokenizer(
    list(test_df['premise']),
    list(test_df['hypothesis']),
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='tf',
)

# Convert the labels to a numpy array. Keep the full array under its own name
# so that both the training and the validation slices are taken from it.
all_labels = np.array(train_df['label'])

# Split the train dataset into a training set and a validation set.
# The split is positional: the first (1 - VALID_SPLIT) fraction of rows is used
# for training and the remainder for validation.
# NOTE(review): rows are not shuffled before splitting -- confirm that the CSV
# is not ordered by label or language.
VALID_SPLIT = 0.2
idx = int(len(train_encoded['input_ids']) * (1 - VALID_SPLIT))
train_inputs = {k: v[:idx] for k, v in train_encoded.items()}
val_inputs = {k: v[idx:] for k, v in train_encoded.items()}
# Slice both label sets from the full array. Slicing the validation labels
# from the already-truncated training labels would yield an empty array.
train_labels = all_labels[:idx]
val_labels = all_labels[idx:]

# Define the BERT-based NLI model
def create_model(learning_rate=2e-5):
    """Build and compile the BERT-based NLI classifier.

    The model feeds the three tokenizer outputs (``input_ids``,
    ``attention_mask``, ``token_type_ids``), each of length ``MAX_LEN``, through
    the pre-trained ``bert-base-multilingual-cased`` encoder, then through a
    small dense head ending in a 3-way softmax.

    Args:
        learning_rate: Step size for the Adam optimizer. Defaults to 2e-5;
            Keras's default of 1e-3 (what ``optimizer='adam'`` selects) is
            generally too large when fine-tuning all of the BERT weights.

    Returns:
        A compiled ``tf.keras.Model`` that takes the three named inputs and
        outputs class probabilities of shape ``(batch, 3)``.
    """
    bert_model = TFAutoModel.from_pretrained('bert-base-multilingual-cased')

    # Input names match the tokenizer's output keys, so a dict of encoded
    # tensors can be passed straight to fit/predict.
    input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='token_type_ids')

    # Element 1 of the encoder output is used as the sentence-pair
    # representation (the pooled output, per the Hugging Face TFBertModel docs).
    bert_output = bert_model({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
    })[1]

    # Classification head: dropout -> 128 -> dropout -> 64 -> 3-way softmax.
    bert_output = Dropout(0.2)(bert_output)
    dense1 = Dense(128, activation='relu')(bert_output)
    dense1 = Dropout(0.2)(dense1)
    dense2 = Dense(64, activation='relu')(dense1)
    output = Dense(3, activation='softmax')(dense2)

    model = Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)
    # Labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

# Create the BERT-based NLI model
model = create_model()

# Train the model on the train dataset, reporting validation metrics per epoch
BATCH_SIZE = 32
EPOCHS = 3
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_inputs, val_labels),
)

# Evaluate the model on the validation set
model.evaluate(val_inputs, val_labels)

# Make predictions on the test set.
# The tokenizer output is converted to a plain dict first: the train and
# validation inputs are already plain dicts, and Keras may not accept the
# tokenizer's own container type as model input.
test_inputs = dict(test_encoded)
test_probabilities = model.predict(test_inputs, batch_size=BATCH_SIZE)
# Pick the most probable of the three classes for each test row.
test_predictions = np.argmax(test_probabilities, axis=-1)

# Save the predictions to a CSV file in the required format
submission_df = pd.DataFrame({'id': test_df['id'], 'prediction': test_predictions})
submission_df.to_csv('submission.csv', index=False)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3
 11/303 [>.............................] - ETA: 1:48:21 - loss: 1.3434 - accuracy: 0.3210