
# Step 0: Install necessary packages



In [1]:

!pip install transformers datasets torch




# Step 1: Import Libraries

In [2]:


from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import random


# Step 2: Load and Tokenize the English Dataset

# Download the XNLI dataset for English language.

In [3]:

dataset = load_dataset('xnli', 'en')

# Initialize the mBERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Define the function that tokenizes the premise and hypothesis text.
def tokenize_function(examples):
    # Convert the list of tokens (if not plain strings) to strings
    premise = [ex if isinstance(ex, str) else " ".join(ex) for ex in examples['premise']]
    hypothesis = [ex if isinstance(ex, str) else " ".join(ex) for ex in examples['hypothesis']]
    return tokenizer(premise, hypothesis, padding="max_length", truncation=True)

# Apply the tokenization function to the dataset.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Set output format for PyTorch.
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])



In [11]:
# Display first 5 examples from the training set
print("Training set examples:")
for i in range(5):
    print(f"\nExample {i+1}:")
    print(f"Premise: {dataset['train'][i]['premise']}")
    print(f"Hypothesis: {dataset['train'][i]['hypothesis']}")
    print(f"Label: {dataset['train'][i]['label']}")

Training set examples:

Example 1:
Premise: Conceptually cream skimming has two basic dimensions - product and geography .
Hypothesis: Product and geography are what make cream skimming work .
Label: 1

Example 2:
Premise: you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him
Hypothesis: You lose the things to the following level if the people recall .
Label: 0

Example 3:
Premise: One of our number will carry out your instructions minutely .
Hypothesis: A member of my team will execute your orders with immense precision .
Label: 0

Example 4:
Premise: How do you know ? All this is their information again .
Hypothesis: This information belongs to them .
Label: 0

Example 5:
Premise: yeah i tell you what though if you go price some of those tennis shoes i can see w

In [12]:
# Accessing different splits
train_data = dataset['train']
test_data = dataset['test']
validation_data = dataset['validation']

In [26]:
import pandas as pd
pd.DataFrame(test_data.select(range(5)))

Unnamed: 0,premise,hypothesis,label
0,"Well, I wasn't even thinking about that, but I...",I havent spoken to him again.,2
1,"Well, I wasn't even thinking about that, but I...",I was so upset that I just started talking to ...,0
2,"Well, I wasn't even thinking about that, but I...",We had a great talk.,1
3,"And I thought that was a privilege, and it's s...",I was not aware that I was not the only person...,1
4,"And I thought that was a privilege, and it's s...",I was under the impression that I was the only...,0


# Step 3: Sample a Subset for Training and Validation
# For quicker training, we randomly sample a small subset.


In [4]:
random.seed(42)
train_indices = random.sample(range(len(tokenized_datasets['train'])), 1000)
val_indices = random.sample(range(len(tokenized_datasets['validation'])), 500)

train_dataset = tokenized_datasets['train'].select(train_indices)
val_dataset = tokenized_datasets['validation'].select(val_indices)


# Step 4: Load the mBERT Model for Sequence Classification

# Create a classification model for 3 labels.

In [5]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 5: Set Up Training Arguments and Trainer

In [30]:
!pip install transformers datasets torch accelerate>=0.26.0

In [6]:
training_args = TrainingArguments(
    output_dir='./results',              # Output directory for model predictions and checkpoints
    evaluation_strategy="epoch",         # Evaluate model every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,                  # Train for 3 epochs
    weight_decay=0.01,
    fp16=True,                           # Use mixed precision training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         # English training dataset
    eval_dataset=val_dataset,            # English validation dataset
)

# Fine-tune the model on the English dataset.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.113663


KeyboardInterrupt: 


# Step 6: Evaluate the Model on a French Dataset for Cross-Lingual Transfer

# Download the French version of the XNLI dataset.

In [None]:
french_dataset = load_dataset('xnli', 'fr')

# Tokenize the French dataset using the same function.
tokenized_french_dataset = french_dataset.map(tokenize_function, batched=True)
tokenized_french_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Use the validation portion for evaluation.
french_val_dataset = tokenized_french_dataset['validation']

# Evaluate the model on the French validation set.
results = trainer.evaluate(french_val_dataset)
print("Evaluation results on French dataset:", results)