
# Step 0: Install necessary packages



In [None]:

!pip install transformers datasets torch


# Step 1: Import Libraries

In [None]:


from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import random


# Step 2: Load and Tokenize the English Dataset

# Download the English configuration of the XNLI dataset.

In [None]:

# Load the 'en' configuration of the XNLI dataset.
dataset = load_dataset(path='xnli', name='en')

# Build the tokenizer that belongs to the multilingual (cased) BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
)

# Define the function that tokenizes the premise and hypothesis text.
def _as_text(field):
    """Return *field* unchanged if it is a string, else join its items with spaces."""
    return field if isinstance(field, str) else " ".join(field)


def tokenize_function(examples, *, max_length=None):
    """Tokenize a batch of premise/hypothesis pairs with the module-level tokenizer.

    Args:
        examples: Batch mapping with 'premise' and 'hypothesis' columns (the
            shape ``Dataset.map(..., batched=True)`` passes in). Each entry
            is either a string or an iterable of string tokens.
        max_length: Optional length to pad/truncate every pair to. ``None``
            (the default) forwards no explicit limit, so the tokenizer falls
            back to its own configured maximum, exactly as before. Supply a
            smaller value, e.g. via ``map(..., fn_kwargs={'max_length': 128})``,
            to avoid padding short sentence pairs to that maximum.

    Returns:
        The tokenizer's encoding for the batch.
    """
    # Some inputs may arrive pre-split into tokens; normalise them to plain text.
    premise = [_as_text(entry) for entry in examples['premise']]
    hypothesis = [_as_text(entry) for entry in examples['hypothesis']]
    return tokenizer(
        premise,
        hypothesis,
        padding="max_length",  # fixed-length rows so batches stack without a padding collator
        truncation=True,
        max_length=max_length,
    )

# Apply the tokenization function to the dataset.
# Run the tokenizer over the dataset, one batch of examples per call.
# NOTE(review): `dataset` is loaded without a split argument, so this map()
# presumably tokenizes every split in full even though only small samples of
# 'train' and 'validation' are used later — confirm and consider sampling first.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

# Return PyTorch tensors and expose only the listed columns.
# NOTE(review): 'token_type_ids' is not in this list. For sentence pairs a BERT
# tokenizer normally also produces segment ids — confirm the omission is
# intended. If it is changed, every set_format call feeding the same model must
# be changed identically.
tokenized_datasets.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)



# Step 3: Sample a Subset for Training and Validation
# For quicker training, we randomly sample a small subset.


In [None]:
# Seed the RNG so the same rows are drawn on every run.
random.seed(42)

train_split = tokenized_datasets['train']
val_split = tokenized_datasets['validation']

# Draw row indices without replacement: 1000 for training, 500 for validation.
# The training draw happens first so the sequence of RNG calls is fixed.
train_indices = random.sample(range(len(train_split)), 1000)
val_indices = random.sample(range(len(val_split)), 500)

# Materialise the sampled subsets.
train_dataset = train_split.select(train_indices)
val_dataset = val_split.select(val_indices)


# Step 4: Load the mBERT Model for Sequence Classification

# Create a classification model for 3 labels.

In [None]:
# Load the multilingual BERT checkpoint with a sequence-classification head
# configured for three output labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=3,
)

# Step 5: Set Up Training Arguments and Trainer

In [None]:
# The argument that schedules evaluation is named `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# name the installed TrainingArguments declares, so the cell works with the
# unpinned `transformers` install either way.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',              # Output directory for model predictions and checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,                  # Train for 3 epochs
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_key: "epoch"},     # Evaluate model every epoch
)

def compute_accuracy(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: The prediction object the Trainer hands to
            ``compute_metrics``; its ``predictions`` attribute holds the
            per-class logits and ``label_ids`` the gold labels.

    Returns:
        A dict with a single 'accuracy' entry (fraction of correct predictions).
    """
    predicted_labels = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted_labels == eval_pred.label_ids).mean())}


# Without compute_metrics the Trainer reports only the loss; accuracy makes the
# per-epoch evaluation and any later trainer.evaluate(...) call interpretable.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         # English training dataset
    eval_dataset=val_dataset,            # English validation dataset
    compute_metrics=compute_accuracy,
)

# Fine-tune the model on the English dataset.
trainer.train()


# Step 6: Evaluate the Model on a French Dataset for Cross-Lingual Transfer

# Download the French version of the XNLI dataset.

In [None]:
# Load the 'fr' configuration of the XNLI dataset.
french_dataset = load_dataset(path='xnli', name='fr')

# Tokenize it with the same function used for the English data.
# NOTE(review): no split is selected before map(), so every French split is
# presumably tokenized even though only 'validation' is used below — confirm.
tokenized_french_dataset = french_dataset.map(
    tokenize_function,
    batched=True,
)

# Return PyTorch tensors and expose only the listed columns.
# NOTE(review): 'token_type_ids' is not in this list. Keep the list identical to
# the one used for the training data so the model sees the same inputs at
# evaluation time.
tokenized_french_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# Only the validation split is evaluated.
french_val_dataset = tokenized_french_dataset['validation']

# Score the English-fine-tuned model on the French validation data and report.
results = trainer.evaluate(eval_dataset=french_val_dataset)
print("Evaluation results on French dataset:", results)