# Language classification model using the XLM-RoBERTa transformer.

Load packages

In [1]:
import pandas as pd
from transformers import pipeline, XLMRobertaForSequenceClassification, XLMRobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


Load and Filter Dataset:

Load the dataset and filter it to include only selected languages (English, Spanish, French, German, and Italian). Take quick sample of the filtered dataset for prototyping.

In [2]:
# Step 1: Load and preprocess the dataset
# Load the dataset and filter for the target languages
df = pd.read_csv("sentences.csv", sep="\t", header=None, names=["id", "lang", "text"])
target_languages = ["eng", "spa", "fra", "deu", "ita"]
df_filtered = df[df['lang'].isin(target_languages)]

# Show select sample
df_sample = df_filtered.sample(1000)
print("Sample of the dataset:\n", df_sample.head())

Sample of the dataset:
                 id lang                                          text
5147054    5499292  eng           It's you who should be thanking me.
575746      604688  deu  Oft enthalten Scherze ein Körnchen Wahrheit.
606774      637447  spa                                     ¡Me cago!
7000522    7395789  eng             He said that he was really lucky.
10119196  10576930  spa                    En París se habla francés.


Prepare Data for Training: 

Extract texts and labels, then split the data into training and test sets.

In [3]:
# Step 2: Split data for training and testing
texts = df_sample['text'].tolist()
labels = df_sample['lang'].tolist()

# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

Load Model:

Load the XLM-RoBERTa model for text classification.

In [4]:
# Step 3: Load the pre-trained classification pipeline for initial testing
classifier = pipeline("text-classification", model="xlm-roberta-base")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Make Predictions:

Predict the language on the test set and evaluate results.

In [5]:
# Step 4: Get initial predictions for the test set
predictions = classifier(test_texts)

# Extract predicted labels
predicted_labels = [prediction['label'] for prediction in predictions]

In [6]:
# Evaluate the initial performance
print("Initial Model Accuracy:", accuracy_score(test_labels, predicted_labels))
print("Classification Report:\n", classification_report(test_labels, predicted_labels))

Initial Model Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

     LABEL_1       0.00      0.00      0.00       0.0
         deu       0.00      0.00      0.00      29.0
         eng       0.00      0.00      0.00      83.0
         fra       0.00      0.00      0.00      32.0
         ita       0.00      0.00      0.00      35.0
         spa       0.00      0.00      0.00      21.0

    accuracy                           0.00     200.0
   macro avg       0.00      0.00      0.00     200.0
weighted avg       0.00      0.00      0.00     200.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Define the mapping from label to actual language
label_to_language = {
    'LABEL_0': 'English',
    'LABEL_1': 'Spanish',
    'LABEL_2': 'French',
    'LABEL_3': 'German',
    'LABEL_4': 'Italian'
}

# Step 5: Test the model with example sentences
# Predict the language of example sentences to verify the model's performance on known cases
example_sentences = [
    "This is a test sentence in English.",
    "Esta es una frase de prueba en español.",
    "C'est une phrase de test en français.",
    "Dies ist ein Testsatz auf Deutsch.",
    "Questa è una frase di prova in italiano."
]

# Predict the language and map the label to actual language name
print("Testing example sentences:")
for sentence in example_sentences:
    prediction = classifier(sentence)
    predicted_language = label_to_language[prediction[0]['label']]
    print(f"Sentence: {sentence}")
    print(f"Predicted Language: {predicted_language}\n")

Testing example sentences:
Sentence: This is a test sentence in English.
Predicted Language: Spanish

Sentence: Esta es una frase de prueba en español.
Predicted Language: Spanish

Sentence: C'est une phrase de test en français.
Predicted Language: Spanish

Sentence: Dies ist ein Testsatz auf Deutsch.
Predicted Language: Spanish

Sentence: Questa è una frase di prova in italiano.
Predicted Language: Spanish



Load and prepare multilingual training dataset:

Load dataset (PAWS-X) for training and testing the model, and apply tokenization.

In [8]:
# Step 6: Load PAWS-X for English, Spanish, German, French, and Italian
dataset_en = load_dataset('xtreme', 'PAWS-X.en')
dataset_es = load_dataset('xtreme', 'PAWS-X.es')
dataset_de = load_dataset('xtreme', 'PAWS-X.de')
dataset_fr = load_dataset('xtreme', 'PAWS-X.fr')
dataset_it = load_dataset('xtreme', 'PAN-X.it')

# Concatenate the datasets
train_dataset = concatenate_datasets([dataset_en['train'], dataset_es['train'], dataset_de['train'], dataset_fr['train'], dataset_it['train']])
test_dataset = concatenate_datasets([dataset_en['test'], dataset_es['test'], dataset_de['test'], dataset_fr['test'], dataset_it['test']])

Tokenize dataset

In [9]:
# Initialize the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenization function
def tokenize(batch):
    # Concatenate sentence1 and sentence2, replacing None values with an empty string
    combined_sentences = [(s1 or "") + " " + (s2 or "") for s1, s2 in zip(batch['sentence1'], batch['sentence2'])]
    return tokenizer(combined_sentences, padding="max_length", truncation=True, max_length=128)

Apply tokenization to train and test datasets

In [10]:
train_data = train_dataset.map(tokenize, batched=True)
test_data = test_dataset.map(tokenize, batched=True)

Convert Labels to Integer Format

Filter out samples with None labels and convert labels to integers.

In [11]:
# Convert labels to integers and remove null labels
train_data = train_data.filter(lambda x: x['label'] is not None)
test_data = test_data.filter(lambda x: x['label'] is not None)

# Convert labels to integers for PyTorch compatibility
train_data = train_data.map(lambda x: {'label': int(x['label'])})
test_data = test_data.map(lambda x: {'label': int(x['label'])})

Set format for PyTorch:

Prepare the dataset for use with PyTorch.

In [12]:
# Set format for PyTorch
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Now train_data and test_data are ready to be used with Trainer.

First, 
Set up the model and specify training arguments.

In [13]:
# Step 7: Fine-tune the model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=5)  # For the five languages

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define Training Arguments

In [24]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="no",  # Optional: turn off intermediate evaluations
    learning_rate=5e-5,  # Increased slightly for faster convergence
    per_device_train_batch_size=32,  # Larger batch size
    per_device_eval_batch_size=32,   # Match eval batch size to train batch size
    num_train_epochs=0.5,  # Fewer epochs
    weight_decay=0.01,
    fp16=True,  # Enable mixed-precision training if supported
)



Create the Trainer:

Initialize the Trainer object.

In [25]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Train and Evaluate Model: 

Finally, Train the model and evaluate its performance.

In [23]:
# Train the model
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

Evaluate the Model

In [None]:
# Evaluate the model on the test set
trainer.evaluate()

Test with New Sentences:

In [None]:
# Step 8: Test the model on new sentences after training
def predict_language(sentences):
    for sentence in sentences:
        prediction = trainer.predict([sentence])
        label = prediction.predictions.argmax()  # Get label with the highest probability
        language = label_to_language[f'LABEL_{label}']
        print(f"Sentence: {sentence}\nPredicted Language: {language}\n")

# Testing new sentences with the fine-tuned model
predict_language(example_sentences)

Save the Model:

In [None]:
model.save_pretrained()