# Multilingual Text Classifier Using Large Language Models (LLMs)

In [6]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../sentences.csv", sep="\t", header=None, names=["id", "lang", "text"])

# Filter dataset for a few languages (e.g., English, Spanish, French, German, Italian)
languages = ["eng", "spa", "fra", "deu", "ita"]
df_filtered = df[df['lang'].isin(languages)]

# Select a sample for quick prototyping
df_sample = df_filtered.sample(1000)
print(df_sample.head())

              id lang                                               text
6290525  6675225  deu                                 Tom bekommt keine.
7977017  8398033  fra  Un jour de fête, comme il avait préparé chez l...
9229231  9678684  deu   Tom schickt mir jedes Jahr eine Weihnachtskarte.
15241      15970  eng              Can you break away from your parents?
7957564  8378247  eng   I hope that Tom is ready to go when I get there.


In [1]:
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [7]:
texts = df_sample['text'].tolist()
labels = df_sample['lang'].tolist()

# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [8]:
classifier = pipeline("text-classification", model="xlm-roberta-base")

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [9]:
predictions = classifier(test_texts)

# Extract predicted labels
predicted_labels = [prediction['label'] for prediction in predictions]

Model Evaluation

In [10]:
print("Accuracy:", accuracy_score(test_labels, predicted_labels))
print("Classification Report:\n", classification_report(test_labels, predicted_labels))

Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

     LABEL_0       0.00      0.00      0.00       0.0
     LABEL_1       0.00      0.00      0.00       0.0
         deu       0.00      0.00      0.00      30.0
         eng       0.00      0.00      0.00      76.0
         fra       0.00      0.00      0.00      40.0
         ita       0.00      0.00      0.00      37.0
         spa       0.00      0.00      0.00      17.0

    accuracy                           0.00     200.0
   macro avg       0.00      0.00      0.00     200.0
weighted avg       0.00      0.00      0.00     200.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from transformers import pipeline

# Load the model
classifier = pipeline("text-classification", model="xlm-roberta-base")

# Define the mapping from label to actual language
label_to_language = {
    'LABEL_0': 'English',
    'LABEL_1': 'Spanish',
    'LABEL_2': 'French',
    'LABEL_3': 'German',
    'LABEL_4': 'Italian'
}

# Example sentences to test the classifier
sentences = [
    "This is a test sentence in English.",
    "Esta es una frase de prueba en español.",
    "C'est une phrase de test en français.",
    "Dies ist ein Testsatz auf Deutsch.",
    "Questa è una frase di prova in italiano."
]

# Predict the language and map the label to actual language name
for sentence in sentences:
    prediction = classifier(sentence)
    label = prediction[0]['label']
    language = label_to_language[label]
    
    print(f"Sentence: {sentence}")
    print(f"Predicted Language: {language}\n")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentence: This is a test sentence in English.
Predicted Language: Spanish

Sentence: Esta es una frase de prueba en español.
Predicted Language: Spanish

Sentence: C'est une phrase de test en français.
Predicted Language: Spanish

Sentence: Dies ist ein Testsatz auf Deutsch.
Predicted Language: Spanish

Sentence: Questa è una frase di prova in italiano.
Predicted Language: Spanish



## Fine-tuning

In [18]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Loading PAWS-X for English, Spanish and Yoruba
# Load PAWS-X for English and Spanish
dataset_en = load_dataset('xtreme', 'PAWS-X.en')
dataset_es = load_dataset('xtreme', 'PAWS-X.es')

# Load Tatoeba for Yoruba
dataset_yo = load_dataset('tatoeba', 'tatoeba.yo', trust_remote_code=True)


# Split dataset into train and test
train_data, test_data = train_test_split(dataset['train'], test_size=0.2)

# Load the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch['sentence'], padding=True, truncation=True)

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

# Set format for PyTorch
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

train-00000-of-00001.parquet:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/296k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/298k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49175 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.57M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/326k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/332k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1961 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

NameError: name 'dataset' is not defined