In [14]:
!pip install transformers datasets tokenizers accelerate



In [15]:
from datasets import load_dataset, Dataset
from transformers import (
    BertConfig, BertTokenizerFast, BertForSequenceClassification,
    TrainingArguments, Trainer
)
from tokenizers import ByteLevelBPETokenizer
import os

In [16]:
# Hub id of the language-identification corpus used throughout the notebook.
DATASET_ID = "papluca/language-identification"

raw_ds = load_dataset(DATASET_ID)
raw_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [17]:
TARGET_LANGS = ["en", "fr", "es", "de", "hi", "ar"]

# Class id of a language is its position in TARGET_LANGS.
lang_to_id = {code: idx for idx, code in enumerate(TARGET_LANGS)}

# Keep only the training rows whose label is one of the target languages.
kept_rows = [
    (row["text"], lang_to_id[row["labels"]])
    for row in raw_ds["train"]
    if row["labels"] in lang_to_id
]
filtered_texts = [text for text, _ in kept_rows]
filtered_labels = [label_id for _, label_id in kept_rows]

len(filtered_texts)

21000

In [18]:
# -------------------------------------------------------------
# EXTRA: Add transliterated Hindi examples (Latin-script Hindi)
# -------------------------------------------------------------
extra_hindi_texts = [
    "Kutta sundar hai",
    "Mera naam Irfan hai",
    "Tum kaise ho",
    "Aaj mausam accha hai",
    "Bahut acha lagta hai",
    "Mujhe nahin pata",
    "Kya kar rahe ho",
    "Main theek hoon",
    "Yeh bahut important hai",
    "Mujhe Hindi bahut pasand hai",
    "Tumhara ghar kahan hai",
    "Kal milte hain",
    "Aaj khaane mein kya hai",
    "Mujhe pani chahiye",
    "Yeh kutta bada sundar hai",
]

# -------------------------------------------------------------
# EXTRA: Add transliterated Arabic examples (Arabizi / Franco-Arabic)
# -------------------------------------------------------------
extra_arabic_texts = [
    "el kalb gameel",
    "ana bahibbak",
    "enta fein",
    "aljaw helw elnaharda",
    "ana taaban geddan",
    "mish fahim",
    "mashy tamam",
    "al ayam di saaba shwaya",
    "inta betaamel eh",
    "mish aaref",
    "ana jaii delwa2ti",
    "mafeesh moshkela khalis",
    "kwayyes awi",
    "el makan da gamed",
    "el wad3 mashy tamam",
]

# Append the hand-written examples (Hindi first, then Arabic) with their class ids.
# NOTE: this extends filtered_texts / filtered_labels in place, so running the
# cell a second time appends the same examples again.
for lang_code, extra_texts in (("hi", extra_hindi_texts), ("ar", extra_arabic_texts)):
    lang_id = TARGET_LANGS.index(lang_code)
    filtered_texts.extend(extra_texts)
    filtered_labels.extend([lang_id] * len(extra_texts))


In [19]:
from datasets import ClassLabel, Features

# Texts and labels are built in parallel lists; they must stay aligned.
assert len(filtered_texts) == len(filtered_labels), "texts/labels length mismatch"

# Convert labels to ClassLabel type (required for stratify_by_column below)
num_classes = len(TARGET_LANGS)
features = Features({
    'text': raw_ds['train'].features['text'],
    'labels': ClassLabel(num_classes=num_classes, names=TARGET_LANGS),
})

dataset = Dataset.from_dict({"text": filtered_texts, "labels": filtered_labels}, features=features)

# Fixed seed so the stratified 80/20 split (and every metric computed on it)
# is the same on each Restart & Run All.
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="labels", seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16824
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 4206
    })
})

In [20]:
# BertTokenizerFast is already imported in the imports cell at the top of the notebook.
# Only the tokenizer (vocabulary) is loaded from the pretrained multilingual checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    "bert-base-multilingual-cased"
)

In [21]:
def tokenize_batch(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, 64 tokens so all rows end
    up with the same length.

    Args:
        batch: Mapping with a "text" key, as passed by `Dataset.map(batched=True)`.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=64)
    return tokenizer(batch["text"], **encode_options)

# Run tokenize_batch over both splits in batches; the tokenizer outputs are added
# as new columns next to the existing 'text' and 'labels' columns.
tokenized_ds = dataset.map(tokenize_batch, batched=True)
tokenized_ds

Map:   0%|          | 0/16824 [00:00<?, ? examples/s]

Map:   0%|          | 0/4206 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16824
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4206
    })
})

In [22]:
# Small BERT classifier built from a config (no pretrained weights are loaded).
config = BertConfig(
    # --- interface with the data ---
    num_labels=len(TARGET_LANGS),        # one output class per target language
    vocab_size=tokenizer.vocab_size,     # embedding table sized to the tokenizer
    type_vocab_size=1,
    max_position_embeddings=128,
    # --- encoder size ---
    hidden_size=128,
    num_attention_heads=4,
    num_hidden_layers=4,
    intermediate_size=256,
)

model = BertForSequenceClassification(config)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 128, padding_idx=0)
      (position_embeddings): Embedding(128, 128)
      (token_type_embeddings): Embedding(1, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-

In [23]:
# Return PyTorch tensors and expose only the columns listed here.
model_input_columns = ["input_ids", "attention_mask", "labels"]
tokenized_ds.set_format(type="torch", columns=model_input_columns)

In [24]:
# Evaluate and checkpoint once per epoch so the best checkpoint can be
# restored when training finishes (load_best_model_at_end).
training_args = TrainingArguments(
    output_dir="./bert_lang_detect",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=3e-4,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    # Disable external experiment trackers (e.g. W&B) explicitly instead of relying
    # on the deprecated WANDB_DISABLED environment variable being set beforehand.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
# The held-out "test" split is used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    # NOTE(review): needs a transformers release that has `processing_class` — confirm
    # against the installed version.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [26]:
!pip uninstall -y wandb
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

[0m

Epoch,Training Loss,Validation Loss
1,0.0139,0.028741
2,0.0057,0.0274
3,0.0013,0.017661
4,0.0009,0.018756
5,0.0008,0.018934


TrainOutput(global_step=2630, training_loss=0.06341144352046482, metrics={'train_runtime': 98.9897, 'train_samples_per_second': 849.785, 'train_steps_per_second': 26.568, 'total_flos': 17684161320960.0, 'train_loss': 0.06341144352046482, 'epoch': 5.0})

In [27]:
# Evaluate on the eval_dataset handed to the Trainer (the "test" split).
# Training ran with load_best_model_at_end=True, so this scores the restored best
# checkpoint rather than the weights from the final epoch.
results = trainer.evaluate()
results

{'eval_loss': 0.017661238089203835,
 'eval_runtime': 2.9937,
 'eval_samples_per_second': 1404.966,
 'eval_steps_per_second': 44.093,
 'epoch': 5.0}

In [29]:
def predict_lang(text, max_length=64):
    """Predict the language of a single text with the notebook-level model.

    Args:
        text: The string to classify (one text, not a batch).
        max_length: Maximum number of tokens kept after truncation. Defaults to
            64, the length used by `tokenize_batch` for training. It must not
            exceed the model's `max_position_embeddings` (128); without an
            explicit limit the tokenizer's own, larger limit applies and long
            inputs would index past the position-embedding table.

    Returns:
        The entry of TARGET_LANGS with the highest logit (e.g. "de").
    """
    import torch  # local import: torch is not imported at notebook level

    tokens = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Move tokens to the same device as the model
    device = model.device
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)

    pred = outputs.logits.argmax(dim=-1).item()
    return TARGET_LANGS[pred]


# Smoke-test sentences; the trailing comment is the expected prediction.
demo_sentences = [
    "Der Hund ist schön",            # → de
    "dubee ek mahaan shahar hai",    # → hi
    "الطقس جميل اليوم",              # → ar
    "Ismi Irfan",                    # → ar
    "I love transformers",           # → en
    "मेरा निवास दुबई में है",              # → hi
]

for sentence in demo_sentences:
    print(predict_lang(sentence))

de
hi
ar
ar
en
hi
