In [1]:
pip install transformers torch datasets accelerate scikit-learn torchvision torchaudio tensorboard pandas 

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Enable GPU optimizations
torch.backends.cudnn.benchmark = True

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the mBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Define a custom dataset class
class ThirukkuralDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        instruction = item["instruction"]
        response = item["response"]

        # Combine instruction and response for the model
        combined_input = f"Instruction: {instruction} Response: {response}"

        # Tokenize the combined input
        inputs = self.tokenizer(
            combined_input,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Return input IDs and attention mask
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": inputs["input_ids"].squeeze(),  # Labels are the same as input for masked LM
        }


# Load the dataset
with open("/teamspace/studios/this_studio/dataset/thirukkural.json", "r") as file:
    data = json.load(file)

# Define hyperparameters
MAX_LENGTH = 128  # Adjusted sequence length for the dataset
BATCH_SIZE = 1
EPOCHS = 3
LEARNING_RATE = 5e-5

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2)

# Create datasets
train_dataset = ThirukkuralDataset(data, tokenizer, max_length=MAX_LENGTH)
val_dataset = ThirukkuralDataset(val_data, tokenizer, max_length=MAX_LENGTH)

# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Load the mBERT model for masked language modeling
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")
model.to(device)

# Define a function to compute evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the predicted token IDs

    # Flatten the predictions and labels, ignoring padding tokens (-100)
    preds_flat = preds[labels != -100]
    labels_flat = labels[labels != -100]

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(labels_flat, preds_flat, average="weighted")
    acc = accuracy_score(labels_flat, preds_flat)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./tamil_mbert_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_steps=500,
    evaluation_strategy="steps",  # Evaluate every few steps
    eval_steps=100,  # Evaluate every 100 steps
    eval_accumulation_steps=100,  # Process validation data in smaller chunks
    gradient_accumulation_steps=8,  # Simulate larger batch size
    learning_rate=LEARNING_RATE,
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable mixed precision
    load_best_model_at_end=True,  # Load the best model based on evaluation
    metric_for_best_model="accuracy",  # Use accuracy to select the best model
    greater_is_better=True,  # Higher accuracy is better
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,10.5156,1.257304,0.715548,0.706592,0.728279,0.715548
200,9.7982,1.152757,0.735868,0.727464,0.743693,0.735868
300,9.381,1.106688,0.74004,0.731312,0.748397,0.74004
400,9.9696,0.990594,0.764145,0.757738,0.771333,0.764145


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Could not locate the best model at ./tamil_mbert_finetuned/checkpoint-400/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=498, training_loss=10.054775253357178, metrics={'train_runtime': 235.8885, 'train_samples_per_second': 16.915, 'train_steps_per_second': 2.111, 'total_flos': 261633949516800.0, 'train_loss': 10.054775253357178, 'epoch': 2.9864661654135336})

In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("./tamil_mbert_finetuned/checkpoint-498")

# Load the model saved in SafeTensors format
model = BertForMaskedLM.from_pretrained("./tamil_mbert_finetuned/checkpoint-498")
model.to(device)

# Define the maximum sequence length
MAX_LENGTH = 128

# Function to generate a response
def generate_response(instruction, input_text):
    model.eval()
    combined_input = f"Instruction: {instruction} Input: {input_text}"
    inputs = tokenizer(
        combined_input,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=50,  # Limit the number of tokens generated
            num_beams=5,  # Use beam search for better results
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
instruction = "Explain the meaning of the following Thirukkural."
input_text = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு."
response = generate_response(instruction, input_text)
print("Response:", response)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Response: Instruction : Explain the meaning of the following Thirukkural. Input : அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு. following : The world is the world of the world ; wiseness is the wiseness ; the world is the world of the world ; the world is the world itself ; the world is the world of the world ; the world is the world itself '


In [3]:
def prepare_input(example):
    instruction = example["instruction"]
    response = example["response"]
    combined_input = f"Instruction: {instruction} Response: {response}"
    
    # Tokenize the input
    inputs = tokenizer(
        combined_input,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    return inputs
    
def test_model(example):
    inputs = prepare_input(example)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)  # Get the predicted token IDs

    # Decode the predictions
    predicted_text = tokenizer.decode(predictions[0], skip_special_tokens=True)
    return predicted_text

In [12]:
from sklearn.metrics import accuracy_score
import json
from sklearn.model_selection import train_test_split

# Load the dataset
with open("/teamspace/studios/this_studio/dataset/thirukkural.json", "r") as file:
    data = json.load(file)
train_data, val_data = train_test_split(data, test_size=0.2)
print(len(val_data))
val = val_data[1]
# Evaluate the model on the test dataset
for example in val_data(1):
    predicted_response = test_model(example)
    print(f"Instruction: {example['instruction']}")
    print(f"Expected Response: {example['response']}")
    print(f"Predicted Response: {predicted_response}")
    print("-" * 50)

266


TypeError: 'list' object is not callable

In [13]:
example = val_data[0]  # Access the first example
predicted_response = test_model(example)  # Replace `test_model` with the actual function
print(f"Instruction: {example['instruction']}")
print(f"Expected Response: {example['response']}")
print(f"Predicted Response: {predicted_response}")
print("-" * 50)

Instruction: Provide all details about Kural number 544.
Expected Response: {'Number': '544', 'kural': 'குடிதழீஇக் கோலோச்சும் மாநில மன்னன் அடிதழீஇ நிற்கும் உலகு.', 'mk': 'குடிமக்களை அரவணைத்து ஆட்சி நடத்தும் நல்லரசின் அடிச்சுவட்டை நானிலமே போற்றி நிற்கும்', 'explanation': 'The world will constantly embrace the feet of the great king who rules over his subjects with love', 'adikaram_name': 'செங்கோன்மை', 'iyal_name': 'அரசியல்', 'paul_translation': 'Wealth'}
Predicted Response: , Instruction : Provide all details about Kural number 544. Response : {'Number':'544 ','kural':'குடிதழீஇக் கோலோச்சும் மாநில மன்னன் அடிதழீஇ நிற்கும் உலகு. ','mk':'குடிமக்களை அரவணைத்து ஆட்சி நடத்தும் நல்லரசின் அடிச்சுவட்டை நானிலமே போற்றி நிற்கும் ','explanation':'The world will constantly embrace the feet of.
--------------------------------------------------


In [11]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m180.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, lxml, colorama, sa

In [13]:
import sacrebleu

# Prepare references and predictions
references = [str(example["response"]) for example in val_data]
predictions = [test_model(example) for example in val_data]

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU Score: {bleu.score}")

BLEU Score: 39.48209712011064


In [15]:
# Load the base model
base_model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

# Test the base model
for example in val_data:
    predicted_response = test_model(example)
    print(f"Base Model Prediction: {predicted_response}")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Base Model Prediction: , Instruction : Provide all details about Kural number 894. Response : {'Number':'894 ','kural':'கூற்றத்தைக் கையால் விளித்தற்றால் ஆற்றுவார்க்கு ஆற்றாதார் இன்னா செயல். ','mk':'எந்தத் துன்பத்தையும் தாங்கக் கூடிய ஆற்றல் படைத்தவர்களுடன், சிறு துன்பத்தையும் தாங்க முடியாதவர்கள் மோதினால் அவர்களே தங்களின் முடிவுகாலத்தைக்.
Base Model Prediction: , Instruction : Provide all details about Kural number 116. Response : {'Number':'116 ','kural':'கெடுவல்யான் என்பது அறிகதன் நெஞ்சம் நடுவொரீஇ அல்ல செயின். ','mk':'நடுவுநிலைமை தவறிச் செயல்படலாம் என்று ஒரு நினைப்பு ஒருவனுக்கு வந்து விடுமானால் அவன் கெட்டொழியப் போகிறான் என்று அவனுக்கே தெரியவேண்டும் ',.
Base Model Prediction: , Instruction : Provide all details about Kural number 527. Response : {'Number':'527 ','kural':'காக்கை கரவா கரைந்துண்ணும் ஆக்கமும் அன்னநீ ரார்க்கே உள. ','mk':'தனக்குக் கிடைத்ததை மறைக்காமல் தனது சுற்றத்தைக் கூவி அழைத்துக் காக்கை உண்ணும் அந்தக் குணம் உடையவர்களுக்கு மட்டுமே உலகில் உயர்வு உண்டு ','explanation':'The.
B

In [15]:
# Load the base model
base_model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

# Test the base model
# for example in val_data:
    # predicted_response = test_model(example)
    # print(f"Base Model Prediction: {predicted_response}")

examp = val_data[0]
predicted_response = test_model(example)
print(predicted_response)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


, Instruction : Provide all details about Kural number 722. Response : {'Number':'722 ','kural':'கற்றாருள் கற்றார் எனப்படுவர் கற்றார்முன் கற்ற செலச்சொல்லு வார். ','mk':'கற்றவரின் முன் தாம் கற்றவற்றை அவருடைய மனத்தில் பதியுமாறு சொல்ல வல்லவர், கற்றவர் எல்லாரினும் மேலானவராக மதித்துச் சொல்லப்படுவார் ','explanation':.


In [16]:
# Simplified input
simplified_input = "Provide details about Kural number 1."
inputs = tokenizer(
    simplified_input,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

# Pass the input to the model
inputs = inputs.to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Decode the output
predicted_token_ids = outputs.logits.argmax(dim=-1)
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
print("Predicted Text:", predicted_text)

Predicted Text: . Provide details about Kural number 1. Pro Pro Pro Pro Pro Pro Pro Provide about : :al number.. Response. Pro Pro Pro Pro Pro Pro Provide all about :al number 1. Pro Pro Pro Pro Pro Pro Pro Provide all about : about numberal number.. Pro Pro Pro Pro Pro Pro Provide all about :al number number. Pro Pro Pro Pro Pro Pro Pro Provide allvide all about Kural number 1. Pro Pro Pro Pro Pro Pro Provide all about aboutal number Number.. Pro. Pro Pro Pro Pro Pro Pro Pro Pro Pro about Kural number 1. Pro


In [2]:
from transformers import pipeline
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
ft_tokenizer = BertTokenizer.from_pretrained("./tamil_mbert_finetuned/checkpoint-498")

# Load the model saved in SafeTensors format
ft_model = BertForMaskedLM.from_pretrained("./tamil_mbert_finetuned/checkpoint-498")
ft_model.to(device)

# Load the QA pipeline
qa_pipeline = pipeline("question-answering", model=ft_model, tokenizer=ft_tokenizer)

# Example context and question
context = "அன்பும் அறனும் உடைத்தாயின் இல்வாழ்க்கை பண்பும் பயனும் அது"
question = "What is the benefit of family life?"

# Get answer
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")


Device set to use cpu
The model 'BertForMaskedLM' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionA

KeyError: 'start_logits'