In [4]:
import torch
from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Enable GPU optimizations
torch.backends.cudnn.benchmark = True

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the mBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Define a custom dataset class
class ThirukkuralDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        instruction = item["instruction"]
        input_text = item["input"]
        output_text = item["output"]

        # Combine instruction and input for the model
        combined_input = f"Instruction: {instruction} Input: {input_text} Output: {output_text}"

        # Tokenize the combined input
        inputs = self.tokenizer(
            combined_input,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Return input IDs and attention mask
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": inputs["input_ids"].squeeze(),  # Labels are the same as input for masked LM
        }


# Load the dataset
with open("/teamspace/studios/this_studio/instruction_based_dataset.json", "r") as file:
    data = json.load(file)

# Define hyperparameters
MAX_LENGTH = 64  # Reduced sequence length for faster processing
BATCH_SIZE = 1
EPOCHS = 3
LEARNING_RATE = 4e-5

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2)

# Create datasets
train_dataset = ThirukkuralDataset(train_data, tokenizer, max_length=MAX_LENGTH)
val_dataset = ThirukkuralDataset(val_data, tokenizer, max_length=MAX_LENGTH)

# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Load the mBERT model for masked language modeling
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased", ignore_mismatched_sizes=True)
model.to(device)

# Define a function to compute evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Flatten the predictions and labels, ignoring padding tokens (-100)
    preds_flat = preds[labels != -100]
    labels_flat = labels[labels != -100]

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_flat, preds_flat, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels_flat, preds_flat)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./tamil_llm_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_steps=500,
    evaluation_strategy="steps",  # Evaluate every few steps
    eval_steps=100,  # Evaluate every 100 steps
    eval_accumulation_steps=50,  # Process validation data in smaller chunks
    gradient_accumulation_steps=8,  # Simulate larger batch size
    learning_rate=LEARNING_RATE,
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Enable mixed precision
    load_best_model_at_end=True,  # Load the best model based on evaluation
    metric_for_best_model="accuracy",  # Use accuracy to select the best model
    greater_is_better=True,  # Higher accuracy is better
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_mbert")
tokenizer.save_pretrained("./fine_tuned_mbert")

# Example inference
def generate_response(instruction, input_text):
    model.eval()
    combined_input = f"Instruction: {instruction} Input: {input_text}"
    inputs = tokenizer(
        combined_input,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=50,  # Limit the number of tokens generated
            num_beams=5,  # Use beam search for better results
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Test the model
instruction = "Explain the meaning of the following Thirukkural."
input_text = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு."
response = generate_response(instruction, input_text)
print("Response:", response)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,14.9065,1.439189,0.684294,0.674715,0.69404,0.684294
200,11.6159,1.340123,0.702872,0.690627,0.714574,0.702872
300,10.9638,1.249864,0.714481,0.707905,0.733717,0.714481
400,10.6025,1.242978,0.72574,0.715354,0.73696,0.72574
500,9.7712,,0.7338,0.726148,0.744741,0.7338
600,9.5713,1.155597,0.740272,0.728673,0.742143,0.740272
700,9.2748,1.123914,0.745514,0.737189,0.754509,0.745514
800,8.9617,1.080235,0.746746,0.738941,0.753228,0.746746
900,8.9632,1.016715,0.76262,0.755073,0.766221,0.76262
1000,8.4457,1.006001,0.767362,0.758258,0.7658,0.767362


Could not locate the best model at ./tamil_llm_finetuned/checkpoint-1400/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Response: Instruction : Explain the meaning of the following Thirukkural. Input : அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.irukkural. Output : Translation :'Translation :'Translation :'Translation Thirukkural. Translation Name :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation : '


In [6]:
# "010225/tamil_llm_finetuned/checkpoint-1596"

from safetensors.torch import load_file

# Load the safetensors file
file_path = "010225/tamil_llm_finetuned/checkpoint-1596/model.safetensors"
state_dict = load_file(file_path)

# Inspect state dict keys
print(state_dict.keys())

# Use the state_dict to initialize a model
from transformers import BertModel, BertConfig

config_path = "010225/tamil_llm_finetuned/checkpoint-1596/config.json"
config = BertConfig.from_pretrained(config_path)
model = BertModel(config)
model.load_state_dict(state_dict)

FileNotFoundError: No such file or directory: "010225/tamil_llm_finetuned/checkpoint-1596/model.safetensors"

In [7]:
from transformers import AutoModel

# Resave model in safetensors format
model = AutoModel.from_pretrained("010225/tamil_llm_finetuned/checkpoint-1596/model.safetensors")
model.save_pretrained("010225/tamil_llm_finetuned", safe_serialization=True)


OSError: Incorrect path_or_model_id: '010225/tamil_llm_finetuned/checkpoint-1596/model.safetensors'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [1]:
model=
def generate_response(instruction, input_text):
    model.eval()
    combined_input = f"Instruction: {instruction} Input: {input_text}"
    inputs = tokenizer(
        combined_input,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=50,  # Limit the number of tokens generated
            num_beams=5,  # Use beam search for better results
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Test the model
instruction = "Explain the meaning of the following Thirukkural."
input_text = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு."
response = generate_response(instruction, input_text)
print("Response:", response)

NameError: name 'model' is not defined

In [11]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("./tamil_llm_finetuned/checkpoint-1596")

# Load the model saved in SafeTensors format
model = BertForMaskedLM.from_pretrained("./tamil_llm_finetuned/checkpoint-1596")
model.to(device)

# Define the maximum sequence length
MAX_LENGTH = 64

# Function to generate a response
def generate_response(instruction, input_text):
    model.eval()
    combined_input = f"Instruction: {instruction} Input: {input_text}"
    inputs = tokenizer(
        combined_input,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=50,  # Limit the number of tokens generated
            num_beams=5,  # Use beam search for better results
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
instruction = "Explain the meaning of the following Thirukkural."
input_text = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு."
response = generate_response(instruction, input_text)
print("Response:", response)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Response: Instruction : Explain the meaning of the following Thirukkural. Input : அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு.irukkural. Output : Translation :'Translation :'Translation :'Translation Thirukkural. Translation Name :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation :'Translation : '


In [12]:
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

bert_tokenizer = AutoTokenizer.from_pretrained("bikram22pi7/distilbert-base-multilingual-cased-on-custom-kural-500")
bert_model = AutoModelForSequenceClassification.from_pretrained("bikram22pi7/distilbert-base-multilingual-cased-on-custom-kural-500")

In [4]:
bert_model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model and tokenizer
model_name = "bikram22pi7/distilbert-base-multilingual-cased-on-custom-kural-500"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Input a Thirukkural verse
verse = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு"

# Tokenize and predict
inputs = tokenizer(verse, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)

# Get predicted class
predicted_class = torch.argmax(outputs.logits).item()
categories = ["Virtue", "Wealth", "Love"]
print(f"Predicted Category: {categories[predicted_class]}")


Predicted Category: Virtue


In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the model for feature extraction
model = AutoModel.from_pretrained("bikram22pi7/distilbert-base-multilingual-cased-on-custom-kural-500")
tokenizer = AutoTokenizer.from_pretrained("bikram22pi7/distilbert-base-multilingual-cased-on-custom-kural-500")

# Encode verses and query
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

query = "Honesty is important"
query_embedding = get_embeddings(query)

# Example Thirukkural verses
verses = [
    "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு",
    "அன்பும் அறனும் உடைத்தாயின் இல்வாழ்க்கை பண்பும் பயனும் அது",
    "கல்லாதான் கண்இல்லான் ஆகும் அறிவிலான் அல்லார் மனைவிழியூஉம்"
]
verse_embeddings = [get_embeddings(verse) for verse in verses]

# Compute similarities
similarities = [cosine_similarity(query_embedding, emb)[0][0] for emb in verse_embeddings]
most_similar = verses[similarities.index(max(similarities))]
print(f"Query: {query}")
print(f"Most Similar Verse: {most_similar}")


Query: Honesty is important
Most Similar Verse: அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு


In [11]:
verse1 = "அகர முதல எழுத்தெல்லாம் ஆதி பகவன் முதற்றே உலகு"
verse2 = "அன்பும் அறனும் உடைத்தாயின் இல்வாழ்க்கை பண்பும் பயனும் அது"

embedding1 = get_embeddings(verse1)
embedding2 = get_embeddings(verse2)

similarity = cosine_similarity(embedding1, embedding2)[0][0]
print(f"Similarity Score: {similarity}")


Similarity Score: 0.7140578031539917


In [12]:
from transformers import pipeline

# Load the QA pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Example context and question
context = "அன்பும் அறனும் உடைத்தாயின் இல்வாழ்க்கை பண்பும் பயனும் அது"
question = "What is the benefit of family life?"

# Get answer
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")


Device set to use cpu
The model 'DistilBertModel' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionA

KeyError: 'start_logits'