In [75]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer

In [47]:
# Load dataset
data_path = "C:\\Users\\Tejaswa Singh\\Desktop\\QA_Punjabi.csv"
df = pd.read_csv(data_path)
dataset = Dataset.from_pandas(df)

In [48]:
# Load tokenizer
model_name = "bert-base-uncased"  # Replace with your model name if needed
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [76]:
df

Unnamed: 0,context,question,answers
0,??????? ??? ???? ??? ???????? ????? ??? ???? ?...,??????? ?? ???? ??? ?? ???? ?? ???,??????? ??? ???? ??? ???????? ????? ??? ???? ?...
1,??????? ? ?? ??? ??? ???? ?????? ?? ????? ?? ?...,??????? ? ?? ??? ??? ????? ???? ????? ?? ??? ?...,??????? ? ?? ??? ??? ???? ?????? ?? ????? ?? ?...
2,???????? ???? ?? ???? ???? ???? ?????? ?? ??? ...,???????? ??? ?? ?????? ?? ???? ?? ??? ?? ?????...,???????? ???? ?? ???? ???? ???? ?????? ?? ??? ...
3,???-??? ??? ?? ???? ?? ??????? ??? ???-??? ???...,???? ???-??? ??? ?? ???? ??? ??????? ??? ???-?...,???-??? ??? ?? ???? ?? ??????? ??? ???-??? ???...
4,"?????, ?????????, ??? ???? ??????? ?? ????? ??...",??????? ???? ????? ?????? ?? ???? ??? ???? ?? ...,"?????, ?????????, ??? ???? ??????? ?? ????? ??..."
...,...,...,...
95,???? ???? ??????? ?? ????? ??? ????? ?? ???? ?...,???? ???? ??????? ?? ????? ?? ????? ????? ??? ...,???? ???? ??????? ?? ????? ??? ????? ?? ???? ?...
96,???????? ???? ????? ????? ??? ???? ???? ????? ...,???????? ???? ????? ????? ?? ????? ???? ????? ...,???????? ???? ????? ????? ??? ???? ???? ????? ...
97,????? ???? ?????? ?? ????? ??? ?????? ??? ?? ?...,????? ???? ?????? ?? ????? ??? ?????? ??? ?? ?...,????? ???? ?????? ?? ????? ??? ?????? ??? ?? ?...
98,????? ???? ????? ????? ?? ???????? ???? ????? ...,????? ???? ????? ?? ???????? ???? ????? ????? ...,????? ???? ????? ????? ?? ???????? ???? ????? ...


In [50]:
def tokenize_data(examples):
    # Tokenize the input question and context
    tokenized_inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_first",
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    # Initialize lists for start and end positions
    start_positions = []
    end_positions = []

    for i in range(len(examples['context'])):
        context = examples['context'][i]
        start_pos_list = []
        end_pos_list = []
        
        for answer in examples['answers'][i]:
            start_pos = context.find(answer)
            if start_pos != -1:
                end_pos = start_pos + len(answer)
                # Tokenize the answer span
                start_token = tokenizer.encode(context[:start_pos], add_special_tokens=False)
                end_token = tokenizer.encode(context[:end_pos], add_special_tokens=False)
                
                start_pos_list.append(len(start_token))
                end_pos_list.append(len(end_token) - 1)
            else:
                start_pos_list.append(0)
                end_pos_list.append(0)

        # Assign the list of start and end positions to each example
        start_positions.append(start_pos_list[0] if start_pos_list else 0)
        end_positions.append(end_pos_list[0] if end_pos_list else 0)

    # Add token start and end positions to tokenized inputs
    tokenized_inputs['start_positions'] = start_positions
    tokenized_inputs['end_positions'] = end_positions
    
    return tokenized_inputs

# Apply the tokenization
tokenized_dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [54]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Skip evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)


In [55]:
# Initialize Trainer without eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [56]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=39, training_loss=0.09421379749591534, metrics={'train_runtime': 959.0515, 'train_samples_per_second': 0.313, 'train_steps_per_second': 0.041, 'total_flos': 78389027020800.0, 'train_loss': 0.09421379749591534, 'epoch': 3.0})

In [57]:
model_save_path = "C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased\\tokenizer_config.json',
 'C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased\\special_tokens_map.json',
 'C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased\\vocab.txt',
 'C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased\\added_tokens.json',
 'C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased\\tokenizer.json')

In [58]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering
import torch

# Define paths
model_save_path = "C:\\Users\\Tejaswa Singh\\Desktop\\fine-tuned-bert-base-uncased"
test_data_path = "C:\\Users\\Tejaswa Singh\\Desktop\\QA_Punjabi_test.csv"  # Path to your test data

In [59]:
# Load the model and tokenizer
model = BertForQuestionAnswering.from_pretrained(model_save_path)
tokenizer = BertTokenizerFast.from_pretrained(model_save_path)

In [73]:
def answer_question(context, question, model, tokenizer):
    if not question.strip():
        return "ਕਿਰਪਾ ਕਰਕੇ ਸਵਾਲ ਦਾਖਲ ਕਰੋ।"  # "Please enter a question."

    # Tokenize the inputs
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )

    # Get the model's prediction
    with torch.no_grad():
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        start_idx = torch.argmax(start_logits)
        end_idx = torch.argmax(end_logits) + 1

        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_idx:end_idx])
        )

    # Handle cases where the model doesn't generate a valid answer
    if answer.strip() == "[CLS]" or not answer.strip():
        return "ਮੁਆਫ਼ ਕਰਨਾ, ਮੈਨੂੰ ਇਸਦਾ ਜਵਾਬ ਨਹੀਂ ਮਿਲਿਆ।"  # "Sorry, I couldn't find an answer."

    return answer


In [74]:
# Example inputs
context = "ਕੱਛੇ ਗੰਦੇ ਕੰਬਲ ਨੂੰ ਸਾਫ਼ ਕਰਨ ਲਈ ਬ੍ਰਿਸਟਲ ਬਰਸ਼ ਦੀ ਵਰਤੋਂ ਕਰਨ ਦੀ ਸਿਫਾਰਸ਼ ਕੀਤੀ ਜਾਂਦੀ ਹੈ। ਇਸ ਨਾਲ ਕੰਬਲ ਦੀ ਸਫਾਈ ਅਤੇ ਲੰਬਾਈ ਨੂੰ ਬਣਾਈ ਰੱਖਣ ਵਿੱਚ ਮਦਦ ਮਿਲਦੀ ਹੈ।"
question = "ਕੱਛੇ ਗੰਦੇ ਕੰਬਲ ਨੂੰ ਕਿਵੇਂ ਸਾਫ਼ ਕੀਤਾ ਜਾ ਸਕਦਾ ਹੈ?"

# Get the answer
predicted_answer = answer_question(context, question, model, tokenizer)

# Display the result
print(f"Question: {question}")
print(f"Answer: {predicted_answer}")

Question: ਕੱਛੇ ਗੰਦੇ ਕੰਬਲ ਨੂੰ ਕਿਵੇਂ ਸਾਫ਼ ਕੀਤਾ ਜਾ ਸਕਦਾ ਹੈ?
Answer: ਮੁਆਫ਼ ਕਰਨਾ, ਮੈਨੂੰ ਇਸਦਾ ਜਵਾਬ ਨਹੀਂ ਮਿਲਿਆ।
