In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file
data = pd.read_csv("./QA_Punjabi.csv")

# Inspect the data
print(data.head())

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)


                                             context  \
0  ??????? ??? ???? ??? ???????? ????? ??? ???? ?...   
1  ??????? ? ?? ??? ??? ???? ?????? ?? ????? ?? ?...   
2  ???????? ???? ?? ???? ???? ???? ?????? ?? ??? ...   
3  ???-??? ??? ?? ???? ?? ??????? ??? ???-??? ???...   
4  ?????, ?????????, ??? ???? ??????? ?? ????? ??...   

                                            question  \
0                 ??????? ?? ???? ??? ?? ???? ?? ???   
1  ??????? ? ?? ??? ??? ????? ???? ????? ?? ??? ?...   
2  ???????? ??? ?? ?????? ?? ???? ?? ??? ?? ?????...   
3  ???? ???-??? ??? ?? ???? ??? ??????? ??? ???-?...   
4  ??????? ???? ????? ?????? ?? ???? ??? ???? ?? ...   

                                             answers  
0  ??????? ??? ???? ??? ???????? ????? ??? ???? ?...  
1  ??????? ? ?? ??? ??? ???? ?????? ?? ????? ?? ?...  
2  ???????? ???? ?? ???? ???? ???? ?????? ?? ??? ...  
3  ???-??? ??? ?? ???? ?? ??????? ??? ???-??? ???...  
4  ?????, ?????????, ??? ???? ??????? ?? ????? ??..

str

In [3]:
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the tokenizer and model
model_name = 'bert-base-multilingual-cased'  # mBERT
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define a function to preprocess the data
def preprocess_data(examples):
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        max_length=512,
        truncation=True,
        padding='max_length',
        return_offsets_mapping=True
    )
    
    # Find the start and end positions of the answer in the context
    start_positions = []
    end_positions = []
    for i in range(len(examples['Answer'])):
        answer = examples['Answer'].strip()
        context = examples['Context'][i]
        question = examples['Question'][i]
        
        # Find the start position of the answer in the context
        start_idx = context.find(answer)
        if start_idx == -1:
            # If the answer is not found, set to 0 (or handle appropriately)
            start_positions.append(0)
            end_positions.append(0)
        else:
            end_idx = start_idx + len(answer)
            # Tokenize context to find token positions
            tokenized_context = tokenizer(context, return_offsets_mapping=True)
            offsets = tokenized_context['offset_mapping']
            # Find the token that contains the start_idx
            start_token = 0
            end_token = 0
            for idx, (start, end) in enumerate(offsets):
                if start <= start_idx < end:
                    start_token = idx
                if start < end_idx <= end:
                    end_token = idx
                    break
            start_positions.append(start_token)
            end_positions.append(end_token)
    
    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

# Prepare the datasets
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.encodings = preprocess_data(dataframe)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

train_dataset = QADataset(train_data, tokenizer)
val_dataset = QADataset(val_data, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Train the model
trainer.train()
