In [17]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the dataset
data = pd.read_csv('/content/politics_qa.csv')

# Check for required columns
if 'question' not in data.columns or 'answer' not in data.columns:
    raise ValueError("The CSV must contain 'question' and 'answer' columns.")

# Create a general context for questions
context = "This document discusses various political issues and positions in the 2024 elections, including economic policies, healthcare, and reform initiatives."

# Prepare dataset for Hugging Face
def prepare_data(data):
    questions = data['question'].tolist()
    answers = data['answer'].tolist()

    # Use the same context for all questions
    contexts = [context] * len(questions)

    return {
        'question': questions,
        'context': contexts,
        'answers': [{'text': ans, 'start': context.find(ans)} for ans in answers]
    }

# Create dataset
dataset_dict = prepare_data(data)
dataset = Dataset.from_dict(dataset_dict)

# Split the dataset into training and validation sets
train_test_split = dataset.train_test_split(test_size=0.2)  # 80% train, 20% validation

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Tokenize the inputs
def tokenize_function(examples):
    tokenized = tokenizer(
        examples['question'], examples['context'], truncation=True, padding='max_length', max_length=512
    )

    start_positions = []
    end_positions = []
    for i in range(len(examples['answers'])):
        start = examples['answers'][i]['start']
        end = start + len(examples['answers'][i]['text'])
        start_positions.append(start)
        end_positions.append(end)

    tokenized['start_positions'] = start_positions
    tokenized['end_positions'] = end_positions
    return tokenized

# Tokenize the train and validation sets
tokenized_train = train_test_split['train'].map(tokenize_function, batched=True)
tokenized_val = train_test_split['test'].map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',  # Use eval_strategy instead of evaluation_strategy
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # Provide the eval dataset here
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Handle case where answer may not be found
    if answer_start >= answer_end:
        return "No answer found"

    # Convert token IDs to string
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))
    return answer.strip()

# Example usage
if __name__ == "__main__":
    question = "What are the main issues in the 2024 election?"
    answer = answer_question(question, context)
    print("Answer:", answer)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,2.456911
2,No log,2.813792
3,No log,2.34287


Answer: [CLS] what are the main issues in the 2024 election ? [SEP] this document discusses various political issues and positions in the 2024 elections
