In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint identifier shared by the tokenizer and the question-answering model
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Both objects are built from the same checkpoint so vocabulary and weights agree
tokenizer = AutoTokenizer.from_pretrained(
    model_name
)
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name
)


In [None]:
import pandas as pd

# Read the train/test splits from CSV files in the working directory
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Peek at the first rows of each split (train first, then test)
print(train_data.head(), test_data.head(), sep="\n")


In [None]:
# Report split sizes and how many rows have no answer at all.
# Missing answers are read from CSV as NaN, so they need to be counted explicitly.

print(f"Number of training samples: {len(train_data)}")
print(f"Number of test samples: {len(test_data)}")
print(f"Training rows with missing answer: {train_data['answer'].isna().sum()}")
print(f"Test rows with missing answer: {test_data['answer'].isna().sum()}")


In [None]:
import difflib

def find_closest_answer(context, answer, threshold=0.6):
    """Find the run of consecutive words in ``context`` most similar to ``answer``.

    Every contiguous span of ``context.split()`` is re-joined with single
    spaces and scored against ``answer`` using
    ``difflib.SequenceMatcher.ratio()``. The first span that reaches the
    highest score is kept.

    Args:
        context: Text to search.
        answer: Target answer text to approximate.
        threshold: Minimum similarity ratio the best span must reach.

    Returns:
        ``(substring, start_word_idx, end_word_idx)`` where the indices are
        inclusive positions in ``context.split()``, or ``None`` when no span
        reaches ``threshold`` or when either input is not a string (for
        example a NaN coming from an empty CSV cell).
    """
    # Non-string inputs (NaN / None) cannot be matched; report "no match"
    # instead of raising inside split() or SequenceMatcher.
    if not isinstance(context, str) or not isinstance(answer, str):
        return None

    words = context.split()  # Tokenize context into words
    closest_match = None
    max_similarity = 0

    # SequenceMatcher caches its analysis of the *second* sequence, so build
    # it once for `answer` and only swap the first sequence per candidate.
    matcher = difflib.SequenceMatcher(None, "", answer)

    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            substring = " ".join(words[i:j])
            matcher.set_seq1(substring)
            similarity = matcher.ratio()
            if similarity > max_similarity:
                max_similarity = similarity
                closest_match = (substring, i, j - 1)  # Save token indices
                if max_similarity >= 1.0:
                    break
        # A ratio of 1.0 cannot be beaten (only strictly better spans replace
        # the current best), so stop scanning once it is reached.
        if max_similarity >= 1.0:
            break

    return closest_match if max_similarity >= threshold else None


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the BERT checkpoint; preprocess_data below reads this module-level name
tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)

def _word_char_spans(text):
    """Return ``(start, end)`` character offsets (end exclusive) for each word of ``text.split()``."""
    spans = []
    cursor = 0
    for word in text.split():
        # Words contain no whitespace, so the next occurrence at or after
        # `cursor` is exactly the word produced by split().
        start = text.index(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))
    return spans


def preprocess_data(data):
    """Tokenize each (context, question) row and locate its answer span in tokens.

    For every row the answer is located with ``find_closest_answer``; the
    matched word span is converted to character offsets in the original
    context and then to token positions via ``encoded.char_to_token``.

    Rows are skipped when a field is missing, when no approximate match is
    found, or when the answer characters do not map to tokens (``char_to_token``
    returned ``None``).

    Args:
        data: DataFrame with ``context``, ``question`` and ``answer`` columns.

    Returns:
        List of dicts with ``input_ids``, ``attention_mask``,
        ``start_positions`` and ``end_positions`` (plus ``token_type_ids``
        when the tokenizer produces them).
    """
    inputs = []
    for _, row in data.iterrows():
        context = row['context']
        question = row['question']
        answer = row['answer']

        # Empty CSV cells arrive as NaN (a float); skip them rather than crash.
        if not all(isinstance(field, str) for field in (context, question, answer)):
            print("Skipping row with missing context, question or answer")
            continue

        # Use approximate matching (done before tokenizing so that rows we
        # are going to skip never pay for tokenization)
        match = find_closest_answer(context, answer)
        if not match:
            print(f"Answer '{answer}' not found even with approximate matching!")
            continue

        # The matched substring is re-joined with single spaces, so searching
        # for it in the raw context fails on runs of whitespace and can hit an
        # earlier duplicate. Use the matched word indices instead.
        _, start_word_idx, end_word_idx = match
        word_spans = _word_char_spans(context)
        start_char_idx = word_spans[start_word_idx][0]
        end_char_idx = word_spans[end_word_idx][1] - 1  # inclusive last char

        # NOTE(review): SQuAD-style BERT checkpoints are usually fed
        # (question, context); this notebook passes (context, question) here
        # and in predict() - confirm that ordering is intended.
        encoded = tokenizer(
            context,
            question,
            max_length=512,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        start_token_idx = encoded.char_to_token(0, start_char_idx)
        end_token_idx = encoded.char_to_token(0, end_char_idx)

        # None here presumably means the answer fell outside the kept tokens
        # (e.g. removed by truncation) - such rows cannot be used.
        if start_token_idx is None or end_token_idx is None:
            continue

        example = {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'start_positions': start_token_idx,
            'end_positions': end_token_idx
        }
        # Keep segment ids when the tokenizer emits them so the two text
        # segments stay distinguishable for consumers that want them.
        if 'token_type_ids' in encoded:
            example['token_type_ids'] = encoded['token_type_ids'].squeeze()
        inputs.append(example)

    return inputs

# Earlier exact-match version of preprocess_data (uses context.find(answer)); kept commented out for reference
# def preprocess_data(data):
#     inputs = []
#     for _, row in data.iterrows():
#         context = row['context']
#         question = row['question']
#         answer = row['answer']

#         print(f"Processing: \nContext: {context[:100]}...\nQuestion: {question}\nAnswer: {answer}")

#         # Tokenize context and question
#         encoded = tokenizer(
#             context,
#             question,
#             max_length=512,
#             truncation=True,
#             padding="max_length",
#             return_tensors="pt"
#         )

#         # Find character indices
#         start_char_idx = context.find(answer)
#         if start_char_idx == -1:
#             print(f"Answer '{answer}' not found in context!")
#             continue

#         end_char_idx = start_char_idx + len(answer) - 1

#         # Map character indices to token indices
#         start_token_idx = encoded.char_to_token(0, start_char_idx)
#         end_token_idx = encoded.char_to_token(0, end_char_idx)

#         if start_token_idx is None or end_token_idx is None:
#             print(f"Token indices not found for answer: {answer}")
#             continue

#         inputs.append({
#             'input_ids': encoded['input_ids'].squeeze(),
#             'attention_mask': encoded['attention_mask'].squeeze(),
#             'start_positions': start_token_idx,
#             'end_positions': end_token_idx
#         })

#     print(f"Number of valid samples: {len(inputs)}")

#     return inputs



In [None]:
# for _, row in train_data.iterrows():
#     context = row['context']
#     answer = row['answer']
#     if context.find(answer) == -1:
#         print(f"Context: {context[:100]}...\nAnswer: {answer}\n")


In [None]:
# Run both splits through preprocess_data (train first, then test)
train_encodings, test_encodings = (
    preprocess_data(train_data),
    preprocess_data(test_data),
)


In [None]:
# Preview only the first example: echoing the whole list would print every
# padded 512-token tensor for every training row.
train_encodings[:1]

In [None]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    """Map-style dataset over a list of pre-tokenized question-answering examples.

    Each stored example is a dict holding ``input_ids`` and ``attention_mask``
    tensors plus integer ``start_positions`` / ``end_positions``. Examples may
    additionally carry ``token_type_ids``; when present they are forwarded
    unchanged so segment information is not silently dropped.
    """

    # Optional per-example entries that are passed through only if present.
    _OPTIONAL_KEYS = ('token_type_ids',)

    def __init__(self, encodings):
        """Store the list of example dicts; no copying or validation is done."""
        self.encodings = encodings

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.encodings)

    def __getitem__(self, idx):
        """Return example ``idx`` with the answer positions wrapped as tensors."""
        example = self.encodings[idx]
        item = {
            'input_ids': example['input_ids'],
            'attention_mask': example['attention_mask'],
            'start_positions': torch.tensor(example['start_positions']),
            'end_positions': torch.tensor(example['end_positions'])
        }
        for key in self._OPTIONAL_KEYS:
            if key in example:
                item[key] = example[key]
        return item

# Wrap the preprocessed example lists in QADataset objects (train first, then test)
train_dataset, test_dataset = (
    QADataset(train_encodings),
    QADataset(test_encodings),
)


In [None]:
# Bare expression: as the last line of the cell it displays the dataset object's repr
train_dataset

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Fresh copy of the QA checkpoint to fine-tune
model = AutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)

# Hyperparameters and output/logging locations for the run
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
)

# Trainer trains on the train split and evaluates on the test split
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Kick off fine-tuning
trainer.train()


In [None]:
def predict(context, question):
    """Return the answer text the model extracts for ``question`` from ``context``.

    The pair is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model``, and the tokens between the highest-scoring start
    and end positions (end inclusive) are decoded back to a string.

    Side effect: switches ``model`` to evaluation mode.

    Args:
        context: Passage to search for the answer.
        question: Question to answer.

    Returns:
        The decoded answer string; empty when the predicted end position
        precedes the predicted start position.
    """
    inputs = tokenizer(
        context,
        question,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    # The model may have been moved to another device (e.g. during training);
    # the inputs must live on the same one.
    inputs = inputs.to(model.device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the start and end positions (+1 makes the end index inclusive when slicing)
    start_idx = int(torch.argmax(start_logits))
    end_idx = int(torch.argmax(end_logits)) + 1

    # Decode the predicted tokens
    answer_ids = inputs['input_ids'][0][start_idx:end_idx].tolist()
    predicted_answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )
    return predicted_answer

# Smoke test: ask one question about a one-sentence passage and print the answer
context = "Hugging Face is creating tools for NLP and machine learning."
question = "What is Hugging Face creating?"
print(
    predict(context=context, question=question)
)
