In [None]:
pip install transformers datasets evaluate pandas

In [None]:
from datasets import load_dataset

# Load the complete SQuAD v2 training split (the "[:]" slice selects every row).
squad2 = load_dataset(
    "squad_v2",
    split="train[:]",
)

In [None]:
# Hold out 20% of the examples as a test split. The seed is fixed so that a
# fresh "Restart & Run All" reproduces the same partition.
squad2 = squad2.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the first example of the training split.
squad2["train"][0]

In [None]:
from transformers import AutoTokenizer

# Tokenizer checkpoint used for preprocessing the question/context pairs.
MODEL_CHECKPOINT = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of SQuAD v2 examples and attach answer-span token labels.

    Every question/context pair is encoded with the module-level ``tokenizer``
    (question first, context second), truncating only the context and padding to
    384 tokens. The character span of the first listed answer is then mapped onto
    token indices through the tokenizer's offset mapping.

    Args:
        examples: A batch in column format with the keys ``"question"``,
            ``"context"`` and ``"answers"``. Each ``answers`` entry is a dict
            holding ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and two lists added,
        ``"start_positions"`` and ``"end_positions"`` (one int per example).
        Examples without an answer, and examples whose answer does not lie
        completely inside the (possibly truncated) context window, are labelled
        ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # SQuAD v2 marks unanswerable questions with an empty answer list.
        if len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token belonging to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # The bound guards against encodings where the context runs to the last token.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer must lie entirely within the surviving context window: the
        # window has to begin at or before the answer start AND finish at or after
        # the answer end. A partially truncated answer would otherwise receive an
        # end label pointing past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token whose start offset is <= the answer's first character.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token whose end offset is >= the answer's end character.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:
# Tokenize both splits in batches and drop the raw columns so that only the
# fields returned by preprocess_function remain.
raw_column_names = squad2["train"].column_names
tokenized_squad2 = squad2.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
)

In [None]:
# Display the tokenized training split to check the result of the map step.
tokenized_squad2["train"]

In [None]:
# Keep a handle on the tokenized training split; the training cell reads from it.
train_dataset = tokenized_squad2["train"]

In [None]:
import tensorflow as tf
from keras import layers

def build_simple_qa_model(max_seq_length, vocab_size, embedding_dim, num_classes):
    """Build a small extractive question-answering model.

    The network embeds the token ids, encodes them with a single LSTM and
    projects every token to two scores (answer start / answer end). Each score
    channel is normalised with a softmax over the *sequence* axis, so every
    output is a distribution over token positions and can be trained with sparse
    categorical cross-entropy (``from_logits=False``) against integer position
    labels.

    Args:
        max_seq_length: Fixed length of the padded input sequences.
        vocab_size: Number of rows in the embedding table; must exceed the
            largest token id fed to the model.
        embedding_dim: Size of each token embedding.
        num_classes: Number of per-token score channels. Must be 2 (start, end).

    Returns:
        A ``tf.keras.Model`` taking the inputs ``input_ids`` and
        ``attention_mask`` (both int32, shape ``(batch, max_seq_length)``) and
        returning a dict with the outputs ``start_positions`` and
        ``end_positions``, each of shape ``(batch, max_seq_length)``.

    Raises:
        ValueError: If ``num_classes`` is not 2.
    """
    if num_classes != 2:
        raise ValueError(
            "num_classes must be 2 (one channel for the answer start, one for the "
            f"answer end), got {num_classes}"
        )

    # Use tf.keras consistently so layers and Model come from the same Keras build.
    keras_layers = tf.keras.layers

    # Input layers
    input_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32, name="attention_mask")

    # Embedding layer
    embedding = keras_layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_ids)

    # Sequence encoder (an LSTM standing in for a transformer stack)
    sequence_output = keras_layers.LSTM(64, return_sequences=True)(embedding)

    # Question-answering head: one raw score per token for start and for end
    token_scores = keras_layers.Dense(num_classes, name="qa_outputs")(sequence_output)

    # Softmax across token positions; the attention mask keeps padded positions
    # from receiving probability mass.
    start_probs = keras_layers.Softmax(axis=-1, name="start_positions")(
        token_scores[:, :, 0], mask=attention_mask
    )
    end_probs = keras_layers.Softmax(axis=-1, name="end_positions")(
        token_scores[:, :, 1], mask=attention_mask
    )

    # Output keys equal the output layer names so label dicts can be matched by name.
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs={"start_positions": start_probs, "end_positions": end_probs},
    )

    return model

# Model hyperparameters
max_seq_length = 384  # same value as the max_length the tokenizer pads to
# len(tokenizer) counts added tokens as well, whereas tokenizer.vocab_size covers
# only the base vocabulary and can therefore be too small for the embedding table.
vocab_size = len(tokenizer)
embedding_dim = 768
num_classes = 2  # one output channel for the start position, one for the end position

# Build the simplified model
simple_qa_model = build_simple_qa_model(max_seq_length, vocab_size, embedding_dim, num_classes)

# Compile the model. The model emits probabilities, hence from_logits=False.
simple_qa_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                        metrics=["accuracy"])

# Verify the model summary
simple_qa_model.summary()


In [None]:
# Columns of the tokenized dataset fed to the model, and the label columns.
# The label keys have to match the names of the model outputs they supervise.
features = ['input_ids', 'attention_mask']
labels = ['start_positions', 'end_positions']

# Number of examples per gradient step
batch_size = 1024

# Materialise the needed columns as NumPy arrays. Keras cannot apply
# validation_split to Python lists or lazy dataset columns, only to arrays/tensors.
train_arrays = train_dataset.with_format("numpy", columns=features + labels)[:]

# Train the model
simple_qa_model.fit(
    x={feature: train_arrays[feature] for feature in features},
    y={label: train_arrays[label] for label in labels},
    batch_size=batch_size,
    epochs=3,  # Adjust as needed
    validation_split=0.1  # Fraction of the data held out for validation
)