In [1]:
pip install datasets transformers torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch

In [3]:
def split_and_transform_csv_to_datasets_format(input_csv, train_ratio=0.8):
    df = pd.read_csv(input_csv)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)


    train_size = int(len(df) * train_ratio)
    train_df = df.iloc[:train_size]
    validation_df = df.iloc[train_size:]

    def transform_subset(df):
        records = []
        for _, row in df.iterrows():
            answer_start = row['context'].find(row['answer']) if not row['is_impossible'] else -1
            
            record = {
                "id": row['id'],
                "title": row['title'],
                "context": row['context'],
                "question": row['question'],
                "answers": {
                    "text": [row['answer']] if not row['is_impossible'] else [],
                    "answer_start": [answer_start] if not row['is_impossible'] else []
                },
                "is_impossible": row['is_impossible']
            }
            records.append(record)
        return records

    train_records = transform_subset(train_df)
    validation_records = transform_subset(validation_df)

    train_dataset = Dataset.from_pandas(pd.DataFrame(train_records))
    validation_dataset = Dataset.from_pandas(pd.DataFrame(validation_records))

    return DatasetDict({"train": train_dataset, "validation": validation_dataset})

input_csv = "/kaggle/input/squad-smallset/smallset.csv"  

data = split_and_transform_csv_to_datasets_format(input_csv)

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")

def preprocess_function(examples):
    tokenized = tokenizer(
        examples["question"], examples["context"], truncation=True, padding="max_length", max_length=512
    )
    start_positions = []
    end_positions = []
    for answers in examples["answers"]:
        if answers["text"]:
            start = answers["answer_start"][0]
            end = start + len(answers["text"][0])
        else:
            start = 0
            end = 0
        start_positions.append(start)
        end_positions.append(end)
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

data = data.map(preprocess_function, batched=True, remove_columns=["id", "title", "context", "question", "answers", "is_impossible"])

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
)

trainer.train()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,4.2946,4.252081
2,4.2105,4.224939
3,4.3888,4.09765
4,3.9553,4.137411
5,3.3954,4.180501
6,3.3245,4.358981
7,3.4987,4.654997
8,2.9967,4.872603
9,2.2684,5.032557
10,2.6575,5.227822


KeyboardInterrupt: 

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
import torch

dataset = load_dataset("squad")

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")

=def preprocess_function(examples):
=    tokenized = tokenizer(
        examples["question"], examples["context"], truncation=True, padding="max_length", max_length=512
    )

    start_positions = []
    end_positions = []
    for i in range(len(examples["answers"])):
        answers = examples["answers"][i]
        if answers["text"]:
            start = answers["answer_start"][0]
            end = start + len(answers["text"][0])
        else:
            start = 0
            end = 0
        start_positions.append(start)
        end_positions.append(end)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

train_data = dataset["train"].map(preprocess_function, batched=True, remove_columns=["id", "title", "context", "question", "answers"])
validation_data = dataset["validation"].map(preprocess_function, batched=True, remove_columns=["id", "title", "context", "question", "answers"])

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=3,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
)

trainer.train()


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 