In [None]:
!pip install transformers
!pip install datasets
!pip install wandb
!pip install accelerate -U
!pip install scikit-learn
!pip install sentencepiece


In [1]:
# Authenticate the Weights & Biases CLI via a shell command.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mjensthyregod[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [59]:
# Show the field names carried by the first training example.
train_dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [10]:
# Restrict both splits to the rows whose `language` field is 'arabic'.
def _is_arabic(example):
    """Return True when the example's language is Arabic."""
    return example['language'] == 'arabic'


train_dataset = tydiqa_dataset["train"].filter(_is_arabic)
val_dataset = tydiqa_dataset["validation"].filter(_is_arabic)

Filter: 100%|██████████| 116067/116067 [00:02<00:00, 47050.63 examples/s]
Filter: 100%|██████████| 13325/13325 [00:00<00:00, 49515.65 examples/s]


In [11]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 29598
})

In [9]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 11394
})

In [4]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
    num_rows: 4779
})

In [1]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer
from datasets import load_dataset

# Fraction (0-1) of each language-filtered split that is kept
dataset_percentage = 0.1

# Language whose examples are kept
language = 'bengali'

# Fetch every split of the answerable TyDi QA dataset
tydiqa_dataset = load_dataset('copenlu/answerable_tydiqa')

# Keep only the rows written in `language`
train_dataset = tydiqa_dataset["train"].filter(lambda row: row['language'] == language)
val_dataset = tydiqa_dataset["validation"].filter(lambda row: row['language'] == language)

# Work out how many rows each subsample keeps, then draw them from a
# seeded shuffle so the selection is repeatable
n_train_rows = int(len(train_dataset) * dataset_percentage)
n_val_rows = int(len(val_dataset) * dataset_percentage)
train_dataset = train_dataset.shuffle(seed=42).select(range(n_train_rows))
val_dataset = val_dataset.shuffle(seed=42).select(range(n_val_rows))

# Tokenizer belonging to the XLM-Roberta SQuAD2 checkpoint
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-base-squad2")

def preprocess_function(examples):
    """Tokenize a batch of question/document pairs and label the answer span.

    Each (question, document) pair is encoded with the module-level
    ``tokenizer``. Documents longer than 512 tokens overflow into additional
    chunks, so the returned lists can be longer than the input batch.

    Parameters
    ----------
    examples : mapping of column name -> list
        Must provide ``question_text``, ``document_plaintext`` and
        ``annotations``; each annotation exposes ``answer_start`` and
        ``answer_text`` sequences, of which only the first entry is used.

    Returns
    -------
    dict
        ``input_ids``, ``attention_mask``, ``start_positions``,
        ``end_positions`` and ``answer_texts``, one entry per tokenized chunk.
        Chunks that do not fully contain the answer (or whose example has no
        answer) are labelled with start = end = 0.
    """
    tokenized_inputs = tokenizer(
        examples['question_text'],
        examples['document_plaintext'],
        truncation="only_second",
        max_length=512,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    # Both helper fields are consumed here and must not reach the model.
    overflow_to_sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offset_mappings = tokenized_inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []
    answer_texts = []

    for i, offsets in enumerate(offset_mappings):
        # Map the chunk back to the example it was cut from.
        parent_id = overflow_to_sample_mapping[i]
        annotation = examples['annotations'][parent_id]
        answer_start = annotation['answer_start'][0]
        answer_text = annotation['answer_text'][0]
        answer_end = answer_start + len(answer_text)

        # Character offsets restart at 0 for every sequence, so question
        # tokens share an offset range with the beginning of the document.
        # Only tokens of the second sequence (the document) may be matched
        # against the document-relative answer position.
        sequence_ids = tokenized_inputs.sequence_ids(i)

        start_token_idx = end_token_idx = None
        # presumably unanswerable examples carry a negative answer_start and
        # an empty answer_text -- verify against the dataset card
        if answer_start >= 0 and answer_text:
            for idx, (start, end) in enumerate(offsets):
                if sequence_ids[idx] != 1:
                    continue
                if start <= answer_start < end:
                    start_token_idx = idx
                if start < answer_end <= end:
                    end_token_idx = idx
                    break

        # A span is only usable when both ends fall inside this chunk;
        # otherwise point both labels at position 0.
        if start_token_idx is None or end_token_idx is None:
            start_token_idx = end_token_idx = 0

        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)
        answer_texts.append(answer_text)

    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'start_positions': start_positions,
        'end_positions': end_positions,
        'answer_texts': answer_texts
    }


# Tokenize both splits in batches. Every original column is dropped, so only
# the fields returned by preprocess_function remain afterwards.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)


Filter: 100%|██████████| 116067/116067 [00:02<00:00, 41896.74 examples/s]
Filter: 100%|██████████| 13325/13325 [00:00<00:00, 42783.62 examples/s]
Map: 100%|██████████| 477/477 [00:00<00:00, 2228.05 examples/s]
Map: 100%|██████████| 22/22 [00:00<00:00, 1412.61 examples/s]


In [7]:
# Number of entries in the 'question_text' column of train_dataset.
# NOTE(review): the preprocessing cell maps with remove_columns=column_names,
# so this column only exists before that step has run.
len(train_dataset['question_text'])

4779

In [23]:
import os
import torch
import numpy as np
from sklearn.metrics import f1_score
import torch.nn.functional as F
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizer, TrainingArguments, Trainer, XLMRobertaConfig, EvalPrediction
import wandb

def compute_metrics(p: EvalPrediction):
    """Compare predicted answer-span token indices with the gold indices.

    Parameters
    ----------
    p : EvalPrediction
        ``p.predictions`` must stack to a 3-D array whose last axis holds the
        per-token logits; ``p.label_ids`` holds the matching gold positions.
        Both may be a single array or a tuple of arrays -- presumably the
        (start, end) pairs emitted for a question-answering head; confirm
        against the Trainer output.

    Returns
    -------
    dict
        ``f1``: weighted F1 over the flattened position labels.
        ``exact_match``: fraction of individual positions predicted exactly
        (start and end are counted separately, not as a span).
        ``euclidean_distance``: L2 norm of (gold - predicted) positions.
    """
    # np.asarray turns a tuple of equally shaped arrays into one stacked
    # array, so tuple and array inputs are handled the same way.
    logits = np.asarray(p.predictions)
    preds = np.argmax(logits, axis=2).flatten()
    labels = np.asarray(p.label_ids).flatten()

    f1 = f1_score(labels, preds, average='weighted')
    exact_match = np.mean(labels == preds)
    # Computed with NumPy directly; cast to float first so the subtraction
    # cannot wrap around on unsigned or narrow integer dtypes.
    euclidean_distance = float(np.linalg.norm(labels.astype(float) - preds.astype(float)))

    return {
        'f1': f1,
        'exact_match': exact_match,
        'euclidean_distance': euclidean_distance
    }

# `default_data_collator` is passed to the Trainer below but is not part of
# this cell's import list, so bring it into scope here.
from transformers import default_data_collator

# Ensure TOKENIZERS_PARALLELISM is set to false to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "false"

# Version number used in the wandb run name
version = 2

# Initialize wandb
# NOTE(review): `language` is not defined in this cell; it must already exist
# in the kernel (it is assigned in the data-loading cell).
wandb.init(project='NLP_KU_QA', name=f'{language}_v{version}')

# Load the model configuration and override both dropout probabilities
config = XLMRobertaConfig.from_pretrained("deepset/xlm-roberta-base-squad2", hidden_dropout_prob=0.5, attention_probs_dropout_prob=0.5)

# Load the tokenizer
# NOTE(review): this rebinds the global `tokenizer` that preprocess_function
# reads. Presumably XLMRobertaTokenizer is the non-fast implementation, which
# may not support return_offsets_mapping -- confirm before re-running the
# preprocessing cell after this one.
tokenizer = XLMRobertaTokenizer.from_pretrained("deepset/xlm-roberta-base-squad2")

# Load the model and send it to the GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XLMRobertaForQuestionAnswering.from_pretrained("deepset/xlm-roberta-base-squad2", config=config).to(device)

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=20,
    do_train=True,
    do_eval=True,
    output_dir='./results',
    push_to_hub=False,
    logging_first_step=True,
    load_best_model_at_end=True,
    report_to="wandb",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # must already be tokenized by the preprocessing cell
    eval_dataset=val_dataset,  # must already be tokenized by the preprocessing cell
    compute_metrics=compute_metrics,
    data_collator=default_data_collator
)

try:
    # Train and evaluate the model
    trainer.train()
    trainer.evaluate()
finally:
    # Runs even when training raises: whatever weights the model currently
    # holds are written out together with the tokenizer.
    model.save_pretrained('./my_roberta_qa_model')
    tokenizer.save_pretrained('./my_roberta_qa_model')
    # Ensure wandb session is ended properly
    wandb.finish()


loading configuration file config.json from cache at /Users/jensthyregod/.cache/huggingface/hub/models--deepset--xlm-roberta-base-squad2/snapshots/a17f72834366c08e1442ba44b483983d86d659bf/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.5,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.5,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab

: 

In [None]:
from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer

# Fine-tune the deepset/roberta-base-squad2 checkpoint.
# NOTE(review): `train_dataset`, `val_dataset` and `tokenizer` are not defined
# in this cell; they must already exist in the kernel (the recorded output of
# this cell is a NameError for `train_dataset`).
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# Hyper-parameters and bookkeeping for the run
args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=100,
)

# Wire model, arguments and data together
trainer = Trainer(model=model, args=args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Run the optimisation loop
trainer.train()

# Persist the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./my_finetuned_model")


NameError: name 'train_dataset' is not defined

In [None]:
# Flatten the first answer of every row's `annotations` into two plain
# columns, `answer_start` (int) and `answer_text`.
# Series.map processes the whole column at once instead of writing cell by
# cell through iterrows()/.at; it also works on an empty frame and does not
# depend on the index labels being unique.
train_df['answer_start'] = (
    train_df['annotations']
    .map(lambda annotation: annotation['answer_start'][0])
    .astype(int)
)
train_df['answer_text'] = train_df['annotations'].map(
    lambda annotation: annotation['answer_text'][0]
)

In [None]:
# Flatten the first answer of every row's `annotations` into two plain
# columns, `answer_start` (int) and `answer_text`.
# Series.map processes the whole column at once instead of writing cell by
# cell through iterrows()/.at; it also works on an empty frame and does not
# depend on the index labels being unique.
val_df['answer_start'] = (
    val_df['annotations']
    .map(lambda annotation: annotation['answer_start'][0])
    .astype(int)
)
val_df['answer_text'] = val_df['annotations'].map(
    lambda annotation: annotation['answer_text'][0]
)

In [None]:
# Write a capped slice of each frame to CSV without the index column:
# at most 4000 training rows and at most 1000 validation rows.
for frame, row_limit, csv_path in ((train_df, 4000, 'train.csv'), (val_df, 1000, 'val.csv')):
    frame[:row_limit].to_csv(csv_path, index=False)

In [None]:
# Print the question, the document text and the raw annotations of the
# validation row at position 10, one field per line.
sample_row = val_df.iloc[10]
for field in ('question_text', 'document_plaintext', 'annotations'):
    print(sample_row[field])

Siapa bapak Teknik industri?
Awal mula Teknik Industri dapat ditelusuri dari beberapa sumber berbeda. Frederick Winslow Taylor sering ditetapkan sebagai Bapak Teknik Industri meskipun seluruh gagasannya tidak asli. Beberapa risalah terdahulu mungkin telah memengaruhi perkembangan Teknik Industri seperti risalah The Wealth of Nations karya Adam Smith, dipublikasikan tahun 1776; Essay on Population karya Thomas Malthus dipublikasikan tahun 1798; Principles of Political Economy and Taxation karya David Ricardo, dipublikasikan tahun 1817; dan Principles of Political Economy karya John Stuart Mill, dipublikasikan tahun 1848. Seluruh hasil karya ini mengilhami penjelasan paham Liberal Klasik mengenai kesuksesan dan keterbatas dari Revolusi Industri. Adam Smith adalah ekonom yang terkenal pada zamannya. "Economic Science" adalah frasa untuk menggambarkan bidang ini di Inggris sebelum industrialisasi America muncul .
{'answer_start': array([73]), 'answer_text': array(['Frederick Winslow Taylor

In [None]:
# Write the complete train_df to 'train.csv' without the index column.
# NOTE(review): this is the same path the 4000-row export cell writes to, so
# whichever of the two cells runs last determines the file's contents.
train_df.to_csv('train.csv', index=False)