In [None]:
!pip install transformers
!pip install sentencepiece ## restart kernel efter install hvis du mangler denne ##
!pip install accelerate -U
!pip install numpy --upgrade
!pip install tensorflow --upgrade



In [8]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Directories holding the fine-tuned QA checkpoints, one per language.
# The Arabic path is relative, so it is resolved against the current working directory.
arabic_model_path = "arabic_xlm_roberta_qa_model"
# NOTE(review): the two paths below are absolute paths on one developer's machine;
# the notebook will not run elsewhere until they are adjusted.
indonesian_model_path = "/Users/jensthyregod/Desktop/KU/7. Semester/NLP/load_and_inference/indo_xlm_roberta_qa_model"
bengali_model_path = "/Users/jensthyregod/Desktop/KU/7. Semester/NLP/load_and_inference/bengali_xlm_roberta_qa_model"

def load_transformer_model(model_path):
    """
    Build a question-answering model and its tokenizer from one checkpoint location.

    Both objects are created with ``from_pretrained`` using the same
    ``model_path``; the model is instantiated first, then the tokenizer.

    :param model_path: Location passed unchanged to both ``from_pretrained`` calls.
    :return: A ``(model, tokenizer)`` pair.
    """
    return (
        AutoModelForQuestionAnswering.from_pretrained(model_path),
        AutoTokenizer.from_pretrained(model_path),
    )

# Instantiate one (model, tokenizer) pair per language checkpoint.
arabic_model, arabic_tokenizer = load_transformer_model(
    model_path=arabic_model_path
)
indonesian_model, indonesian_tokenizer = load_transformer_model(
    model_path=indonesian_model_path
)
bengali_model, bengali_tokenizer = load_transformer_model(
    model_path=bengali_model_path
)


---
### Load the dataset, tokenize, and preprocess

In [2]:
from datasets import load_dataset

def preprocess_tydiqa_dataset(language, tokenizer, dataset_subset=1.0):
    """
    Load 'copenlu/answerable_tydiqa', keep one language, and tokenize it for
    extractive question answering.

    Each (question, document) pair is tokenized to at most 512 tokens; documents
    that do not fit are split into several overflow spans, so the returned
    datasets may contain more rows than the filtered input.

    :param language: Value compared against each example's 'language' field.
    :param tokenizer: Tokenizer used to encode the pairs. It must return offset
        mappings and support ``sequence_ids`` on its output (a "fast" tokenizer).
    :param dataset_subset: Fraction of each split to keep after shuffling with
        seed 42.
    :return: A tuple ``(train_dataset, val_dataset)`` whose columns are
        'input_ids', 'attention_mask', 'start_positions', 'end_positions' and
        'answer_texts'.
    """
    # Load the dataset
    tydiqa_dataset = load_dataset('copenlu/answerable_tydiqa')

    # Filter the dataset for the specified language
    train_dataset = tydiqa_dataset["train"].filter(lambda example: example['language'] == language)
    val_dataset = tydiqa_dataset["validation"].filter(lambda example: example['language'] == language)

    # Sample a subset of the dataset
    train_dataset = train_dataset.shuffle(seed=42).select(range(int(len(train_dataset) * dataset_subset)))
    val_dataset = val_dataset.shuffle(seed=42).select(range(int(len(val_dataset) * dataset_subset)))

    def preprocess_function(examples):
        """Tokenize a batch and derive answer start/end token indices per span."""
        # Tokenize the examples; only the document (second sequence) is truncated.
        tokenized_inputs = tokenizer(
            examples['question_text'],
            examples['document_plaintext'],
            truncation="only_second",
            max_length=512,
            padding="max_length",
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
        )

        # Extract overflow_to_sample_mapping and remove it from tokenized_inputs
        overflow_to_sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
        offset_mappings = tokenized_inputs.pop("offset_mapping")

        # Initialize new lists for storing outputs
        start_positions = []
        end_positions = []
        answer_texts = []

        # Iterate through the spans and calculate start and end token positions
        for i, offsets in enumerate(offset_mappings):
            # Several spans can originate from the same input example.
            parent_id = overflow_to_sample_mapping[i]
            annotation = examples['annotations'][parent_id]
            answer_start = annotation['answer_start'][0]
            answer_text = annotation['answer_text'][0]
            answer_end = answer_start + len(answer_text)

            # Answer offsets are character positions in the document, while the
            # offsets of question tokens are relative to the question string.
            # Restrict the search to document tokens (sequence id 1) so a
            # question token can never be mistaken for the answer.
            sequence_ids = tokenized_inputs.sequence_ids(i)

            start_token_idx = end_token_idx = None
            for idx, (start, end) in enumerate(offsets):
                if sequence_ids[idx] != 1:
                    continue
                if start <= answer_start < end:
                    start_token_idx = idx
                if start < answer_end <= end:
                    end_token_idx = idx
                    break

            # If the answer is not fully contained in this span (or there is no
            # matching answer at all), label the span with position 0 for both
            # ends instead of emitting a half-matched, inconsistent pair.
            if start_token_idx is None or end_token_idx is None:
                start_token_idx = end_token_idx = 0

            start_positions.append(start_token_idx)
            end_positions.append(end_token_idx)
            answer_texts.append(answer_text)

        # Return the new lists as a dictionary
        return {
            'input_ids': tokenized_inputs['input_ids'],
            'attention_mask': tokenized_inputs['attention_mask'],
            'start_positions': start_positions,
            'end_positions': end_positions,
            'answer_texts': answer_texts
        }

    # Preprocess the datasets; the original columns are dropped because one
    # input row may expand into several tokenized spans.
    train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
    val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

    return train_dataset, val_dataset


# Language whose examples are kept; must match the dataset's 'language' field.
language = 'bengali'
# Build the tokenized train/validation splits with the Bengali tokenizer.
train_dataset, val_dataset = preprocess_tydiqa_dataset(
    language=language,
    tokenizer=bengali_tokenizer,
)



In [11]:
# Display the preprocessed validation split (its columns and row count).
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'answer_texts'],
    num_rows: 231
})

In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import DefaultDataCollator

# Smoke test: run the Bengali model on the first validation batch and print the
# raw model output. No metrics are computed yet.

# Move model to CPU
bengali_model = bengali_model.to("cpu")

# Collate function to prepare data batches (returns PyTorch tensors)
data_collator = DefaultDataCollator(return_tensors="pt")

# DataLoader for validation set, 8 examples per batch
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

# Evaluation loop: eval mode and no gradient tracking, since this is inference only
bengali_model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        # Move batch to CPU
        batch = {k: v.to("cpu") for k, v in batch.items()}
        
        # Forward pass; every key of the batch is passed to the model as a keyword argument
        outputs = bengali_model(**batch)
        print(outputs)
        # Stop after the first batch.
        break
        # TODO: evaluate here (this point is unreachable while the break above remains)


QuestionAnsweringModelOutput(loss=tensor(2.0060), start_logits=tensor([[  1.6130,  -1.0502,   1.5515,  ..., -10.2602, -10.2602, -10.2602],
        [  2.1556,  -0.8775,  -3.9784,  ..., -10.1693, -10.1693, -10.1693],
        [  2.8227,  -4.2698,  -7.3443,  ..., -10.1205, -10.1205, -10.1205],
        ...,
        [  1.8547,   3.0382,   2.7313,  ..., -10.1752, -10.1752, -10.1752],
        [  2.8709,  -2.9245,  -5.7329,  ..., -10.1765, -10.1765, -10.1765],
        [  1.4706,  -2.7841,   0.1186,  ..., -10.2539, -10.2539, -10.2539]]), end_logits=tensor([[  1.4453,  -2.7030,  -0.5691,  ..., -10.1299, -10.1299, -10.1299],
        [  2.0750,  -4.4722,  -4.5973,  ..., -10.2238, -10.2238, -10.2238],
        [  2.7867,  -5.3089,  -6.2024,  ..., -10.2346, -10.2346, -10.2346],
        ...,
        [  1.6457,   1.3429,   2.1242,  ..., -10.2027, -10.2027, -10.2027],
        [  2.7924,  -4.2957,  -5.5351,  ..., -10.1938, -10.1938, -10.1938],
        [  1.3832,  -3.9120,  -2.7065,  ..., -10.0848, -10.084

tensor([  1.4453,  -2.7030,  -0.5691,  -1.2203,  -0.2837,  -0.3705,  -1.3340,
         -1.2962,  -1.5814,  -0.6314,  -0.9357,   0.1690,   0.1076,   0.0597,
         -0.3225,  -1.0560,  -8.6460,  -9.6692, -10.1374, -10.5395,  -9.9485,
         -8.7089,  -8.5297,  -9.4603,  -9.9376,  -9.5110,  -8.7125,  -8.9490,
         -9.8542,  -8.7693,  -9.4770,  -6.4012,  -5.8331,  -8.8367,  -8.2149,
         -8.3723,  -5.7131,  -8.8147,  -5.9789,  -5.5913,  -0.5243,  -3.3159,
         -8.4252,  -7.3579,  -9.3981, -10.4337, -10.2074,  -9.5572,  -7.7554,
         -8.0769,  -4.5249,  -9.1921,  -9.6442,  -9.6898,  -9.4602,  -9.4747,
         -9.1562,  -9.2654,  -5.7047,  -6.9601,  -2.8232,   2.1845,   2.5153,
          5.4442,   9.3694,   4.7998,   8.2264,  -7.8084,  -7.0464,  -2.2540,
         -7.8890,  -7.9291,  -8.3884,  -7.5835,  -5.5980,  -8.7189,  -8.2548,
         -4.3544,  -7.9930,  -9.1902,  -6.3661,  -7.4894,  -6.7069,   0.9392,
         -1.0145,  -7.7855,  -6.4676,  -7.3158,  -5.9406,   0.95