In [8]:
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import re
from nltk.tokenize import word_tokenize
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import json
import gradio as gr


In [4]:
with open('merged_data.json', 'r', encoding='utf-8') as json_file:
    merged_data = json.load(json_file)


In [6]:

def prepare_dataset(data):
    dataset = []
    for entry in data['data']:
        if 'paragraphs' in entry:
            for paragraph in entry['paragraphs']:
                context = paragraph['context']
                for qa in paragraph.get('qas', []):
                    dataset.append({
                        'question': qa['question'],
                        'context': context,
                        'answers': {
                            'text': [qa['answers'][0]['text']],
                            'answer_start': [qa['answers'][0]['answer_start']]
                        }
                    })
    return dataset

train_data = prepare_dataset(merged_data)

train_dataset = Dataset.from_list(train_data)

model_name = "aubmindlab/bert-base-arabertv02"  
tokenizer = AutoTokenizer.from_pretrained(model_name)

df = pd.DataFrame(train_data)

df_cleaned = df.dropna()

def convert_to_training_format(df):
    dataset = []
    for index, row in df.iterrows():
        dataset.append({
            'question': row['question'],
            'context': row['context'],
            'answers': {
                'text': [row['answers']['text'][0]],
                'answer_start': [row['answers']['answer_start'][0]]
            }
        })
    return dataset

train_data_cleaned = convert_to_training_format(df_cleaned)

train_dataset_cleaned = Dataset.from_list(train_data_cleaned)

def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

tokenized_dataset = train_dataset_cleaned.map(tokenize_function, batched=True)

def format_dataset(examples):
    start_positions = examples['answers']['answer_start'][0]
    end_positions = start_positions + len(examples['answers']['text'][0])
    
    return {
        'input_ids': examples['input_ids'],
        'attention_mask': examples['attention_mask'],
        'start_positions': start_positions,
        'end_positions': end_positions
    }

formatted_dataset = tokenized_dataset.map(format_dataset)

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="no",        # Disable evaluation
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    num_train_epochs=3,              # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
)

trainer.train()

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/9 [00:00<?, ?it/s]

{'train_runtime': 655.2505, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.014, 'train_loss': 4.748199886745876, 'epoch': 3.0}


TrainOutput(global_step=9, training_loss=4.748199886745876, metrics={'train_runtime': 655.2505, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.014, 'train_loss': 4.748199886745876, 'epoch': 3.0})

In [7]:
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

In [11]:
model = AutoModelForQuestionAnswering.from_pretrained("./fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")

def get_answer(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)  
    answer_end = torch.argmax(answer_end_scores) + 1  

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer.strip()

question = "ما هي أعراض السرطان"
context = "أعراض مرض السرطان تشمل العطش الشديد، التبول المتكرر، والشعور بالتعب."
print("Answer:", get_answer(question, context))

Answer: [CLS] ما هي أعراض السرطام [SEP] أعراض مرض السرطان تشمل العطش
