In [None]:
import json

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
# Absolute path of the dataset file read by this notebook.
file_path = '/content/tydiqa-goldp-dev-arabic.json'
data = load_json(file_path)
# The parsed file is a dict whose 'data' key holds the list of articles (this is the
# key the feature-preparation function iterates), so len(data) would only count the
# top-level keys. Report the number of articles instead.
print(f"Nombre total d'éléments : {len(data['data'])}")

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Identifier of the AraBERTv2 checkpoint; it is the single source of truth for both
# the model and the tokenizer loaded below.
model_path = "aubmindlab/bert-base-arabertv2"

# Load the token-classification model and its tokenizer from the same checkpoint.
# NOTE(review): the identifier looks like a base checkpoint rather than one fine-tuned
# for NER — if so, the classification head is freshly initialised and the predicted
# entities are not meaningful. Confirm, or point model_path at an NER checkpoint.
ner_model = AutoModelForTokenClassification.from_pretrained(model_path)
ner_tokenizer = AutoTokenizer.from_pretrained(model_path)

# NER pipeline used to extract named entities from questions and contexts.
nlp_ner = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)


In [None]:
def extract_ner_entities(text, nlp_ner):
    """Run the NER callable ``nlp_ner`` on ``text`` and return the 'word' field of each result, in order."""
    words = []
    for prediction in nlp_ner(text):
        words.append(prediction['word'])
    return words

In [None]:
def prepare_train_features_and_filter_with_ner(examples, tokenizer, max_length=512, ner_pipeline=None):
    """Build NER-enriched question-answering encodings with token-level answer spans.

    For every question/answer pair, the words returned by the NER pipeline are appended
    to the question and to the context, the pair is tokenized, and the character span of
    the first answer is converted to token indices inside the context segment. Pairs
    whose answer does not fit in the (possibly truncated) context are dropped.

    Args:
        examples: dict with a 'data' list of articles; each article has 'paragraphs',
            each paragraph a 'context' string and a 'qas' list whose items carry
            'question' and 'answers' (items with 'text' and 'answer_start').
        tokenizer: tokenizer callable on (question, context) that supports
            ``return_offsets_mapping=True`` and whose result exposes ``sequence_ids()``.
        max_length: length every encoded pair is padded/truncated to.
        ner_pipeline: callable returning a list of dicts with a 'word' key. When None,
            the notebook-level ``nlp_ner`` pipeline is used.

    Returns:
        The batch encoding produced by ``tokenizer`` for the kept pairs, extended with
        'start_positions' and 'end_positions' long tensors.

    Raises:
        ValueError: if no question/answer pair could be aligned.
    """
    import torch  # local import so the function does not rely on another cell having imported torch

    if ner_pipeline is None:
        ner_pipeline = nlp_ner  # fall back to the pipeline created earlier in the notebook

    contexts = []
    questions = []
    start_positions = []
    end_positions = []

    for article in examples['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            context_entities = extract_ner_entities(context, ner_pipeline)
            # Entities are appended AFTER the original context, so the character offsets
            # of the answer (relative to the original context) stay valid.
            enriched_context = context + " " + " ".join(context_entities)

            for qa in paragraph['qas']:
                answers = qa['answers']
                if not answers:
                    continue  # nothing to align for this question
                answer = answers[0]
                start_char = answer['answer_start']
                end_char = start_char + len(answer['text'])

                question = qa['question']
                question_entities = extract_ner_entities(question, ner_pipeline)
                enriched_question = question + " " + " ".join(question_entities)

                # Same padding/truncation settings as the final batch encoding below, so
                # the token indices computed here are valid in the returned tensors.
                encoded = tokenizer(
                    enriched_question,
                    enriched_context,
                    padding='max_length',
                    max_length=max_length,
                    truncation="only_second",
                    return_offsets_mapping=True,
                )

                span = _locate_answer_tokens(
                    encoded['offset_mapping'], encoded.sequence_ids(), start_char, end_char
                )
                if span is None:
                    continue  # answer was truncated away or the context is empty

                contexts.append(enriched_context)
                questions.append(enriched_question)
                start_positions.append(span[0])
                end_positions.append(span[1])

    if not questions:
        raise ValueError("No question/answer pair could be aligned with the tokenized context.")

    # Final batch encoding of the kept pairs.
    encodings = tokenizer(
        questions,
        contexts,
        padding='max_length',
        max_length=max_length,
        truncation="only_second",
        return_tensors="pt"
    )

    encodings.update({
        'start_positions': torch.tensor(start_positions, dtype=torch.long),
        'end_positions': torch.tensor(end_positions, dtype=torch.long)
    })

    return encodings


def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Return (start_token, end_token) of the answer within the context tokens, or None.

    Only tokens whose sequence id is 1 (the second sequence, i.e. the context) are
    considered: question tokens have offsets relative to the question, and special or
    padding tokens carry no usable offset, so neither may be matched against the answer.
    """
    context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    first, last = context_tokens[0], context_tokens[-1]

    # The answer must lie entirely inside the context that survived truncation.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return None

    # Last context token that starts at or before the answer start.
    start_token = first
    while start_token < last and offsets[start_token + 1][0] <= start_char:
        start_token += 1

    # First context token that ends at or after the answer end.
    end_token = last
    while end_token > first and offsets[end_token - 1][1] >= end_char:
        end_token -= 1

    return start_token, end_token


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader

class QADataset(torch.utils.data.Dataset):
    """Expose a mapping of equally sized tensors as a dataset of per-example dicts.

    Defined here because the training cells instantiate it and read batches by key
    ('input_ids', 'attention_mask', 'start_positions', 'end_positions').
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        # One example = the index-th row of every tensor in the encodings.
        return {key: values[index] for key, values in self.encodings.items()}


# Load the AraBERTv2 tokenizer and model for the question-answering (QA) task.
arabertv2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
arabertv2_model = AutoModelForQuestionAnswering.from_pretrained("aubmindlab/bert-base-arabertv2")

# Build the NER-enriched encodings and wrap them in a shuffled DataLoader.
train_encodings_with_ner = prepare_train_features_and_filter_with_ner(data, arabertv2_tokenizer)
train_dataset_with_ner = QADataset(train_encodings_with_ner)
train_dataloader = DataLoader(train_dataset_with_ner, batch_size=8, shuffle=True)

# Model placement and optimizer configuration.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
arabertv2_model.to(device)
optimizer = AdamW(arabertv2_model.parameters(), lr=5e-5)

# Fine-tuning loop: three passes over the training batches.
for epoch in range(3):
    total_loss = 0
    arabertv2_model.train()

    for batch in train_dataloader:
        # Move the tensors of this batch to the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()
        outputs = arabertv2_model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs.loss

        # Backward pass and parameter update, then accumulate the batch loss.
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    mean_loss = total_loss / len(train_dataloader)
    print(f"Époque {epoch+1}/3 - Perte Moyenne: {mean_loss:.4f}")


In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

def evaluate_model(dataloader, model):
    """Print accuracy, weighted recall and weighted F1 of the predicted answer-start token.

    Only the start position is scored: the prediction is the argmax of ``start_logits``
    and it is compared with the batch's 'start_positions'.

    Args:
        dataloader: iterable of batches indexed with 'input_ids', 'attention_mask'
            and 'start_positions'.
        model: callable as ``model(input_ids, attention_mask=...)`` whose output exposes
            ``start_logits``. It is switched to eval mode and left in that mode.
    """
    # Use the device the model actually lives on rather than a notebook-level variable.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            start_positions = batch['start_positions'].to(model_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            start_preds = torch.argmax(outputs.start_logits, dim=-1)

            all_labels.extend(start_positions.cpu().tolist())
            all_preds.extend(start_preds.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    # Token positions form many sparse classes; zero_division=0 keeps the same scores
    # while silencing the undefined-metric warnings for classes never predicted/seen.
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Score the AraBERTv2 model on the batches of train_dataloader.
evaluate_model(dataloader=train_dataloader, model=arabertv2_model)


In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
import torch

def evaluate_model(dataloader, model):
    """Return accuracy, weighted recall and weighted F1 of the predicted answer-start token.

    Only the start position is scored: the prediction is the argmax of ``start_logits``
    and it is compared with the batch's 'start_positions'.

    Args:
        dataloader: iterable of batches indexed with 'input_ids', 'attention_mask'
            and 'start_positions'.
        model: callable as ``model(input_ids, attention_mask=...)`` whose output exposes
            ``start_logits``. It is switched to eval mode and left in that mode.

    Returns:
        Tuple ``(accuracy, recall, f1)``.
    """
    # Use the device the model actually lives on rather than a notebook-level variable.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    model.eval()  # evaluation mode (disables dropout)
    all_labels = []
    all_preds = []

    with torch.no_grad():  # no gradients are needed while evaluating
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            start_positions = batch['start_positions'].to(model_device)

            # Predicted start token = highest start logit.
            outputs = model(input_ids, attention_mask=attention_mask)
            start_preds = torch.argmax(outputs.start_logits, dim=-1)

            all_labels.extend(start_positions.cpu().tolist())
            all_preds.extend(start_preds.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    # Token positions form many sparse classes; zero_division=0 keeps the same scores
    # while silencing the undefined-metric warnings for classes never predicted/seen.
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return accuracy, recall, f1

# Training loop with an evaluation pass after every epoch.
# The epoch count is named once so the progress message always matches the loop.
num_epochs = 5
for epoch in range(num_epochs):
    arabertv2_model.train()  # training mode
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass; the model returns the loss when the positions are supplied.
        outputs = arabertv2_model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_dataloader)

    # Metrics are computed on the same dataloader that is used for training.
    accuracy, recall, f1 = evaluate_model(train_dataloader, arabertv2_model)

    # Report the results of this epoch.
    print(f"Époque {epoch+1}/{num_epochs} - Perte Moyenne: {avg_loss:.4f} - Accuracy: {accuracy:.4f} - Recall: {recall:.4f} - F1-score: {f1:.4f}")


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW

# XLM-RoBERTa tokenizer and model for the question-answering (QA) task.
xlm_roberta_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlm_roberta_model = AutoModelForQuestionAnswering.from_pretrained("xlm-roberta-base")

# NER-enriched encodings for this tokenizer, wrapped in a dataset.
# NOTE(review): QADataset is not defined in this cell — it must already exist in the kernel.
train_encodings_with_ner = prepare_train_features_and_filter_with_ner(
    data,
    xlm_roberta_tokenizer,
)
train_dataset_with_ner = QADataset(train_encodings_with_ner)

# Shuffled mini-batches of 8 examples.
train_dataloader = DataLoader(
    train_dataset_with_ner,
    batch_size=8,
    shuffle=True,
)

# Pick the GPU when one is available, move the model there and create its optimizer.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
xlm_roberta_model.to(device)
optimizer = AdamW(xlm_roberta_model.parameters(), lr=5e-5)


In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
import torch

def evaluate_model(dataloader, model):
    """Return accuracy, weighted recall and weighted F1 of the predicted answer-start token.

    Only the start position is scored: the prediction is the argmax of ``start_logits``
    and it is compared with the batch's 'start_positions'.

    Args:
        dataloader: iterable of batches indexed with 'input_ids', 'attention_mask'
            and 'start_positions'.
        model: callable as ``model(input_ids, attention_mask=...)`` whose output exposes
            ``start_logits``. It is switched to eval mode and left in that mode.

    Returns:
        Tuple ``(accuracy, recall, f1)``.
    """
    # Use the device the model actually lives on rather than a notebook-level variable.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    model.eval()  # evaluation mode (disables dropout)
    labels_seen = []
    labels_predicted = []

    with torch.no_grad():  # no gradients are needed while evaluating
        for batch in dataloader:
            outputs = model(
                batch['input_ids'].to(model_device),
                attention_mask=batch['attention_mask'].to(model_device),
            )
            # Predicted start token = highest start logit.
            start_preds = torch.argmax(outputs.start_logits, dim=-1)

            labels_seen.extend(batch['start_positions'].cpu().tolist())
            labels_predicted.extend(start_preds.cpu().tolist())

    accuracy = accuracy_score(labels_seen, labels_predicted)
    # Token positions form many sparse classes; zero_division=0 keeps the same scores
    # while silencing the undefined-metric warnings for classes never predicted/seen.
    recall = recall_score(labels_seen, labels_predicted, average='weighted', zero_division=0)
    f1 = f1_score(labels_seen, labels_predicted, average='weighted', zero_division=0)

    return accuracy, recall, f1

# Training loop with an evaluation pass after every epoch.
# The epoch count is named once so the progress message always matches the loop.
num_epochs = 10
for epoch in range(num_epochs):
    xlm_roberta_model.train()  # training mode
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass; the model returns the loss when the positions are supplied.
        outputs = xlm_roberta_model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_dataloader)

    # Metrics are computed on the same dataloader that is used for training.
    accuracy, recall, f1 = evaluate_model(train_dataloader, xlm_roberta_model)

    # Report the results of this epoch.
    print(f"Époque {epoch+1}/{num_epochs} - Perte Moyenne: {avg_loss:.4f} - Accuracy: {accuracy:.4f} - Recall: {recall:.4f} - F1-score: {f1:.4f}")
