# Projet 1 — Classification et compréhension automatique de documents éducatifs avec BERT

---

## 1. Introduction

Ce notebook implémente un pipeline complet de **compréhension de lecture automatique** basé sur des modèles de type **BERT**, appliqué au dataset **RACE (ReAding Comprehension dataset from Examinations)**.

L'approche retenue ici est **la classification à choix multiples** : plus simple qu'une approche de question-réponse extractive, elle est parfaitement adaptée au format QCM du corpus RACE.

---

## 2. Import des bibliothèques

In [2]:
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import (
    BertTokenizer,
    BertForMultipleChoice,
    get_linear_schedule_with_warmup
)

from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


## 3. Chargement et analyse du dataset RACE

### 3.1 Structure attendue

```
RACE/
 ├── train/
 │    ├── high/
 │    └── middle/
 ├── dev/
 └── test/
```

### 3.2 Fonction de chargement

In [3]:
def load_race_data(path):
    """Load every RACE question found under ``path`` into a flat list.

    The directory tree is walked recursively and each ``.txt`` file is parsed
    as one JSON document holding an article and its parallel ``questions``,
    ``options`` and ``answers`` lists. Directories and files are visited in
    sorted order so the returned list is identical on every run and platform.

    Args:
        path: Root directory to scan (e.g. ``'RACE/train'``).

    Returns:
        A list of dicts with the keys ``'article'``, ``'question'``,
        ``'options'`` and ``'answer'``, one per question. The list is empty
        when no ``.txt`` file is found under ``path``.

    Raises:
        ValueError: If a file lists fewer options or answers than questions.
    """
    data = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order instead of the arbitrary file-system order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.txt'):
                continue

            file_path = os.path.join(root, file)
            # Only the parsing needs the open handle; release it right after.
            with open(file_path, 'r', encoding='utf-8') as f:
                sample = json.load(f)

            questions = sample['questions']
            options = sample['options']
            answers = sample['answers']
            if len(options) < len(questions) or len(answers) < len(questions):
                raise ValueError(
                    f"{file_path}: {len(questions)} questions but "
                    f"{len(options)} option lists and {len(answers)} answers"
                )

            article = sample['article']
            data.extend(
                {
                    'article': article,
                    'question': question,
                    'options': choices,
                    'answer': answer,
                }
                for question, choices, answer in zip(questions, options, answers)
            )
    return data

In [4]:
# Load the training and validation splits from the local RACE directory.
# Each list element is one question dict built by load_race_data.
train_data = load_race_data('RACE/train')
dev_data   = load_race_data('RACE/dev')

# Report how many questions were loaded for each split.
print(f"Train samples : {len(train_data)}")
print(f"Dev samples   : {len(dev_data)}")

Train samples : 87866
Dev samples   : 4887


## 4. Prétraitement et tokenisation

### 4.1 Initialisation du tokenizer


In [5]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint; the
# model cell loads the same checkpoint name, so vocabulary and weights match.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### 4.2 Dataset PyTorch

In [6]:
class RACE_Dataset(Dataset):
    """PyTorch dataset that encodes RACE multiple-choice questions.

    Every item pairs the article with each "question + option" string, which
    yields one encoded sequence per answer option.

    Args:
        data: Sequence of dicts with the keys 'article', 'question',
            'options' (list of answer strings) and 'answer' (a single
            letter, 'A' designating the first option).
        tokenizer: Callable tokenizer that accepts two parallel lists of
            strings (text, text pair) and returns 'input_ids' and
            'attention_mask' tensors, such as a Hugging Face BertTokenizer.
        max_len: Length every encoded sequence is truncated or padded to.
    """

    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of questions in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the question at ``idx``.

        Returns:
            A dict with 'input_ids' and 'attention_mask' (one row per answer
            option) and 'labels' (scalar tensor holding the 0-based index of
            the correct option).

        Raises:
            ValueError: If the answer letter does not designate one of the
                item's options.
        """
        item = self.data[idx]
        options = item['options']

        # Fail fast on a bad answer letter: an out-of-range label would
        # otherwise only surface later, inside the loss computation.
        label = ord(item['answer']) - ord('A')
        if not 0 <= label < len(options):
            raise ValueError(
                f"answer {item['answer']!r} of item {idx} does not match any "
                f"of its {len(options)} options"
            )

        # One batched call encodes all (article, question + option) pairs and
        # directly returns one row per option, so no squeeze/stack is needed
        # (a dimension-less squeeze would also collapse rows when max_len == 1).
        encoded = self.tokenizer(
            [item['article']] * len(options),
            [item['question'] + ' ' + option for option in options],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        # NOTE(review): any segment ids (token_type_ids) produced by the
        # tokenizer are not returned, so the training/evaluation code cannot
        # pass them to the model — confirm this is intended.
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': torch.tensor(label)
        }

In [7]:
# Wrap both splits in RACE_Dataset, keeping its default max_len of 512.
train_dataset = RACE_Dataset(train_data, tokenizer)
dev_dataset   = RACE_Dataset(dev_data, tokenizer)

# Batches of 4 questions; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
dev_loader   = DataLoader(dev_dataset, batch_size=4)

## 5. Modèle BERT pour QCM

In [8]:
# Load the 'bert-base-uncased' checkpoint into a multiple-choice model.
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
# Move the model to the GPU when one is available, otherwise keep it on CPU.
# As the last expression of the cell, this call also echoes the model layout.
# NOTE(review): the same device expression is recomputed as `device` in the
# training setup cell.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

## 6. Entraînement

In [9]:
# Device onto which train_epoch and evaluate move each batch.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# AdamW over every model parameter with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 2
# The scheduler is stepped once per batch, so the schedule covers every batch
# of every epoch.
total_steps = len(train_loader) * num_epochs
# Linear learning-rate schedule without any warmup step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [10]:
def train_epoch(model, loader):
    """Train ``model`` for one full pass over ``loader``.

    Uses the module-level ``optimizer``, ``scheduler`` and ``device``: every
    batch is moved to ``device``, and both the optimizer and the scheduler
    are stepped once per batch.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and exposes a ``loss``.
        loader: Iterable of batches providing those three tensors.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    tensor_names = ('input_ids', 'attention_mask', 'labels')

    for batch in tqdm(loader):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        model_inputs = {name: batch[name].to(device) for name in tensor_names}
        batch_loss = model(**model_inputs).loss

        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

In [None]:
# Run the configured number of epochs and print the mean training loss of
# each one (epochs are reported starting from 1).
for epoch in range(num_epochs):
    loss = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1} | Loss : {loss:.4f}")

  0%|          | 0/21967 [00:00<?, ?it/s]

## 7. Évaluation

In [None]:
def evaluate(model, loader):
    """Compute the accuracy of ``model`` over every batch of ``loader``.

    The model is switched to evaluation mode and run without gradient
    tracking; inputs are moved to the module-level ``device``. The predicted
    choice is the index of the highest logit along dimension 1.

    Args:
        model: Model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and exposes ``logits``.
        loader: Iterable of batches providing ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        The accuracy score of the predictions against the batch labels.
    """
    model.eval()
    predicted_choices = []
    expected_choices = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).logits

            best_choice = logits.argmax(dim=1)
            predicted_choices.extend(best_choice.cpu().numpy())
            expected_choices.extend(batch['labels'].numpy())

    return accuracy_score(expected_choices, predicted_choices)

In [None]:
# Score the model on the validation (dev) loader and print the accuracy.
acc = evaluate(model, dev_loader)
print(f"Accuracy validation : {acc:.4f}")

---

## 8. Analyse des résultats

* Les questions factuelles courtes sont mieux traitées
* Les longs passages dégradent la performance : au-delà de la limite de 512 tokens de BERT, l'article est tronqué et une partie du contexte est perdue
* Les questions nécessitant une inférence implicite restent difficiles

---

## 9. Conclusion

Ce projet démontre l'efficacité de **BERT pour la compréhension de lecture éducative**, tout en mettant en évidence ses limites sur des textes longs et complexes.

Des améliorations possibles incluent :

* RoBERTa ou Longformer
* Fine-tuning plus long
* Approche Question-Answering extractive
