# REQUIREMENTS

In [1]:
from datasets import concatenate_datasets
import pickle
import os
import pandas as pd
import sys
import io
import re
import statistics as stats
import matplotlib.pyplot as plt
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F




  from .autonotebook import tqdm as notebook_tqdm


# PREPROCESSING

## Some useful tools

In [2]:
def treatment(ds):
    dataset = ds["train"]
    # We keep only 1000 observations for ramp up the filter computation
    dataset = dataset.select(range(1000))
    dataset = dataset.filter(lambda example: example['figure_type'] == 'Graph Plot')
    # We keep only 500 observations
    dataset = dataset.select(range(500))
    # Removing + renaming
    dataset = dataset.rename_column("mlbcap_short", "context")
    columns_to_keep = ['id', 'context']
    columns_to_remove = [col for col in dataset.column_names if col not in columns_to_keep]
    dataset = dataset.remove_columns(columns_to_remove)
    # We add necessary columns to use a BERT model
    dataset = dataset.map(
        lambda example: {"question": "",
                        "answers": {'text': [], 'answer_start': []}}
    )
    return dataset

In [30]:
def add_question_answer(ds,question, answer_text, answer_start, n_row):
    if answer_start != -1:
        ds = ds.map(
            lambda example, idx: {
                **example,
                "question": question if idx == n_row else example["question"],
                "answers": {
                    "text": [answer_text] if idx == n_row else example["answers"]["text"],
                    "answer_start": [answer_start] if idx == n_row else example["answers"]["answer_start"]
                }
            },
            with_indices=True
        )
        return ds
    else:
        print("⚠️ Réponse non trouvée dans le contexte.")

In [11]:
# We load the dataset
if os.path.exists("datasetNLP.pkl"):
    with open("datasetNLP.pkl", "rb") as f:
        raw_datasets = pickle.load(f)
    print("database imported from an existing file")
else:
    with open("dataset.pkl", "rb") as f:
        ds = pickle.load(f)
        raw_datasets = treatment(ds)
        with open("dataset2.pkl", "wb") as f:
            pickle.dump(raw_datasets, f)
    print('New file created')

database imported from an existing file


In [12]:
raw_datasets

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 500
})

In [4]:
n_row = 0

In [313]:
print(n_row)

133


In [311]:
# Ligne à modifier
n_row +=1
raw_datasets[n_row]['context']

'Figure 4: CDF of dependency tree sizes in npm and PyPI. npm packages have larger trees; 20% exceed 100 dependencies, while 20% of PyPI packages exceed 10.'

In [None]:
context = raw_datasets[n_row]['context']
question = "What does Figure 4 show about the dependency tree sizes in npm and PyPI?"
answer_text = "npm packages have larger trees; 20% exceed 100 dependencies, while 20% of PyPI packages exceed 10."

answer_start = context.index(answer_text)

add_question_answer(raw_datasets, question, answer_text, answer_start)

raw_datasets[n_row]

Map: 100%|██████████| 500/500 [00:00<00:00, 3157.45 examples/s]


{'id': 511159518,
 'context': 'Figure 4: CDF of dependency tree sizes in npm and PyPI. npm packages have larger trees; 20% exceed 100 dependencies, while 20% of PyPI packages exceed 10.',
 'question': 'What does Figure 4 show about the dependency tree sizes in npm and PyPI?',
 'answers': {'answer_start': [56],
  'text': ['npm packages have larger trees; 20% exceed 100 dependencies, while 20% of PyPI packages exceed 10.']}}

In [336]:
with open("datasetNLP.pkl", "wb") as f:
    pickle.dump(raw_datasets, f)

In [None]:
with open("datasetNLP.pkl", "rb") as f:
    raw_datasets = pickle.load(f)

with open("dataset2.pkl", "rb") as f:
    dataset2 = pickle.load(f)

In [None]:
dumb = False

if dumb:
    # Récupérer les ids existants dans raw_datasets
    A = set(raw_datasets['id'])
    
    # Filtrer dataset2 pour ne garder que les lignes dont l'id n'est pas déjà dans raw_datasets
    to_add = dataset2.filter(lambda example: example['id'] not in A)


In [25]:
n_row=0

In [190]:
print(n_row)

64


In [188]:
n_row +=1
to_add[n_row]['context']

"Fig. 6. Numerical solution for λ = 9.89 showing ρ̅_x and ρ̅_m as functions of 'a'. ρ̅_x decreases more rapidly than ρ̅_m."

In [189]:
context = to_add[n_row]['context']
question = "What does Figure 6 show regarding the functions of 'a'?"
answer_text = "Numerical solution for λ = 9.89 showing ρ̅_x and ρ̅_m as functions of 'a'. ρ̅_x decreases more rapidly than ρ̅_m"



answer_start = context.index(answer_text)

to_add = add_question_answer(to_add, question, answer_text, answer_start, n_row)

to_add[n_row]

Map: 100%|██████████| 367/367 [00:00<00:00, 3702.23 examples/s]


{'id': 511159622,
 'context': "Fig. 6. Numerical solution for λ = 9.89 showing ρ̅_x and ρ̅_m as functions of 'a'. ρ̅_x decreases more rapidly than ρ̅_m.",
 'question': "What does Figure 6 show regarding the functions of 'a'?",
 'answers': {'answer_start': [8],
  'text': ["Numerical solution for λ = 9.89 showing ρ̅_x and ρ̅_m as functions of 'a'. ρ̅_x decreases more rapidly than ρ̅_m"]}}

In [None]:
if dumb:
    B = concatenate_datasets([raw_datasets, to_add[n_row]])
    with open("datasetNLP.pkl", "wb") as f:
    pickle.dump(B, f)

# QUESTION ANSWERING

In [2]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
with open("datasetNLP.pkl", "rb") as f:
    raw_datasets = pickle.load(f)

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 106
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 27
    })
})

In [7]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
max_length = 100
stride = 40


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [18]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map: 100%|██████████| 106/106 [00:00<00:00, 1726.82 examples/s]


(106, 110)

In [19]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [22]:
small_eval_set = raw_datasets["test"]
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [25]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [26]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [27]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [28]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [30]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script: 100%|██████████| 4.53k/4.53k [00:00<?, ?B/s]
Downloading extra modules: 100%|██████████| 3.32k/3.32k [00:00<?, ?B/s]


In [31]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [39]:
for i in range(20):
    print('-'*15)
    print(f"prédiction:{predicted_answers[i]['prediction_text']}")
    print(f"vraie réponse: {theoretical_answers[i]['answers']['text']}")

---------------
prédiction:θ = 0.05 and varying σ²
vraie réponse: ['Optimized UPS closely follows DPS performance in 2×2 and 4×4 MIMO systems']
---------------
prédiction:Outage performance
vraie réponse: ['NOMA with HARQ-CC consistently outperforms OMA for user 1, while OMA performs better for user 2 when T = 1']
---------------
prédiction:outperforms
vraie réponse: ['LVAMP outperforms tied and untied LAMP in NMSE across layers for κ(A) = 15, showing 2-5 dB better NMSE than tied LAMP for networks with >4 layers.']
---------------
prédiction:Interference decreases
vraie réponse: ['Interference decreases as R increases']
---------------
prédiction:high uncertainty
vraie réponse: ['Hα periods align with photometry; B periods correlate with equatorial features.']
---------------
prédiction:MI, NBL, superfluid unpolar, BEC
vraie réponse: ['MI, NBL, superfluid unpolar, BEC']
---------------
prédiction:improves exponentially
vraie réponse: ['MGE improves exponentially while MSE slightly decr

In [37]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': {'id': Value(dtype='string', id=None), 'prediction_text': Value(dtype='string', id=None)}, 'references': {'id': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}},
Input predictions: [{'id': 511159490, 'prediction_text': 'θ = 0.05 and varying σ²'}, {'id': 511159503, 'prediction_text': 'Outage performance'}, {'id': 511159470, 'prediction_text': 'outperforms'}, ..., {'id': 511159478, 'prediction_text': 'to calculate the quantum corrected potential'}, {'id': 511159451, 'prediction_text': 'outperforming'}, {'id': 511159380, 'prediction_text': 'time'}],
Input references: [{'id': 511159490, 'answers': {'answer_start': [8], 'text': ['Optimized UPS closely follows DPS performance in 2×2 and 4×4 MIMO systems']}}, {'id': 511159503, 'answers': {'answer_start': [92], 'text': ['NOMA with HARQ-CC consistently outperforms OMA for user 1, while OMA performs better for user 2 when T = 1']}}, {'id': 511159470, 'answers': {'answer_start': [11], 'text': ['LVAMP outperforms tied and untied LAMP in NMSE across layers for κ(A) = 15, showing 2-5 dB better NMSE than tied LAMP for networks with >4 layers.']}}, ..., {'id': 511159478, 'answers': {'answer_start': [10], 'text': ['Schematic illustration of the potential trajectory from (v, fa) to (hc, 0), used to calculate the quantum corrected potential, neglecting quantum corrections due to λhφ and λφ.']}}, {'id': 511159451, 'answers': {'answer_start': [60], 'text': ['Our method achieves 82% accuracy with 230 images, outperforming uncertainty sampling (250 images) and random sampling (255 images)']}}, {'id': 511159380, 'answers': {'answer_start': [56], 'text': ['The wavefront error is plotted as a function of time']}}]