In [1]:
! pip install datasets==2.19.0

Collecting datasets==2.19.0
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets==2.19.0)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==2.19.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.19.0)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.0)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.19.0)
  Downloading multiprocess-0.70.16-py311-none-any.wh

In [2]:
import random
def convert_dataset(dataset, feature_context, feature_question, source):
    """
    Converts a dataset into a structured format suitable for training/testing.

    Args:
        dataset (Dataset): The input dataset.
        feature_context (str): The key corresponding to the context in the dataset.
        feature_question (str): The key corresponding to the question in the dataset.
        source (str): The dataset source identifier.

    Returns:
        List[dict]: A list of structured samples with positive and negative examples.
    """
    dataset_samples=[]

    for data_index in range(dataset.num_rows):
        df = dataset[data_index]

        # Check if the question field contains multiple questions
        if isinstance(df[feature_question], list):
            questions_list = list(set(df[feature_question]))  # Remove duplicate questions
            randon_num = random.sample(range(dataset.num_rows), len(questions_list)+1)

            # Ensure the current index is not in the negative sample set
            if data_index in randon_num:
                randon_num.remove(data_index)
            else:
                randon_num = randon_num[:-1]

            # Create positive examples
            for ques_index in range(len(questions_list)):
                question = questions_list[ques_index]
                inp_example = {
                    'texts': [df[feature_context], question],
                    'label': 1,   # Positive sample
                    'source': source
                }
                dataset_samples.append(inp_example)

            # Create negative examples
            for neg_data_index in randon_num:
                neg_questions = dataset[neg_data_index][feature_question]
                neg_example = {
                    'texts': [df[feature_context], random.choice(neg_questions)],
                    'label': 0,   # Negative sample
                    'source': source
                }
                dataset_samples.append(neg_example)

        else:
            # Handle case when there's a single question in the dataset
            inp_example = {
                'texts': [df[feature_context], df[feature_question]],
                'label': 1,  # Positive sample
                'source':source
            }
            dataset_samples.append(inp_example)

            # Generate negative samples
            randon_num = random.sample(range(dataset.num_rows), 2)
            if data_index in randon_num:
                randon_num.remove(data_index)
            else:
                randon_num = randon_num[:-1]

            neg_example = {
                'texts': [df[feature_context], dataset[randon_num][feature_question]],
                'label': 0,  # Negative sample
                'source': source
            }
            dataset_samples.append(neg_example)

    return dataset_samples

In [3]:
def convert_dataset_stsd(dataset):
    """
    Converts the STS-D dataset into a structured format.

    Args:
        dataset (Dataset): The input dataset containing sentence pairs and similarity scores.

    Returns:
        List[dict]: A list of structured samples with normalized similarity scores.
    """
    dataset_samples=[]
    for df in dataset:
        score = float(df['similarity_score'])/5.0  # Normalize score to range 0 ... 1
        inp_example = {
            'texts': [df['sentence1'], df['sentence2']],
            'label': score,  # Continuous score instead of binary labels
            'source': 'stsd-fr'  # Source identifier
        }
        dataset_samples.append(inp_example)
    return dataset_samples


In [15]:
# Import required libraries
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from sentence_transformers.readers import InputExample

### Load and preprocess the Pandora-s dataset
print("Loading the Pandora-s dataset")
dataset_pandora_train_raw = load_dataset('pandora-s/neural-bridge-rag-dataset-12000-google-translated', split='train')
dataset_pandora_train = dataset_pandora_train_raw.filter(lambda example: example['language'] == 'fr')
dataset_pandora_test_raw = load_dataset('pandora-s/neural-bridge-rag-dataset-12000-google-translated', split='test')
dataset_pandora_test = dataset_pandora_test_raw.filter(lambda example: example['language'] == 'fr')

# Convert the dataset into the required format
# features: ['language', 'context', 'question', 'answer']
data_pandora_train = convert_dataset(dataset_pandora_train, 'context', 'question', 'pandora')
data_pandora_test = convert_dataset(dataset_pandora_test, 'context', 'question', 'pandora')

### Load and preprocess the FQuAD dataset
print("Loading the fquad dataset")
dataset_fquad_train = load_dataset('fquad', data_dir='./download-form-fquad1.0', split='train')
dataset_fquad_valid = load_dataset('fquad', data_dir='./download-form-fquad1.0', split='validation')

# Convert the dataset into the required format
# features: ['context', 'questions', 'answers']
data_fquad_train = convert_dataset(dataset_fquad_train, 'context', 'questions', 'fquad')
data_fquad_test = convert_dataset(dataset_fquad_valid, 'context', 'questions', 'fquad')

### Load and preprocess the PIAF dataset
print("Loading the piaf dataset")
piaf_dataset_init = pd.read_json('./piaf/piaf-v1.2.json')
piaf_dataset_entries = []
for item in piaf_dataset_init["data"]:
    title = item["title"]
    for paragraph in item["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            entry = {
                "id": qa["id"],
                "title": title,
                "context": context,
                "question": qa["question"],
                "answers": {
                    "text": [ans["text"] for ans in qa["answers"]],
                    "answer_start": [ans["answer_start"] for ans in qa["answers"]]
                }
            }
            piaf_dataset_entries.append(entry)
piaf_df = pd.DataFrame(piaf_dataset_entries)

# Convert the dataset into the required format
# features: ['id', 'title', 'context', 'question', 'answers']
piaf_dataset = Dataset.from_pandas(piaf_df).train_test_split(test_size=0.3, seed=42)
piaf_dataset_dict = DatasetDict({
    "train": piaf_dataset["train"],
    "test": piaf_dataset["test"]
})
data_piaf_train = convert_dataset(piaf_dataset_dict["train"], 'context', 'question', 'piaf')
data_piaf_test = convert_dataset(piaf_dataset_dict["test"], 'context', 'question', 'piaf')

### Load and preprocess the squad-french dataset
print("Loading the squad-french dataset")
dataset_squad_fr_train = load_dataset('LeviatanAIResearch/squad-french-context-question', split='train')
dataset_squad_fr_valid = load_dataset('LeviatanAIResearch/squad-french-context-question', split='test')

# Convert the dataset into the required format
# features: ['context', 'question']
data_squad_train = convert_dataset(dataset_squad_fr_train, 'context', 'question', 'squad-fr')
data_squad_valid = convert_dataset(dataset_squad_fr_valid, 'context', 'question', 'squad-fr')

### Load and preprocess the stsd-french dataset
print("Loading the stsd-french dataset")
dataset_stsd_train = load_dataset("stsb_multi_mt", name="fr", split="train")
dataset_stsd_dev = load_dataset("stsb_multi_mt", name="fr", split="dev")
dataset_stsd_test = load_dataset("stsb_multi_mt", name="fr", split="test")
# Convert the dataset into the required format
data_stsd_train = convert_dataset_stsd(dataset_stsd_train)
data_stsd_dev = convert_dataset_stsd(dataset_stsd_dev)
data_stsd_test = convert_dataset_stsd(dataset_stsd_train)

Loading the Pandora-s dataset
Loading the fquad dataset


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading the piaf dataset
Loading the squad-french dataset


Downloading data:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84943 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10067 [00:00<?, ? examples/s]

Loading the stsd-french dataset


Downloading readme:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/542k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/123k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [22]:
# Combine all datasets into training and testing sets
data_train = data_pandora_train + data_fquad_train + data_piaf_train + data_squad_train + data_stsd_train
data_test = data_pandora_test + data_fquad_test + data_piaf_test + data_squad_valid + data_stsd_test + data_stsd_dev

# Display dataset sizes
print('len train set', len(data_train))
print('len test set', len(data_test))

len train set 248001
len test set 44093


In [24]:
from tqdm import tqdm

def create_csv(dataset):
    """
    Converts the dataset into a Pandas DataFrame for CSV storage.

    Args:
        dataset (List[dict]): The input dataset.

    Returns:
        pd.DataFrame: A DataFrame containing the dataset.
    """
    df = pd.DataFrame(columns=['text1', 'text2', 'label', 'source'])
    for d in tqdm(dataset):
        texts = d['texts']
        new_df = pd.DataFrame(
            {'text1': texts[0], 'text2': texts[1], 'label': d['label'], 'source': d['source']},
            index=[0]
        )
        df = pd.concat([df, new_df], ignore_index=True)
    return df

In [25]:
# Convert datasets to DataFrames
df_train = create_csv(data_train)
df_test = create_csv(data_test)

# Create Hugging Face dataset dictionary
datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
    })

100%|██████████| 248001/248001 [45:10<00:00, 91.49it/s]
100%|██████████| 44093/44093 [01:30<00:00, 485.66it/s]


In [None]:
from huggingface_hub import login

# Authenticate using your Hugging Face token
# Replace 'hf_your_token' with your actual token
login('hf_your_token')

# Push the processed dataset to Hugging Face Hub
# Replace 'your_username/your_dataset_name' with your Hugging Face dataset repository name
datasets_train_test.push_to_hub('your_username/your_dataset_name')