<a href="https://colab.research.google.com/github/GeraudBourdin/llm-scripts/blob/main/bonito_data_instruction_generartion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Requires an A100 GPU runtime (the run used about 34 GB of GPU memory).

## Step 1 - Install the dependencies

In [30]:
import locale
# Force UTF-8 as the reported preferred encoding for the rest of the session
# (presumably the usual Colab workaround for shell commands failing under a
# non-UTF-8 locale -- TODO confirm it is still needed).
# The replacement keeps the stdlib signature: callers that pass `do_setlocale`
# (e.g. `locale.getpreferredencoding(False)`) must not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Install Bonito from its GitHub repository in editable mode.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -e git+https://github.com/BatsResearch/bonito#egg=bonito

In [None]:
# Hugging Face libraries: `datasets` to build the Dataset, `huggingface_hub` for login and repo creation.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install datasets huggingface_hub

In [None]:
# PDF text extraction (PyMuPDF, imported as `fitz`) and sentence splitting (spaCy).
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install pymupdf spacy

## Step 2: Processing the PDF document

Extract the text from the PDF, split it into sentences, and wrap the sentences in a dataset.

### 2.1 Extract text

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (`fitz`).

    Returns:
        str: The page texts joined without any separator. An empty string
        when the document has no pages or no page yields any text.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in doc)

# Relative path: the PDF has to be present in the notebook's working directory.
pdf_path = 'Catalogue-Hyundai-KONA.pdf'
text = extract_text_from_pdf(pdf_path)
# Report the size and show a short preview instead of dumping the whole
# document into the cell output.
print(f"Extracted {len(text):,} characters from {pdf_path}")
print(text[:1000])

### 2.2 Split Text into Sentences

In [None]:
import spacy

# Use the GPU when one is usable and fall back to CPU otherwise.
# `require_gpu()` raises when no GPU is accessible, which would abort a step
# that can run on CPU.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")  # English pipeline (tokenizer, tagger, parser, NER); used below for sentence splitting

def split_into_sentences(text):
    """Split text into sentences using the module-level spaCy pipeline `nlp`.

    Args:
        text: The raw text to segment.

    Returns:
        list[str]: Sentences in document order, each stripped of leading and
        trailing whitespace. Segments that are empty after stripping are
        dropped so they do not become blank rows downstream.
    """
    doc = nlp(text)
    # The debug print of the whole parsed document was removed: it flooded the
    # cell output with the full PDF text.
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

# Segment the full extracted PDF text; `sentences` is what the dataset is built from later.
sentences = split_into_sentences(text)


In [None]:
# Report the count and preview the first few sentences rather than printing
# the whole list into the cell output.
print(f"{len(sentences)} sentences")
print(sentences[:10])

In [None]:
# Spot-check a single sentence. The index is clamped so that documents with
# fewer than 51 sentences do not raise IndexError.
sample_index = min(50, len(sentences) - 1)
print(sentences[sample_index] if sentences else "(no sentences extracted)")

### 2.3 Create a Transformers Dataset
You'll now transform the sentences into a format suitable for the Hugging Face datasets library.

In [None]:
from datasets import Dataset

# Build a single-column dataset: one row per sentence, stored under the
# "sentence" key (`sentences` is expected to be a list of strings).
data = dict(sentence=sentences)
dataset = Dataset.from_dict(data)

print(dataset)


## Step 3: Generate a synthetic dataset using Bonito

In [None]:
from bonito import Bonito, SamplingParams
from datasets import load_dataset

# Create the Bonito model from its model identifier.
bonito_model_id = "BatsResearch/bonito-v1"
bonito = Bonito(bonito_model_id)



In [None]:
# Generate a synthetic instruction-tuning dataset from the unannotated
# sentences in `dataset`.
#
# Supported task types, listed as "full name (short form)":
#   - extractive question answering (exqa)
#   - multiple-choice question answering (mcqa)
#   - question generation (qg)
#   - question answering without choices (qa)
#   - yes-no question answering (ynqa)
#   - coreference resolution (coref)
#   - paraphrase generation (paraphrase)
#   - paraphrase identification (paraphrase_id)
#   - sentence completion (sent_comp)
#   - sentiment (sentiment)
#   - summarization (summarization)
#   - text generation (text_gen)
#   - topic classification (topic_class)
#   - word sense disambiguation (wsd)
#   - textual entailment (te)
#   - natural language inference (nli)

sampling_params = SamplingParams(
    n=1,
    temperature=0.5,
    top_p=0.95,
    max_tokens=256,
)
synthetic_dataset = bonito.generate_tasks(
    dataset,
    sampling_params=sampling_params,
    task_type="qg",  # short form from the list above
    context_col="sentence",  # column of `dataset` that holds the input text
)

In [None]:
# Inspect the object returned by bonito.generate_tasks.
print(synthetic_dataset)

In [None]:
import pandas as pd

df = pd.DataFrame(synthetic_dataset)

# A bare last expression gets the notebook's rich table rendering; print()
# would produce a plain-text dump instead.
df.head(50)  # Adjust the number inside head() to see more or fewer rows


## Step 4: Saving the generated dataset

### 4.1 Authenticate with Hugging Face

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub for this session, ahead of the repo-creation and push steps below.
notebook_login()

### 4.2 Push the dataset to the Hub

In [16]:
from huggingface_hub import create_repo
from huggingface_hub import Repository

repo_name = "test_bonito"  # Choose a name for your dataset repository
# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# once the repository already exists.
repo_url = create_repo(repo_name, repo_type="dataset", exist_ok=True)
print("Repository URL:", repo_url)






In [None]:
# Push to the repository created by create_repo above. Its returned id
# ("<namespace>/<repo_name>") replaces a hard-coded user namespace, so the
# notebook works for whichever account is logged in.
synthetic_dataset.push_to_hub(repo_url.repo_id)