In [None]:
import json
from typing import Iterable
from  langchain.schema import Document

#https://github.com/langchain-ai/langchain/issues/3016

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Load ``Document`` objects from a JSON Lines file.

    Every non-blank line must contain one JSON object; its keys are passed as
    keyword arguments to ``Document``.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one ``Document`` per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (assumes the file was written as UTF-8/ASCII JSON -- TODO confirm
    # against the writer in the preprocessing notebook).
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # A trailing newline or an empty separator line is not a record.
            if not line.strip():
                continue
            documents.append(Document(**json.loads(line)))
    return documents

# Documents that serve as generation contexts further down in the notebook.
_chunks = load_docs_from_jsonl(file_path="../1_preproc/chunks.jsonl")

In [None]:
# German instruction template used to generate question/answer pairs from one document.
# `{context}` is the only placeholder and is filled with str.format(); the template asks
# for replies laid out as "FRAGE: ..." / "ANTWORT: ..." after the "Output:::" marker.
prompt = """Du hast ein Dokument aus einem Softwarehandbuch als Kontext. Auf Basis dieses Dokuments sollst du eine oder mehrere typische Benutzerfragen und die passenden Antworten erstellen. 
Die Fragen sollten Benutzerprobleme widerspiegeln, wie z.B. "Wo finde ich eine bestimmte Einstellung?" oder "Wie kann ich eine bestimmte Funktion erstellen?". 

Bitte beachte die folgenden Punkte:
1. Verwende ausschließlich Informationen aus dem gegebenen Kontext, um die Fragen und Antworten zu formulieren.
2. Die Fragen und Antworten sollen klar, präzise und auf Deutsch verfasst sein.
3. Stelle sicher, dass jede Frage eine direkte Antwort im Kontext hat.

Gib deine Antwort im folgenden Format:

Output:::
FRAGE: (Deine Frage)
ANTWORT: (Deine Antwort)

Hier ist nun der Kontext:

Kontext: {context}
Output:::"""

In [None]:
from llama_cpp import Llama

path = "./models/Mistral-Nemo-Instruct-2407-Q4_K_M.gguf"

# Load the GGUF model file through llama-cpp.
model = Llama(
    # Location of the GGUF file.
    model_path=path,
    # Maximum sequence length; longer sequences require much more resources.
    n_ctx=10000,
    # Number of layers offloaded to the GPU; 0 when no GPU acceleration is available.
    n_gpu_layers=0,
)

In [None]:
import re

docs = _chunks
print("Generating QA couples...")

# One match per "FRAGE: ... ANTWORT: ..." pair; an answer extends up to the next
# "FRAGE:" or the end of the text. Compiled once because it is loop-invariant.
QA_PATTERN = re.compile(r"FRAGE:\s*(.*?)\s*ANTWORT:\s*(.*?)(?=\s*FRAGE:|\Z)", re.DOTALL)

outputs = []
for sampled_context in docs:
    # Build the model input in its own variables. The result must never be assigned
    # back to `prompt`: that would overwrite the template, and every later chunk
    # would be formatted from the first chunk's already-filled prompt.
    user_prompt = prompt.format(context=sampled_context.page_content)
    instruct_prompt = f"""[INST] {user_prompt} [/INST]"""

    # echo is off so the parser below only ever sees generated text, never the
    # template's own "FRAGE: (Deine Frage)" placeholder lines.
    token_stream = model(
        instruct_prompt,
        max_tokens=500,
        stop=["[INST]", "[/INST]"],
        echo=False,
        stream=True,
    )
    # Concatenate the streamed text pieces into the complete reply.
    output_QA_couple = "".join(chunk["choices"][0]["text"] for chunk in token_stream)
    print(output_QA_couple)

    try:
        # re.finditer yields every question/answer pair found in the reply.
        for match in QA_PATTERN.finditer(output_QA_couple):
            frage = match.group(1).strip()
            antwort = match.group(2).strip()
            # An empty question or answer is useless as training data; skip it.
            if not frage or not antwort:
                continue
            created_qa = {
                "context": sampled_context.page_content,
                "question": frage,
                "answer": antwort,
                "meta": sampled_context.metadata,
            }
            outputs.append(created_qa)
            print(created_qa)
    except Exception as exc:
        # Best effort: report the problem and move on to the next chunk, but let
        # KeyboardInterrupt/SystemExit propagate so the run can still be aborted.
        print(f"Error: {exc!r}")
        continue

In [None]:
import json

# Serialise all generated QA records first; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
json_object = json.dumps(outputs, indent=4, ensure_ascii=False)

# Write the serialised text to disk as UTF-8.
with open("./RAGDataset.json", mode="w", encoding="utf-8") as dataset_file:
    dataset_file.write(json_object)

#### Grouping questions for context
In order to reduce the size of our dataset, we group the questions by context.\
Our *RAGGroupedDataset* contains one entry per context chunk, holding all questions generated for that chunk; only the first question of each chunk is used afterwards.\
We then split our data into a train and a test set.

In [None]:
import json

# Load the flat list of QA records from the original JSON file
with open('./RAGDataset.json', 'r', encoding="utf-8") as file:
    data = json.load(file)

# Maps each context string to the list of questions generated for it.
# A plain dict keeps insertion order, so contexts stay in first-seen order.
grouped_data = {}
for item in data:
    grouped_data.setdefault(item['context'], []).append(item['question'])

# Convert the mapping to a list of dictionaries for saving
result = [{'context': context, 'questions': questions} for context, questions in grouped_data.items()]

# ensure_ascii=False matches how RAGDataset.json is written and keeps non-ASCII
# characters as-is instead of inflating the file with \u escapes.
with open('RAGGroupedDataset.json', 'w', encoding="utf-8") as file:
    json.dump(result, file, ensure_ascii=False, indent=4)

In [None]:
import json
from datasets import Dataset


# Load the grouped dataset: one record per context with its list of questions
file_path = "RAGGroupedDataset.json"  # Replace with your JSON file path
with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Build (anchor, positive) pairs: the first question of a context is the anchor,
# the context itself is the positive passage. Every record is expected to carry
# at least one question.
processed_data = [
    {"anchor": item["questions"][0], "positive": item["context"]}
    for item in data
]

# No negative samples are created here; the final data is just the positive pairs.
final_data = processed_data

# Convert data to a Dataset object
dataset = Dataset.from_list(final_data)

# Attach a running integer id to every row (assigned before the split)
dataset = dataset.add_column("id", list(range(len(dataset))))

# Fixed seed so the shuffled 80/20 split is the same on every run
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_path = "./RAG_train_dataset.json"
test_path = "./RAG_test_dataset.json"

# save datasets to disk
# NOTE(review): with orient="records" the datasets library presumably writes
# JSON Lines (one object per line) despite the .json extension -- confirm.
dataset["train"].to_json(train_path, orient="records")
dataset["test"].to_json(test_path, orient="records")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

219194