In [None]:
import json
import os
import time
from dotenv import load_dotenv
from datasets  import load_dataset
from langchain_community.document_loaders import DirectoryLoader
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset import TestsetGenerator
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#N_GENERATIONS = 5  # Generate only 5 QA pairs for cost and time considerations
#dataset = load_dataset("json", data_files="/mnt/c/Users/User/thesis/data_import/exp1/even_more_filtered_riksdag.json", split="train")
#dataset = dataset.filter(lambda x: x["dok_id"] in ["H90968", "H90982"])
#url = "https://api.openai.com/v1/chat/completions"
#generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
#generator_llm = LangchainLLMWrapper(langchain_llm: BaseLanguageModel[BaseMessage], run_config: Optional[RunConfig] = None, is_finished_parser: Optional[Callable[[LLMResult], bool]] = None, cache: Optional[CacheInterface] = None)
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

path = "/mnt/c/Users/User/thesis/data_import/exp1/testing_riksdag_data/filtered_riksdag_exp1.json"
#loader = DirectoryLoader(path, glob="**/*.md")
#docs = loader.load()

#¤dataset = load_dataset("json", data_files=path, split="train")
#docs = [Document(page_content=doc["anforandetext"], metadata={"source": doc.get("dok_titel", "unknown")})for doc in dataset]

#dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

prompt = ChatPromptTemplate.from_template(
    template="""You are a helpful model tasked with generating questions and answers 
            for an information retrieval test set based on debates in the Swedish parliament (Riksdagen).
            You will formulate questions based on statements made by various politicians in Sweden, including their names and party affiliations.

            The questions must always reference the names of the politicians and the political parties they represent, 
            and should reflect the content of their statements in the debates.

            **Important:**
            - Write all responses exclusively in Swedish.
            - Ensure that the questions always include the name of a politician and/or the party they belong to.
            - The questions must focus on specific issues discussed by the politicians in debates. Do not limit yourself to only one party or politician.
            
            Example questions (do not use specific names or parties in the examples):
            Vad säger en politiker från [party] om [policy/issue]?
            Hur argumenterar [politician] från [party] för [policy/issue]?
            Vad tycker en politiker från [party] om [policy/issue]?
            
            The model should generate questions like these, but always replace [party] with the name of a political party, 
            and [policy/issue] with the actual topic being discussed in the debate, using real politician names and party affiliations."""
)


#input_text="Only write in proper, formal Swedish. Don't hallucinate"

# Initialize model with API key
generator_llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o", openai_api_key=OPENAI_API_KEY)
)
generator = TestsetGenerator(transforms_llm=generator_llm, transforms_embedding_model=generator_embeddings)
#formatted_prompt = prompt.format_prompt(input_text=input_text)

# Generate response using the wrapped model
#response = generator_llm.generate_text(formatted_prompt)


In [53]:

output_path = "/mnt/c/Users/User/thesis/data_import/exp1/generated_testset_2.json"
dataset = load_dataset("json", data_files=path, split="train")
data_list = [doc for doc in dataset]

CHUNK_SIZE = 10  # Process 10 documents at a time

def process_chunk(chunk):
    docs = list()
    for doc in chunk:
        # Format the prompt with the input_text from the document
        formatted_prompt = prompt.format_prompt(input_text=prompt)
        print(formatted_prompt)
        
        # Access the formatted text content from the prompt
        page_content = (
            f"Anförande ID: {doc['anforande_id']}\n"
            f"Talare: {doc['talare']}\n"
            f"Parti: {doc['parti']}\n"
            f"Anförandetext:\n{doc['anforandetext']}")#doc["anforandetext"]  # This is the formatted content
        

        # Print to verify the content before creating the Document
        print(page_content)
        
        # Create the Document with the properly formatted page_content
        docs.append(Document(
            page_content=page_content, 
            metadata={"source": doc.get("dok_id", "unknown")}
        ))

    # Process documents further
    processed_data = generator.generate_with_langchain_docs(docs, testset_size=10)
    return [serialize_object(item) for item in processed_data]


def serialize_object(obj):
    if isinstance(obj, dict):
        return {key: serialize_object(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [serialize_object(item) for item in obj]
    elif hasattr(obj, "__dict__"):
        return serialize_object(obj.__dict__)  # Recursively convert __dict__
    elif isinstance(obj, (str, int, float, bool, type(None))):
        return obj  # Primitive types are JSON serializable
    else:
        return str(obj)  # Fallback for unknown types


with open(output_path, "a", encoding="utf-8") as f:
    for i in range(0, len(data_list), CHUNK_SIZE):
        chunk = data_list[i : i + CHUNK_SIZE]
        processed_docs = process_chunk(chunk)
        
        # Save results as JSON (appending to file)
        for item in processed_docs:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
        
        # Avoid hitting rate limits
        time.sleep(1) 

messages=[HumanMessage(content='You are a helpful model tasked with generating questions and answers \n            for an information retrieval test set based on debates in the Swedish parliament (Riksdagen).\n            You will formulate questions based on statements made by various politicians in Sweden, including their names and party affiliations.\n\n            The questions must always reference the names of the politicians and the political parties they represent, \n            and should reflect the content of their statements in the debates.\n\n            **Important:**\n            - Write all responses exclusively in Swedish.\n            - Ensure that the questions always include the name of a politician and/or the party they belong to.\n            - The questions must focus on specific issues discussed by the politicians in debates. Do not limit yourself to only one party or politician.\n            \n            Example questions (do not use specific names or parties 

Applying HeadlinesExtractor:   0%|          | 0/6 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/10 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/10 [00:00<?, ?it/s]

Property 'summary' already exists in node '561066'. Skipping!
Property 'summary' already exists in node '255bee'. Skipping!
Property 'summary' already exists in node '75aeb4'. Skipping!
unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}


Applying CustomNodeFilter:   0%|          | 0/4 [00:00<?, ?it/s]

unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'c

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/18 [00:00<?, ?it/s]

unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'c

KeyboardInterrupt: 

In [51]:
docs = [
    Document(
        page_content=prompt.format(input_text=doc["anforandetext"]),
        metadata={"source": doc.get("dok_id", "unknown")}
    )
    for doc in chunk
]

for doc in chunk:
    #print(str(prompt)+doc["anforandetext"])
    page_content = doc["anforande_id"]+doc["talare"]+doc["parti"]+doc["anforandetext"]  # This is the formatted content
    print(doc)
    print(page_content)

#processed_data = generator.generate_with_langchain_docs(docs, testset_size=10)

for doc in docs:
    # Access the metadata and page_content for each document
    source = doc.metadata.get('source', 'Unknown')
    content = doc.page_content
    
    # Now you can use the content and metadata in your logic
    print(f"Source: {source}")
    print(f"Content: {content[:100]}...")  # Print the first 100 characters of the content for brevity
    print()  # Print a blank line between documents

{'Unnamed: 0': '360543', 'dok_hangar_id': '5129347', 'dok_id': 'H909103', 'dok_titel': 'Protokoll 2021/22:103 Onsdagen den 27 april', 'dok_rm': '2021/22', 'dok_nummer': '103', 'dok_datum': '2022-04-27 00:00:00', 'avsnittsrubrik': 'Hyresrätt m.m.', 'kammaraktivitet': 'ärendedebatt', 'anforande_id': '380d7667-bb23-ed11-9178-901b0e9b71a8', 'anforande_nummer': '135', 'talare': 'Larry Söder', 'parti': 'KD', 'anforandetext': '<p>Fru talman! I detta betänkande föreslår utskottet två tillkännagivanden till regeringen med anledning av motionsyrkanden om att främja ägarlägenheter och om hyrköp. Vi kristdemokrater står bakom båda dessa.</p><p>En ägarlägenhet är som ett radhus på höjden, där du köper din fastighet och blir lagfaren ägare. Det innebär låga fasta kostnader samt personlig frihet att hyra ut din bostad utan tillstånd eller att sälja till vem du vill. Så skriver till exempel JM på sin hemsida. Jag tror att denna form, tillsammans med ett hyrköpssystem, skulle kunna öka mångfalden på bo

In [7]:
output_path = "/mnt/c/Users/User/thesis/data_import/exp1/generated_testset_2.json"
dataset = dataset.to_list()  # Ensure Testset has this method

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)

print(f"Dataset saved to {output_path}")

Dataset saved to /mnt/c/Users/User/thesis/data_import/exp1/generated_testset_2.json
