In [9]:
import torch 
import chromadb_handler as CH
import pandas as pd

In [10]:
import PyPDF2
import json
from pathlib import Path
import os

# turn true if rerun of data
cast_data = False

if cast_data:
    directory_str = "data/DUUIDataset/training"
    directory = os.fsencode(directory_str)
    counter = 1    
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        print(filename)
        if filename.endswith(".pdf") : 
            PDF_PATH = directory_str + "/" + filename
            CHUNK_SIZE = 300  # Anzahl Zeichen, ≈ 500–700 Tokens

            # -----------------------------
            # 1. PDF Laden
            # -----------------------------
            pdf_reader = PyPDF2.PdfReader(PDF_PATH)

            # Metadaten aus PDF
            meta = pdf_reader.metadata

            title = meta.title if meta and meta.title else "Unknown Title"
            authors = meta.author.split(",") if meta and meta.author else []

            # -----------------------------
            # 2. Seiteninhalt extrahieren
            # -----------------------------
            pages = []
            for i, page in enumerate(pdf_reader.pages):
                text = page.extract_text()
                pages.append({"page": i+1, "text": text})

            # -----------------------------
            # 3. Chunks bauen
            # -----------------------------
            def chunk_text(text, size=CHUNK_SIZE):
                chunks = []
                for i in range(0, len(text), size):
                    chunks.append(text[i:i+size])
                return chunks

            dataset = []
            doc_id = Path(PDF_PATH).stem

            for page in pages:
                text = page["text"]
                if not text:
                    continue

                chs = chunk_text(text)

                for idx, chunk in enumerate(chs):
                    entry = {
                        "id": doc_id,
                        "chunk_id": f"{doc_id}_p{page['page']}_c{idx}",
                        "source": PDF_PATH,
                        "title": title,
                        "authors": authors,
                        "publication_year": "None",
                        "page_start": page["page"],
                        "page_end": page["page"],
                        "text": chunk,
                        "embedding": None
                    }
                    dataset.append(entry)

            # -----------------------------
            # 4. JSON exportieren
            # -----------------------------
            with open(f"rag_dataset_{counter}.jsonl", "w", encoding="utf-8") as f:
                for item in dataset:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            counter += 1
            print("FERTIG! Datei rag_dataset.jsonl generiert.")



            continue
        else:
            continue

In [11]:
import chromadb

ch = CH.chromaDBWrapper()
client = chromadb.PersistentClient("chroma")
client.get_or_create_collection(name="DUUI_300")
collection = client.get_collection("DUUI_300")


In [12]:
# Insert JSONL data into the ChromaDB

import PyPDF2
import json
from pathlib import Path
import os

# turn true if rerun of data
cast_data_DUUI = False

if cast_data_DUUI:
    embedding_structure = []
    ids_fl = []  
    metadatas_fl = []
    documents_fl = []
    uris_fl = []
    if cast_data:
        directory_str = "data/DUUIDataset/training"
        directory = os.fsencode(directory_str)
        counter = 1  
        
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            str_filename = directory_str + "/" + filename
            print(str_filename)
            if filename.endswith(".jsonl") : 
                with open(str_filename) as f:
                    data = [json.loads(line) for line in f]
                    for line in data:
                        #ids
                        ids_fl.append(line["chunk_id"])
                        # metadeta
                        if isinstance(authors, list):
                            authors = ", ".join(authors)
                        metadeta_dict = {"title": line["title"], 
                                        "authors": authors, 
                                        "publication_year": line["publication_year"], 
                                        "page_start": line["page_start"], 
                                        "page_end": line["page_end"]}
                        metadatas_fl.append(metadeta_dict)
                        # Docuemnts / text
                        documents_fl.append(line["text"])
                        # Uris
                        uris_fl.append(line["source"])
                #cast JSON arugment into proper Form


                continue
            else:
                continue
    #add metadatas
    collection.add(ids=ids_fl, metadatas=metadatas_fl, documents=documents_fl)

In [13]:
"""
df = pd.read_csv("data/BBCNews/bbc_news.csv")
df["parsed_date"] = df["pubDate"].apply(
    lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S %Z")
)
# Extract the year
df["year"] = df["parsed_date"].dt.year

# Filter dataset for everything >= 2023
filtered_df = df[df["year"] >= 2023]
smapled_df = filtered_df.sample(n=5000, random_state=42)
"""

'\ndf = pd.read_csv("data/BBCNews/bbc_news.csv")\ndf["parsed_date"] = df["pubDate"].apply(\n    lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S %Z")\n)\n# Extract the year\ndf["year"] = df["parsed_date"].dt.year\n\n# Filter dataset for everything >= 2023\nfiltered_df = df[df["year"] >= 2023]\nsmapled_df = filtered_df.sample(n=5000, random_state=42)\n'

In [14]:
import pandas as pd
from urllib.parse import urlparse
import hashlib
from datetime import datetime

cast_data_bbc = False
if cast_data_bbc:
    ids_b = []
    metadatas_b = []
    docuemnts_b = []
    unique_set = set()
    for row in smapled_df.itertuples(index=False):
        # Create a short hash from full URL
        # form year_shortURl_Hash of whole URL
        # create unique ID
        domain = urlparse(row.link).netloc.replace(".", "_")
        hash_part = hashlib.md5((row.link+row.title).encode("utf-8")).hexdigest()[:16]
        date = row.pubDate
        id = hash_part+"_"+str(row.year)
        # text/documents
        document = row.description
        #metadata
        metadeta_dict = {"title": row.title, 
                        "authors": authors, 
                                        "publication_date": row.pubDate, 
                                        "link" : row.link,
                                        "guide": row.guid}

        # check if id (hash of the )
        if id not in unique_set:
            unique_set.add(id)
            # if not in unique set
            metadatas_b.append(metadeta_dict)
            docuemnts_b.append(row.description)
            ids_b.append(id)

    client.delete_collection(name="bbc_news")
    collection_b = client.get_or_create_collection(name="bbc_news")
    collection_b.add(ids=ids_b, metadatas=metadatas_b, documents=docuemnts_b)

In [15]:
if False:    
# Filter data for Stocknews
    df = pd.read_csv("data/stockNews/nasdaq_news.csv", nrows=20000)
    df["parsed_date"] = df["Date"].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S %Z")
    )
    #2023-12-11 00:00:00 UTC
    #Sun, 06 Mar 2022 22:49:58 GMT
    # Extract the year
    df["year"] = df["parsed_date"].dt.year

    # Filter dataset for everything >= 2023
    filtered_df = df[df["year"] >= 2023]
    smapled_df = filtered_df.sample(n=5000, random_state=42)
    len(smapled_df)
    smapled_df.to_csv("filtered_nasdaq_news.csv", index=False)

In [None]:
# Process data for Nasdaq News
import tiktoken
import pandas as pd
import hashlib

tokenizer = tiktoken.get_encoding("cl100k_base")

smapled_df = pd.read_csv("filtered_nasdaq_news.csv")

def chunk_text_tiktoken(text, max_tokens=400):
    tokens = tokenizer.encode(text)
    chunks = []
    
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]
        chunks.append(tokenizer.decode(chunk))
        
    return chunks

batch_size = 5400
token_length = 400
ids_sto = []
metadatas_sto = []
documents_sto = []
unique_set_sto = set()
cast_nasdaq_data = True
if cast_nasdaq_data:
    
    for row in smapled_df.itertuples(index=False):
        if len(ids_sto) > 5500:
            continue
        # Create a short hash from full URL
        # form year_shortURl_Hash of whole URL
        # create unique ID
        chunks_row = chunk_text_tiktoken(row.Article)
        for i, chunk in enumerate(chunks_row, 1):
            hash_part = hashlib.md5((row.Article).encode("utf-8")).hexdigest()[:16]
            date = row.Date
            id = hash_part+"_"+str(row.year)+"_"+str(i)
            # text/documents
            document = chunk
            #metadata
            metadeta_dict = {"title": row.Article_title, 
                                "stock": row.Stock_symbol,
                                "publication_date": row.year,
                                "part_of_article": i
                            }
            # check if id (hash of the )

            if id not in unique_set_sto:
                unique_set_sto.add(id)
                # if not in unique set
                metadatas_sto.append(metadeta_dict)
                documents_sto.append(document)
                ids_sto.append(id)
        
    collection_b = client.get_or_create_collection(name="nasdaq_news_chunked_new")
    collection_b.add(ids=ids_sto[:batch_size], metadatas=metadatas_sto[:batch_size], documents=documents_sto[:batch_size])

In [128]:
from pydantic import BaseModel
from openai import OpenAI
import json
import importlib
import call_llm
importlib.reload(call_llm) # Beibehalten, falls erforderlich
import pandas as pd # Hinzugefügt, da Sie es im Originalcode referenzieren
import os
import csv # NEU: Importiere das CSV-Modul

from dotenv import load_dotenv
# loading variables from .env file
load_dotenv() # pass a path if it's not a .env in the current working directory 

# Setup:
LLM_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
df = pd.read_csv("filtered_nasdaq_news.csv") # DataFrame wird geladen
 # Angenommen, das ist der DataFrame, den Sie verwenden

counter = 0
test = 0
context = ""

# Öffnen der Datei mit dem 'csv'-Modul
with open("QA_nasdaq.csv", "w", newline="", encoding="utf-8") as output_file:
    # NEU: Initialisiere den CSV-Writer
    csv_writer = csv.writer(output_file, delimiter=',')
    
    # Schreibe die Kopfzeile
    csv_writer.writerow(["question", "answer"]) 
    
    for row in df.itertuples(index=False):
        if test > 10:
             break # Verwende 'break' anstelle von 'continue', um die Schleife zu beenden
        
        context += row.Article
        counter += 1
        
        if counter == 5:
              # Annahme: call_llm.gen_QA_nasdaq gibt den JSON-String korrekt zurück
              llm_answer = call_llm.gen_QA_nasdaq(model=LLM_client, context=context).output_text
              context = ""
              counter = 0
              test += 1
              
              try:
                  llm_answer_json = json.loads(llm_answer)
                  question = llm_answer_json.get("question", "")
                  answer = llm_answer_json.get("answer", "")
                  
                  print(llm_answer_json)
                  
                  # NEU: Schreibe die Zeile mit dem CSV-Writer
                  # Das CSV-Modul kümmert sich um die korrekte Formatierung (Anführungszeichen, Trennzeichen)
                  csv_writer.writerow([question, answer])
                  print("row added"+ str(test))
              except json.JSONDecodeError as e:
                  print(f"Fehler beim Dekodieren von JSON: {e}. Antwort: {llm_answer}")
              except Exception as e:
                  print(f"Ein unerwarteter Fehler ist aufgetreten: {e}")

# Die Datei wird automatisch geschlossen, da 'with open' verwendet wird
print("CSV-Erstellung abgeschlossen.")

{'question': "Based on the provided information, what were CSX's cash and cash equivalents at the end of 2022, how did this figure relate to its current debt, and what conclusion does the text draw from this comparison?", 'answer': "CSX's cash and cash equivalents were $2,087 million at the end of 2022, while its current debt was $151 million, implying that the company has sufficient cash to meet its current debt obligations."}
row added1
{'question': "According to the report, what EU rule does Apple's anti-steering obligation allegedly breach, and which company filed the 2019 complaint that sparked the case?", 'answer': 'EU rules against unfair trading conditions; Spotify.'}
row added2
{'question': 'What two changes does the SAVE repayment plan introduce for undergraduate loans, as described by Matt Frankel, regarding discretionary income payments and the poverty-line threshold?', 'answer': 'Discretionary payments are reduced from 10% to 5% of discretionary income, and the threshold f

In [129]:
pd.read_csv("QA_nasdaq.csv")

Unnamed: 0,question,answer
0,"Based on the provided information, what were C...","CSX's cash and cash equivalents were $2,087 mi..."
1,"According to the report, what EU rule does App...",EU rules against unfair trading conditions; Sp...
2,What two changes does the SAVE repayment plan ...,Discretionary payments are reduced from 10% to...
3,What position will Natascha Viljoen assume aft...,Chief operating officer at Newmont Corporation.
4,"According to the report, what factors contribu...",It was the first day of seasonally slow August...
5,What was Apple's revenue in the second quarter...,Revenue was $94.8 billion in the second quarte...
6,"According to the article, which five ETFs domi...",Invesco QQQ Trust (QQQ); iShares Core U.S. Agg...
7,Among the three dividend stocks highlighted—Mi...,Starbucks.
8,According to ETF Channel's analysis of PBUS's ...,Implied target price: $46.02 per unit; Upside:...
9,Which real estate investment trust is describe...,Realty Income (NYSE:O)


In [None]:
#import wikipedia
from transformers import BertTokenizer, BertModel, AutoTokenizer
import torch
import cBert.wrapper_CBert as cBert
import weighting_prompt as wp
import chromadb_handler as ch
import call_llm
import json
import pandas as pd
import chromadb
from openai import OpenAI
from dotenv import load_dotenv
import os
import rag_data
importlib.reload(call_llm)
importlib.reload(rag_data)
importlib.reload(ch)


if __name__ == "__main__":
   # init model
   MODEL_NAME_2 = "gpt-5-nano-2025-08-07"
   MODEL_NAME = "gpt-3.5-turbo-0125"
   load_dotenv()
   LLM_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
   # 1. Stelle eine Frage 
   
   # Open output file
   # write header: Question, Ideal_answer, reponse_1{metadaten}, response_2{metadaten}, comparission{metadaten}

   client = chromadb.PersistentClient("src/chroma")
   collection = client.get_or_create_collection("nasdaq_news_chunked")

   data_file = open("data_compare.csv", "w", encoding="utf-8")
   QA_nasdaq = pd.read_csv("QA_nasdaq.csv", sep=",")
   results_list = []

   # init chromadbHandler


   for row in QA_nasdaq.itertuples(index=False):
      question = row.question
      ideal_answer = row.answer
      
      # Method 1: topics_formatted, query_summary_results, answer_llm_chunked_rag
      response_categories = call_llm.call_llm_categories(row.question, LLM_client)
      response_categories_json = json.loads(response_categories.output_text)

      topics_formatted = [response_categories_json["subQuerie1"], response_categories_json["subQuerie2"], response_categories_json["subQuerie3"]]
      query_summary_results = collection.query(query_texts=topics_formatted, n_results=3)
      query_summary_results_formatted = query_summary_results["documents"][0]
      answer_llm_Chunked_Rag = call_llm.call_llm_answer_RAG(query_results=query_summary_results_formatted, user_prompt=question, model=LLM_client)

      # Method 2: Question, query_regular_rag,  answer_llm_wRag
      query_regular_rag = collection.query(query_texts=question, n_results=3)
      context_query = query_regular_rag["documents"][0]
      asnwer_llm_wRAG = call_llm.call_llm_answer_RAG(query_results=context_query, user_prompt=question, model=LLM_client)

      # Comparisson
      response_compare = call_llm.call_llm_compare_answers(prompt=question,
                                                         ideal_answer=ideal_answer,
                                                         answer_destilled=asnwer_llm_wRAG.output_text, 
                                                         answer_regular=answer_llm_Chunked_Rag.output_text,
                                                         model=LLM_client)
      # create json with data class
      ragData = rag_data.ragData(question=question, ideal_answer=ideal_answer, 
                           comparisson=response_compare.output_text)
      formatted_response_compare = json.loads(response_compare.output_text)

      row_dict = {"question": question,
                  "answer": answer,
                  "score_1": formatted_response_compare["answer1_score"],
                  "score_2": formatted_response_compare["answer2_score"]
                  }
      results_list.append(row_dict)
      
      break

df_results = pd.DataFrame(results_list)
df_results.to_csv(
        "output_v2.csv", 
        sep=';', 
        index=False, 
        encoding='utf-8'
        )
output_file.close

UnboundLocalError: cannot access local variable 'pipeline' where it is not associated with a value

In [None]:
data_rag = pd.read_csv("output_v2.csv", sep=";")
print(data_rag.loc[:, 'score_1'].mean())
print(data_rag.loc[:, 'score_2'].mean())


0.3181818181818182
0.13636363636363635


In [160]:
import chromadb_handler
importlib.reload(chromadb_handler)

print(client.count_collections)
print(collection.query(query_texts="test"))

ch_handler = ch.ChromaDBHandler(collection=collection)

print(query_summary_results_formatted)
print(context_query)
res, scor = ch_handler.return_k_query_results("does the fish purr like a cat?")
print(res, scor)

<bound method Client.count_collections of <chromadb.api.client.Client object at 0x7462ad261610>>
{'ids': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[]], 'distances': [[]]}


  avg_doc_len = np.array([len(doc_ids) for doc_ids in corpus_token_ids]).mean()
  ret = ret.dtype.type(ret / rcount)
                                        

ValueError: max() iterable argument is empty

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.15117524564266205,
  'token': 2943,
  'token_str': 'energy',
  'sequence': 'plants create energy through a process known as photosynthesis.'},
 {'score': 0.1459401696920395,
  'token': 4870,
  'token_str': 'flowers',
  'sequence': 'plants create flowers through a process known as photosynthesis.'},
 {'score': 0.08225198835134506,
  'token': 9325,
  'token_str': 'sunlight',
  'sequence': 'plants create sunlight through a process known as photosynthesis.'},
 {'score': 0.04291250929236412,
  'token': 18670,
  'token_str': 'algae',
  'sequence': 'plants create algae through a process known as photosynthesis.'},
 {'score': 0.0376516729593277,
  'token': 7722,
  'token_str': 'oxygen',
  'sequence': 'plants create oxygen through a process known as photosynthesis.'}]