In [26]:
import torch 
import chromadb_handler as CH
import pandas as pd

In [27]:
import PyPDF2
import json
from pathlib import Path
import os

# Set to True to regenerate the JSONL chunk files from the PDFs.
cast_data = False

# Chunk length in characters (this is a character count, not a token count).
CHUNK_SIZE = 300


def chunk_text(text, size=CHUNK_SIZE):
    """Split ``text`` into consecutive pieces of at most ``size`` characters.

    Args:
        text: String to split.
        size: Maximum number of characters per piece.

    Returns:
        List of substrings in their original order; an empty list for an
        empty string. Only the last piece can be shorter than ``size``.
    """
    return [text[start:start + size] for start in range(0, len(text), size)]


def build_chunk_entries(doc_id, source, title, authors, pages):
    """Build one JSON-serialisable record per text chunk of every page.

    Args:
        doc_id: Identifier stored under ``"id"`` and used as chunk-id prefix.
        source: Path of the originating PDF, stored under ``"source"``.
        title: Document title stored on every record.
        authors: List of author names stored on every record.
        pages: Iterable of dicts with the keys ``"page"`` (page number) and
            ``"text"`` (extracted text, may be empty or ``None``).

    Returns:
        List of record dicts. Pages whose text is empty or ``None`` contribute
        no records.
    """
    entries = []
    for page in pages:
        text = page["text"]
        if not text:
            # Nothing could be extracted from this page.
            continue
        for idx, chunk in enumerate(chunk_text(text)):
            entries.append({
                "id": doc_id,
                "chunk_id": f"{doc_id}_p{page['page']}_c{idx}",
                "source": source,
                "title": title,
                "authors": authors,
                "publication_year": "None",
                "page_start": page["page"],
                "page_end": page["page"],
                "text": chunk,
                "embedding": None,
            })
    return entries


if cast_data:
    directory_str = "data/DUUIDataset/training"
    counter = 1
    # Sorted so the numbering of the output files does not depend on the
    # arbitrary order in which the OS lists the directory.
    for filename in sorted(os.listdir(directory_str)):
        print(filename)
        if not filename.endswith(".pdf"):
            continue
        PDF_PATH = directory_str + "/" + filename

        # -----------------------------
        # 1. Load the PDF and its metadata
        # -----------------------------
        pdf_reader = PyPDF2.PdfReader(PDF_PATH)
        meta = pdf_reader.metadata

        title = meta.title if meta and meta.title else "Unknown Title"
        # Strip each name: splitting "A, B" on "," would otherwise keep the
        # leading blank of " B".
        authors = (
            [name.strip() for name in meta.author.split(",") if name.strip()]
            if meta and meta.author
            else []
        )

        # -----------------------------
        # 2. Extract the text of every page (page numbers start at 1)
        # -----------------------------
        pages = [
            {"page": number, "text": pdf_page.extract_text()}
            for number, pdf_page in enumerate(pdf_reader.pages, start=1)
        ]

        # -----------------------------
        # 3. Build the chunk records
        # -----------------------------
        doc_id = Path(PDF_PATH).stem
        dataset = build_chunk_entries(doc_id, PDF_PATH, title, authors, pages)

        # -----------------------------
        # 4. Export as JSONL (one record per line)
        # -----------------------------
        # NOTE(review): the file is written to the current working directory,
        # while the loader cell scans data/DUUIDataset/training for .jsonl
        # files -- confirm the files are moved there.
        output_path = f"rag_dataset_{counter}.jsonl"
        with open(output_path, "w", encoding="utf-8") as f:
            for item in dataset:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        # Report the file that was actually written.
        print(f"FERTIG! Datei {output_path} generiert.")
        counter += 1

In [28]:
import chromadb

# Wrapper object from the chromadb_handler module (not used further in this cell).
ch = CH.chromaDBWrapper()
# Persistent Chroma client; "chroma" is the path argument passed to it.
client = chromadb.PersistentClient("chroma")
# get_or_create_collection already returns the collection handle, so a
# separate get_collection lookup afterwards is unnecessary.
collection = client.get_or_create_collection(name="DUUI_300")


In [29]:
# Insert JSONL data into the ChromaDB

import PyPDF2
import json
from pathlib import Path
import os

# Set to True to (re-)insert the JSONL chunk records into the collection.
cast_data_DUUI = False


def jsonl_record_to_chroma(line):
    """Convert one parsed JSONL record into the pieces passed to Chroma.

    Args:
        line: Dict with the keys ``chunk_id``, ``title``, ``authors``,
            ``publication_year``, ``page_start``, ``page_end``, ``text`` and
            ``source``.

    Returns:
        Tuple ``(chunk_id, metadata, document, uri)``. ``metadata["authors"]``
        is a single string: a list of authors is joined with ``", "``; any
        other value is passed through unchanged.
    """
    # Read the authors of THIS record; a leftover global from another cell
    # would stamp the same authors on every chunk.
    authors = line["authors"]
    if isinstance(authors, list):
        authors = ", ".join(authors)
    metadata = {
        "title": line["title"],
        "authors": authors,
        "publication_year": line["publication_year"],
        "page_start": line["page_start"],
        "page_end": line["page_end"],
    }
    return line["chunk_id"], metadata, line["text"], line["source"]


if cast_data_DUUI:
    embedding_structure = []
    ids_fl = []
    metadatas_fl = []
    documents_fl = []
    # Collected for reference; not passed to collection.add below.
    uris_fl = []

    # This cell is controlled by its own flag only; it no longer also requires
    # the PDF-casting flag `cast_data` to be set.
    # NOTE(review): the PDF-casting cell writes rag_dataset_<n>.jsonl into the
    # working directory, while this cell scans the directory below -- confirm
    # the files are placed there.
    directory_str = "data/DUUIDataset/training"
    for filename in sorted(os.listdir(directory_str)):
        str_filename = directory_str + "/" + filename
        print(str_filename)
        if not filename.endswith(".jsonl"):
            continue
        # The files were written as UTF-8, so read them as UTF-8 regardless of
        # the platform default encoding.
        with open(str_filename, encoding="utf-8") as f:
            for raw_line in f:
                if not raw_line.strip():
                    # Tolerate blank lines in the JSONL file.
                    continue
                chunk_id, metadata, document, uri = jsonl_record_to_chroma(
                    json.loads(raw_line)
                )
                ids_fl.append(chunk_id)
                metadatas_fl.append(metadata)
                documents_fl.append(document)
                uris_fl.append(uri)

    if ids_fl:
        collection.add(ids=ids_fl, metadatas=metadatas_fl, documents=documents_fl)
    else:
        # Avoid calling add() with empty lists when nothing was found.
        print("No .jsonl records found in " + directory_str + "; nothing added to the collection.")

In [30]:
"""
df = pd.read_csv("data/BBCNews/bbc_news.csv")
df["parsed_date"] = df["pubDate"].apply(
    lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S %Z")
)
# Extract the year
df["year"] = df["parsed_date"].dt.year

# Filter dataset for everything >= 2023
filtered_df = df[df["year"] >= 2023]
smapled_df = filtered_df.sample(n=5000, random_state=42)
"""

'\ndf = pd.read_csv("data/BBCNews/bbc_news.csv")\ndf["parsed_date"] = df["pubDate"].apply(\n    lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S %Z")\n)\n# Extract the year\ndf["year"] = df["parsed_date"].dt.year\n\n# Filter dataset for everything >= 2023\nfiltered_df = df[df["year"] >= 2023]\nsmapled_df = filtered_df.sample(n=5000, random_state=42)\n'

In [31]:
import pandas as pd
from urllib.parse import urlparse
import hashlib
from datetime import datetime

# Set to True to rebuild the "bbc_news" collection from the sampled frame.
cast_data_bbc = False


def bbc_row_to_record(row):
    """Turn one news row into the id, metadata and document for Chroma.

    Args:
        row: Object exposing the attributes ``link``, ``title``, ``year``,
            ``pubDate``, ``guid`` and ``description`` (for example a tuple
            produced by ``DataFrame.itertuples``).

    Returns:
        Tuple ``(doc_id, metadata, document)``. ``doc_id`` is the first 16 hex
        digits of the MD5 of ``link + title``, followed by ``"_"`` and the
        year. ``document`` is ``row.description``.
    """
    hash_part = hashlib.md5((row.link + row.title).encode("utf-8")).hexdigest()[:16]
    doc_id = hash_part + "_" + str(row.year)
    metadata = {
        "title": row.title,
        # Taken from the row when it has such a field, otherwise empty; a
        # leftover `authors` variable from another cell must not be used here.
        "authors": getattr(row, "authors", ""),
        "publication_date": row.pubDate,
        "link": row.link,
        "guide": row.guid,
    }
    return doc_id, metadata, row.description


if cast_data_bbc:
    ids_b = []
    metadatas_b = []
    docuemnts_b = []
    unique_set = set()
    # NOTE(review): `smapled_df` must hold the BBC sample here. In this
    # notebook the BBC sampling code is disabled and the only active
    # assignment of `smapled_df` is the Nasdaq sample -- confirm before
    # enabling this cell.
    for row in smapled_df.itertuples(index=False):
        doc_id, metadata, document = bbc_row_to_record(row)
        if doc_id in unique_set:
            # Same link, title and year already seen: keep the first one only.
            continue
        unique_set.add(doc_id)
        ids_b.append(doc_id)
        metadatas_b.append(metadata)
        docuemnts_b.append(document)

    # Make sure the collection exists before deleting it, so the reset also
    # works on a store that has never contained "bbc_news".
    client.get_or_create_collection(name="bbc_news")
    client.delete_collection(name="bbc_news")
    collection_b = client.get_or_create_collection(name="bbc_news")
    collection_b.add(ids=ids_b, metadatas=metadatas_b, documents=docuemnts_b)

In [32]:
# Load the first 20,000 rows of the Nasdaq news CSV and draw a reproducible
# sample of the articles published in 2023 or later.
df = pd.read_csv("data/stockNews/nasdaq_news.csv", nrows=20000)

# Dates look like "2023-12-11 00:00:00 UTC". Parse the whole column in one
# vectorised call instead of calling strptime once per row; the resulting
# column is timezone-aware (UTC).
df["parsed_date"] = pd.to_datetime(
    df["Date"], format="%Y-%m-%d %H:%M:%S %Z", utc=True
)
df["year"] = df["parsed_date"].dt.year

# Keep everything from 2023 onwards.
filtered_df = df[df["year"] >= 2023]

# sample() raises when n exceeds the number of available rows, so cap the
# sample size at the size of the filtered frame.
smapled_df = filtered_df.sample(n=min(5000, len(filtered_df)), random_state=42)
len(smapled_df)

5000

In [36]:
# Process data for Nasdaq News

ids_sto = []
metadatas_sto = []
docuemnts_sto = []
unique_set_sto = set()
# Set to True to insert the sampled articles into the "nasdaq_news" collection.
cast_nasdaq_data = False


def nasdaq_row_to_record(row):
    """Turn one Nasdaq news row into the id, metadata and document for Chroma.

    Args:
        row: Object exposing the attributes ``Article`` (a string),
            ``Article_title``, ``Stock_symbol`` and ``year`` (for example a
            tuple produced by ``DataFrame.itertuples``).

    Returns:
        Tuple ``(doc_id, metadata, document)``. ``doc_id`` is the first 16 hex
        digits of the MD5 of the article text, followed by ``"_"`` and the
        year. ``document`` is the article text itself.
    """
    hash_part = hashlib.md5(row.Article.encode("utf-8")).hexdigest()[:16]
    doc_id = hash_part + "_" + str(row.year)
    metadata = {
        "title": row.Article_title,
        "stock": row.Stock_symbol,
        # The key says "date", but the stored value is the year only.
        "publication_date": row.year,
    }
    return doc_id, metadata, row.Article


if cast_nasdaq_data:
    for row in smapled_df.itertuples(index=False):
        if not isinstance(row.Article, str):
            # A missing article (e.g. NaN read from the CSV) cannot be hashed
            # or stored as a document; skip the row instead of crashing.
            continue
        doc_id, metadata, document = nasdaq_row_to_record(row)
        if doc_id in unique_set_sto:
            # Identical article text and year already collected.
            continue
        unique_set_sto.add(doc_id)
        ids_sto.append(doc_id)
        metadatas_sto.append(metadata)
        docuemnts_sto.append(document)

    collection_b = client.get_or_create_collection(name="nasdaq_news")
    collection_b.add(ids=ids_sto, metadatas=metadatas_sto, documents=docuemnts_sto)

In [37]:

# Write the article text of the first 1000 sampled rows to sample_q.txt, each
# followed by a space and a newline. The `with` block closes the file even if
# writing a row raises.
with open("sample_q.txt", "w", encoding="utf-8") as output_file:
    for row in smapled_df[:1000].itertuples(index=False):
        output_file.write(f"{row.Article} \n")

In [38]:
# Scratch check: parse one sample timestamp with the weekday/day/month-name
# layout (the same format string the disabled BBC News parsing code uses).
date_format = "%a, %d %b %Y %H:%M:%S %Z"
date_str = "Mon, 07 Mar 2022 08:01:56 GMT"
dt = datetime.strptime(
    date_str,
    date_format,
)

In [39]:
# Leftover debug cell: prints a fixed marker string and does nothing else.
print("test")

test
