In [1]:
import json
import os

import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain.text_splitter import SentenceTransformersTokenTextSplitter, RecursiveCharacterTextSplitter
from openai import OpenAI
from tqdm import tqdm

In [2]:
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in (or committed with) the notebook.
# NOTE(review): the key that used to be hardcoded here must be treated as
# leaked and revoked.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

In [2]:
def split_text_with_st_splitter(text, max_tokens=1000, chunk_overlap=50):
    """Split ``text`` into chunks measured in ``cl100k_base`` tokens.

    :param text: The text to split.
    :param max_tokens: Maximum number of tiktoken tokens per chunk.
    :param chunk_overlap: Number of tokens shared between neighbouring chunks.
    :return: List of text chunks as returned by the splitter's ``split_text``.

    The function name is kept for compatibility with existing callers.
    """
    # from_tiktoken_encoder is a classmethod: it must receive the size and
    # overlap itself. Calling it on an already configured instance builds a
    # second, default-configured splitter and silently drops those settings.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base",
        chunk_size=max_tokens,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Test the whole pipeline on some docs

In [3]:
def scrape_text(url):
    """Fetch ``url`` and return the page text, one stripped non-empty line per row.

    :param url: Address of the HTML page to download (10 second timeout).
    :return: Newline-joined text, or ``None`` after printing the error when
        the request fails or the server answers with an error status.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()

        # Flatten the parsed HTML into newline-separated text.
        extracted = BeautifulSoup(page.text, "html.parser").get_text(separator="\n")

        # Trim every line, then drop the ones that end up empty.
        trimmed_lines = (line.strip() for line in extracted.splitlines())
        return "\n".join(line for line in trimmed_lines if line)
    except requests.exceptions.RequestException as e:
        print(f"Error scraping URL {url}: {e}")
        return None

In [4]:
def split_text_with_char_splitter(text, max_chars=6000):
    """Split ``text`` into character-bounded chunks.

    :param text: The text to split.
    :param max_chars: Maximum number of characters per chunk.
    :return: Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {
        "chunk_size": max_chars,   # upper bound on characters per chunk
        "chunk_overlap": 200,      # characters shared between neighbouring chunks
        "length_function": len,    # chunk length is measured in characters
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [5]:
def embed_text_chunks(chunks, model="text-embedding-3-small"):
    """Return one embedding per chunk, in the same order as ``chunks``.

    One request is sent per chunk through the module-level ``client``.
    NOTE: a later cell of this notebook rebinds ``client`` to an OpenSearch
    client; this function only works while ``client`` is the OpenAI client.

    :param chunks: Iterable of text chunks to embed.
    :param model: Name of the embedding model to request.
    :return: List of embeddings (``.data[0].embedding`` of each response).
    """
    def _embed_one(chunk):
        reply = client.embeddings.create(input=[chunk], model=model)
        return reply.data[0].embedding

    return [_embed_one(chunk) for chunk in chunks]

In [6]:
# Shared tiktoken encoding (cl100k_base) used for every token count below.
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return the number of tokens ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [7]:
def read_urls_from_file(file_path):
    """Load and return the JSON content of ``file_path``.

    Progress is printed before and after loading.

    :param file_path: Path of the JSON file holding the URLs.
    :return: The parsed JSON value, or ``[]`` (after printing the error) when
        the file does not exist or is not valid JSON.
    """
    print(f"Reading URLs from file: {file_path}")
    try:
        with open(file_path, 'r') as file:
            urls = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error reading URLs from file: {e}")
        return []
    print(f"Loaded {len(urls)} URLs from file")
    return urls

In [None]:
URLS_FILE = "non_pdf_urls.json"
# Smoke test: only the first five URLs are scraped and chunked.
urls = read_urls_from_file(URLS_FILE)[:5]
for url in tqdm(urls):
    print(f"Processing URL: {url}")
    # basic
    text = scrape_text(url)
    print(f"Raw text from url {url}: {text}\n\n")
    if not text:
        # Scrape failed (None) or produced no text: nothing to chunk.
        continue
    chunks = split_text_with_char_splitter(text)
    print(f"Number of chunks: {len(chunks)}\n")
    for i, chunk in enumerate(chunks):
        print(f"Token number of chunk {i+1}: {count_tokens(chunk)}")
        print(f"Chunk {i+1}: {chunk}\n")
    # Embedding is disabled for this dry run:
    # embeddings = embed_text_chunks(chunks)
    # print(f"Number of embeddings: {len(embeddings)}\n")

# Check OpenSearch Stats

In [1]:
from dotenv import load_dotenv
import os

In [2]:
# Pull variables from a .env file (if present) before reading the credentials.
load_dotenv()
# Both values are None when the corresponding variable is not set.
opensearch_user = os.environ.get("OPENSEARCH_USER")
opensearch_password = os.environ.get("OPENSEARCH_PASSWORD")

In [11]:
from opensearchpy import OpenSearch
from utils import create_opensearch_client

# OpenSearch client built from the credentials loaded above.
# NOTE: this rebinds the notebook-global name `client`.
client = create_opensearch_client(username=opensearch_user, password=opensearch_password)

index_name = "eur-lex-diversified-knowledge-base"

# Request only the fielddata and segments sections of the index statistics.
stats = client.indices.stats(index=index_name, metric='fielddata,segments')
index_totals = stats["indices"][index_name]["total"]

# Both figures are reported in bytes and converted to MB when printed.
field_data_usage = index_totals["fielddata"]["memory_size_in_bytes"]
segment_memory_usage = index_totals["segments"]["memory_in_bytes"]

print(f"Index '{index_name}':")
print(f"  Field Data Usage: {field_data_usage / (1024 ** 2):.2f} MB")
print(f"  Segment Memory Usage: {segment_memory_usage / (1024 ** 2):.2f} MB")


Index 'eur-lex-diversified-knowledge-base':
  Field Data Usage: 0.00 MB
  Segment Memory Usage: 0.00 MB


In [7]:
# Number of documents currently stored in the index.
count_response = client.count(index=index_name)
doc_count = count_response["count"]
print(f"Number of documents in '{index_name}': {doc_count}")

Number of documents in 'eur-lex-diversified-knowledge-base': 11305


In [8]:
# Unfiltered statistics for the index; only the aggregated "total" section is shown.
stats = client.indices.stats(index=index_name)
index_totals = stats["indices"][index_name]["total"]
print(index_totals)

{'docs': {'count': 11305, 'deleted': 0}, 'store': {'size_in_bytes': 477183933, 'reserved_in_bytes': 0}, 'indexing': {'index_total': 11305, 'index_time_in_millis': 40328, 'index_current': 0, 'index_failed': 0, 'delete_total': 0, 'delete_time_in_millis': 0, 'delete_current': 0, 'noop_update_total': 0, 'is_throttled': False, 'throttle_time_in_millis': 0, 'doc_status': {}}, 'get': {'total': 0, 'time_in_millis': 0, 'exists_total': 0, 'exists_time_in_millis': 0, 'missing_total': 0, 'missing_time_in_millis': 0, 'current': 0}, 'search': {'open_contexts': 0, 'query_total': 1, 'query_time_in_millis': 0, 'query_current': 0, 'concurrent_query_total': 0, 'concurrent_query_time_in_millis': 0, 'concurrent_query_current': 0, 'concurrent_avg_slice_count': 0.0, 'fetch_total': 1, 'fetch_time_in_millis': 0, 'fetch_current': 0, 'scroll_total': 0, 'scroll_time_in_millis': 0, 'scroll_current': 0, 'point_in_time_total': 0, 'point_in_time_time_in_millis': 0, 'point_in_time_current': 0, 'suggest_total': 0, 'sug

In [13]:
# Fetch a small sample: match_all can be swapped for a more specific query.
sample_query = {
    "query": {"match_all": {}},
    "size": 3,  # Limit to 3 documents
}
response = client.search(index=index_name, body=sample_query)

# Show the stored fields of every returned hit.
for doc in response['hits']['hits']:
    print(doc['_source'])

{'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:12016E013', 'chunk_id': 1, 'text': 'Top\nTable of contents\nHide table of contents\nThis site is managed by the\nPublications Office of the European Union\nNeed help?\nHelp pages\nContact\nSitemap\nFollow us\nX\nLegal\nLegal notice\nCookies policy\nAccessibility\nPrivacy statement\nInformation\nAbout EUR-Lex\nNewsletter\nUseful links\nOther services\nEuropean Data\nEU tenders\nEU research results\nEU Whoiswho\nEU publications\nN-Lex\nEU Law in Force\nEU Law Tracker\nDiscover more on\neuropa.eu\nContact the EU\nCall us 00 800 6 7 8 9 10 11\nUse other telephone options\nWrite to us via our contact form\nMeet us at one of the EU centres\nSocial media\nSearch for EU social media channels\nLegal\nLanguages on our websites\nPrivacy policy\nLegal notice\nCookies\nEU institutions\nEuropean Parliament\nEuropean Council\nCouncil of the European Union\nEuropean Commission\nCourt of Justice of the European Union (CJEU)\nEuropean Ce

In [15]:
# Fielddata memory only: restrict the stats call to that single metric.
stats = client.indices.stats(index=index_name, metric="fielddata")
fielddata_totals = stats["indices"][index_name]["total"]["fielddata"]
fielddata_usage = fielddata_totals["memory_size_in_bytes"]
print(f"Fielddata Memory Usage: {fielddata_usage / (1024 ** 2):.2f} MB")


Fielddata Memory Usage: 0.00 MB


# Creating new index

In [4]:
from opensearchpy import OpenSearch
from utils import create_opensearch_client

In [None]:
# OpenSearch client built from the credentials read from the environment earlier
# in the notebook (rebinds the notebook-global name `client`).
client = create_opensearch_client(username=opensearch_user, password=opensearch_password)
# Index to copy from, and the index that is created and filled by the cells below.
old_index_name = "eur-lex-diversified-knowledge-base"
new_index_name = "eur-lex-diversified-knowledge-base-2"
# Uncomment to delete the new index before re-creating it.
# response = client.indices.delete(index=new_index_name)

def create_new_index():
    """Create ``new_index_name`` with an explicit mapping and print the outcome.

    Uses the module-level ``client`` and ``new_index_name``. Prints a success
    line when the response is acknowledged; otherwise prints a failure line
    followed by the raw response. Returns nothing.
    """
    field_properties = {
        "url": {"type": "keyword"},
        "chunk_id": {"type": "integer"},
        # fielddata is switched off explicitly for the full-text field.
        "text": {"type": "text", "fielddata": False},
        "embedding": {"type": "knn_vector", "dimension": 1536},
    }
    # NOTE(review): only "mappings" is sent, no index settings. Confirm whether
    # `index.knn` has to be enabled for the k-NN search this index is meant for.
    index_mapping = {"mappings": {"properties": field_properties}}

    # ignore=400: presumably so an HTTP 400 (e.g. index already exists) is
    # returned as a response body instead of raising; it then lands in the
    # failure branch below.
    response = client.indices.create(index=new_index_name, body=index_mapping, ignore=400)

    if not response.get("acknowledged"):
        print(f"Failed to create new index: {new_index_name}")
        print(response)
        return
    print(f"Created new index: {new_index_name} with the specified mapping.")


In [6]:
def reindex_data():
    """
    Reindex data from the old index to the new index.

    Copies the documents of ``old_index_name`` into ``new_index_name`` through
    the module-level ``client`` and waits for the operation to finish. Prints
    a summary and returns nothing.

    Documents count as transferred whether the response reports them as
    ``created`` or ``updated``, so re-running the cell against an already
    populated index is not reported as a failure. Any entries in the
    response's ``failures`` list are printed as well.

    NOTE: a later cell of this notebook redefines ``reindex_data`` with a
    ``(client, source_index, target_index)`` signature.
    """
    reindex_body = {
        "source": {
            "index": old_index_name
        },
        "dest": {
            "index": new_index_name
        }
    }

    response = client.reindex(body=reindex_body, wait_for_completion=True)

    created = response.get("created", 0)
    updated = response.get("updated", 0)
    failures = response.get("failures") or []

    if failures:
        print(f"Reindexing reported {len(failures)} failure(s): {failures}")

    transferred = created + updated
    if transferred > 0:
        print(f"Reindexed {transferred} documents from {old_index_name} to {new_index_name}")
    else:
        print("Reindexing failed or no documents were transferred.")

In [7]:
def verify_new_index():
    """
    Print a sample of the new index.

    Searches ``new_index_name`` for up to three documents through the
    module-level ``client`` and prints the ``_source`` of each hit. No check
    is made on the number of hits returned.
    """
    sample = client.search(index=new_index_name, body={"size": 3})
    print(f"Verified new index: {new_index_name} with the following documents:")
    for source in (hit["_source"] for hit in sample['hits']['hits']):
        print(source)

In [8]:
# Create the destination index with the explicit mapping
create_new_index()
# Reindex data from the old index to the new index
reindex_data()
# Verify the new index
verify_new_index()

Created new index: eur-lex-diversified-knowledge-base-2 with the specified mapping.
Reindexed 11305 documents from eur-lex-diversified-knowledge-base to eur-lex-diversified-knowledge-base-2
Verified new index: eur-lex-diversified-knowledge-base-2 with the following documents:
{'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:12016E013', 'chunk_id': 1, 'text': 'Top\nTable of contents\nHide table of contents\nThis site is managed by the\nPublications Office of the European Union\nNeed help?\nHelp pages\nContact\nSitemap\nFollow us\nX\nLegal\nLegal notice\nCookies policy\nAccessibility\nPrivacy statement\nInformation\nAbout EUR-Lex\nNewsletter\nUseful links\nOther services\nEuropean Data\nEU tenders\nEU research results\nEU Whoiswho\nEU publications\nN-Lex\nEU Law in Force\nEU Law Tracker\nDiscover more on\neuropa.eu\nContact the EU\nCall us 00 800 6 7 8 9 10 11\nUse other telephone options\nWrite to us via our contact form\nMeet us at one of the EU centres\nSocial media\

In [10]:
# Count the documents now stored in the new index
# (rebinds the notebook-global names new_index_name, response and doc_count).
new_index_name = "eur-lex-diversified-knowledge-base-2"
response = client.count(index=new_index_name)
doc_count = response['count']
print(f"Index '{new_index_name}' has {doc_count} documents.")

Index 'eur-lex-diversified-knowledge-base-2' has 11417 documents.


# Indexing PDFs

In [24]:
from markitdown import MarkItDown
import json
from tqdm import tqdm
import requests

In [25]:
def download_pdf(url, save_path):
    """
    Fetch a PDF over HTTP (10 second timeout) and write its bytes to disk.

    Request errors are printed, not raised, and nothing is returned in either
    case.
    :param url: URL of the PDF to download
    :param save_path: Local path to save the downloaded PDF
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        with open(save_path, 'wb') as out_file:
            out_file.write(reply.content)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
    else:
        print(f"PDF downloaded successfully: {save_path}")

In [26]:
def convert_pdf_to_markdown(pdf_path):
    """
    Run MarkItDown on a local PDF and return the converted text.

    Any exception raised during conversion is printed and swallowed.
    :param pdf_path: Local path to the PDF file
    :return: The conversion result's ``text_content``, or ``None`` on error
    """
    try:
        return MarkItDown().convert(pdf_path).text_content
    except Exception as e:
        print(f"Error converting PDF to markdown: {e}")
    return None

In [None]:
PDF_URLS_FILE = "pdf_urls.json"
temp_folder ="temp_pdfs"
os.makedirs(temp_folder, exist_ok=True)

with open(PDF_URLS_FILE, 'r') as file:
    pdf_urls = json.load(file)

# Smoke test: only the first ten PDFs are downloaded and converted.
for url in tqdm(pdf_urls[:10]):
    print(f"Processing PDF: {url}")
    pdf_path = os.path.join(temp_folder, "temp_pdf.pdf")
    download_pdf(url, pdf_path)
    if not os.path.exists(pdf_path):
        # download_pdf only prints on failure, so a missing file is the signal
        # that there is nothing to convert and nothing to delete.
        print(f"Skipping {url}: no PDF was downloaded\n\n")
        continue
    try:
        markdown_content = convert_pdf_to_markdown(pdf_path)
    finally:
        # Always clear the shared temp file so a later failed download cannot
        # be mistaken for a fresh one.
        os.remove(pdf_path)
    print(f"Markdown content: {markdown_content}\n\n")


Detect how many PDFs contain tabular data.

In [None]:
import pdfplumber

def contains_table(pdf_path):
    """Report whether any page of the PDF at ``pdf_path`` yields a table.

    Pages are checked in order; at the first page whose ``extract_tables()``
    result is non-empty, the table count and page number are printed and the
    scan stops.

    :param pdf_path: Local path of the PDF to inspect.
    :return: ``True`` if a table was found, ``False`` if none was found or
        if any exception occurred (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract tables from the page
                tables = page.extract_tables()
                if not tables:
                    continue
                print(f"Found {len(tables)} table(s) on page {page.page_number}")
                return True
    except Exception as e:
        print(f"Error processing PDF: {e}")
    return False


PDF_URLS_FILE = "pdf_urls.json"
temp_folder ="temp_pdfs"
counter = 0
os.makedirs(temp_folder, exist_ok=True)

with open(PDF_URLS_FILE, 'r') as file:
    pdf_urls = json.load(file)

# Every download reuses the same scratch file, removed again after each check.
pdf_path = os.path.join(temp_folder, "temp_pdf.pdf")

for url in tqdm(pdf_urls):
    print(f"Processing PDF: {url}")
    download_pdf(url, pdf_path)
    if contains_table(pdf_path):
        counter += 1
    # The file is absent when the download failed, hence the existence check.
    if os.path.exists(pdf_path):
        os.remove(pdf_path)

print(f"Number of PDFs containing tables: {counter}")

In [30]:
# Count the documents stored in the PDF-only index
# (rebinds the notebook-global names new_index_name, response and doc_count).
new_index_name = "eur-lex-diversified-knowledge-base-only-pdfs"
response = client.count(index=new_index_name)
doc_count = response['count']
print(f"Index '{new_index_name}' has {doc_count} documents.")

Index 'eur-lex-diversified-knowledge-base-only-pdfs' has 3497 documents.


In [31]:
# Fetch a small sample: match_all can be swapped for a more specific query.
sample_query = {
    "query": {"match_all": {}},
    "size": 3,  # Limit to 3 documents
}
response = client.search(index=new_index_name, body=sample_query)

# Show the stored fields of every returned hit.
for doc in response['hits']['hits']:
    print(doc['_source'])

{'url': 'http://www.europarl.europa.eu/EPRS/AskEP_Insight_April-June_2014-FINAL2.pdf', 'chunk_id': 0, 'text': 'At a glance\n\nAsk EP\n\nCitizens’ Enquiries Unit\n\nApril  — June 2014 / No 7\n\nInsight\n\nThe Citizens’ Enquiries Unit (Ask EP) replies to citizens’ requests for information on the European Parliament’s activities,\npowers and organisation, always with the aim of giving them a better understanding of Parliament and its work. ‘Ask\nEP — Insight’ is a review whose main objective is to pass on citizens’ concerns to Members of the European Parliament\nand the institution as a whole.\n\nEuropean elections 2014 and their impact\n\n© European Union 2014 - EP\n\nThe  European  elections  in  May  were  by\nfar  the  most  popular  topic  in  the  past  few\nmonths with citizens writing to the European\nParliament on a variety of issues, both during\nthe pre- and post-election period and even\non election day itself.\n\nBefore ...\n\nPrior  to  the  elections,  citizens  frequently\

# Move PDF documents to the knowledge base index

In [33]:
# Configuration: documents are copied from the PDF-only index (source)
# into the combined knowledge-base index (target).
source_index = "eur-lex-diversified-knowledge-base-only-pdfs"
target_index = "eur-lex-diversified-knowledge-base-2"

def reindex_data(client, source_index, target_index):
    """Copy the documents of ``source_index`` into ``target_index``.

    The reindex call waits for completion. The raw response is printed on
    success; any exception is printed instead of being raised.

    NOTE: this definition replaces the zero-argument ``reindex_data`` defined
    in an earlier cell of this notebook.

    :param client: Client object exposing ``reindex(body=..., wait_for_completion=...)``.
    :param source_index: Name of the index to read from.
    :param target_index: Name of the index to write to.
    """
    reindex_request = dict(
        source=dict(index=source_index),
        dest=dict(index=target_index),
    )

    try:
        response = client.reindex(body=reindex_request, wait_for_completion=True)
        print(f"Reindexing completed: {response}")
    except Exception as e:
        print(f"Error during reindexing: {e}")

# Copy the documents of the PDF-only index into the combined knowledge-base index.
reindex_data(client, source_index, target_index)

Reindexing completed: {'took': 3784, 'timed_out': False, 'total': 3497, 'updated': 0, 'created': 3497, 'deleted': 0, 'batches': 4, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []}


In [34]:
# Count the documents in the combined index after the PDF documents were copied in
# (rebinds the notebook-global names new_index_name, response and doc_count).
new_index_name = "eur-lex-diversified-knowledge-base-2"
response = client.count(index=new_index_name)
doc_count = response['count']
print(f"Index '{new_index_name}' has {doc_count} documents.")

Index 'eur-lex-diversified-knowledge-base-2' has 14914 documents.
