In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    extracted_text = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        
        # Since the text start from page 1 to 97 then im extracting it on it own
        for i in range(min(97, total_pages)):
            page = pdf.pages[i]
            text = page.extract_text()
            if text:
                extracted_text.append((i + 1, text.strip()))
    
    return extracted_text

def extract_tables_from_pdf(pdf_path):
    extracted_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        
        # the page 98 till 108 contains tables
        for i in range(97, min(108, total_pages)):
            page = pdf.pages[i]
            tables = page.extract_tables()
            for table in tables:
                if table:
                    extracted_tables.append((i + 1, table))
    
    return extracted_tables

def clean_extracted_text(extracted_text):
    cleaned_text = []

    for page_num, text in extracted_text:
        cleaned_text_content = text.replace('\n', ' ').strip()
        cleaned_text.append((page_num, cleaned_text_content))
    
    return cleaned_text

def clean_extracted_table(table):
    cleaned_table = []
    for row in table:
        cleaned_row = [cell.strip() if cell else "" for cell in row]
        if any(cleaned_row) and not all(cell in ["", "\uf09f"] for cell in cleaned_row):
            cleaned_table.append(cleaned_row)
    return cleaned_table

def clean_extracted_tables(extracted_tables):
    cleaned_tables = []

    for page_num, table in extracted_tables:
        cleaned_table = clean_extracted_table(table)
        cleaned_tables.append((page_num, cleaned_table))

    return cleaned_tables

# Calling fcts
pdf_path = "infos/pdf2.pdf"

#the text part
extracted_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_extracted_text(extracted_text)

#the table part
extracted_tables = extract_tables_from_pdf(pdf_path)
cleaned_tables = clean_extracted_tables(extracted_tables)


print("Cleaned Extracted Text:")
for idx, text in cleaned_text:
    print(text)
    print()

print("Cleaned Extracted Tables:")
for idx, table in cleaned_tables:
    for row in table:
        print(row)
    print()

Cleaned Extracted Text:
EUROPEAN COMMISSION Brussels, 21.4.2021 COM(2021) 206 final 2021/0106 (COD) Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL LAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE (ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION LEGISLATIVE ACTS {SEC(2021) 167 final} - {SWD(2021) 84 final} - {SWD(2021) 85 final} EN EN

EXPLANATORY MEMORANDUM 1. CONTEXT OF THE PROPOSAL 1.1. Reasons for and objectives of the proposal This explanatory memorandum accompanies the proposal for a Regulation laying down harmonised rules on artificial intelligence (Artificial Intelligence Act). Artificial Intelligence (AI) is a fast evolving family of technologies that can bring a wide array of economic and societal benefits across the entire spectrum of industries and social activities. By improving prediction, optimising operations and resource allocation, and personalising service delivery, the use of artificial intelligence can support socially and env

In [3]:
#join the tuples into one string seprated by space
all_cleaned_text = " ".join(text for _, text in cleaned_text)

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

#using recursive chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=20
)

#chunking only text since tables are an unstructred data form
docs = text_splitter.create_documents([all_cleaned_text])

print("Chunked Text:")
for doc in docs:
    print(doc.page_content)
    print()

Chunked Text:
EUROPEAN COMMISSION Brussels, 21.4.2021 COM(2021) 206 final 2021/0106 (COD) Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL LAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE (ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN

AMENDING CERTAIN UNION LEGISLATIVE ACTS {SEC(2021) 167 final} - {SWD(2021) 84 final} - {SWD(2021) 85 final} EN EN EXPLANATORY MEMORANDUM 1. CONTEXT OF THE PROPOSAL 1.1. Reasons for and objectives of the proposal This explanatory memorandum accompanies the

accompanies the proposal for a Regulation laying down harmonised rules on artificial intelligence (Artificial Intelligence Act). Artificial Intelligence (AI) is a fast evolving family of technologies that can bring a wide array of economic and societal

and societal benefits across the entire spectrum of industries and social activities. By improving prediction, optimising operations and resource allocation, and personalising service delivery, the use of artificial int

In [48]:
from transformers import BertModel, BertTokenizer
import torch
from chromadb.utils import embedding_functions
import numpy as np

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_bert_embeddings(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
        # Average pooling of token embeddings
        pooled_embedding = torch.mean(outputs.last_hidden_state, dim=1)
    return pooled_embedding.numpy()

# whwre to store each chunk embeding
all_embeddings = []

# embed each chunk on it own
for doc in docs:
    chunk_embeddings = get_bert_embeddings(doc.page_content)
    all_embeddings.append(chunk_embeddings)

In [49]:
# # Print only the 5 first since its a big data
num_embeddings_to_display = 5 
for i, embedding in enumerate(all_embeddings[:num_embeddings_to_display]):
    print(f"Embedding for chunk {i+1}: {embedding}")

Embedding for chunk 1: [[ 6.68178499e-02 -1.05950601e-01  3.24203253e-01 -2.37304270e-01
   2.73563892e-01 -6.04243875e-01 -2.85710231e-03  1.72583967e-01
   7.06046596e-02  6.05782820e-03 -3.60839278e-01 -7.88005069e-02
  -1.57094106e-01  9.66002867e-02 -1.48684094e-02  2.37778261e-01
   3.80446970e-01 -1.58418101e-04  1.44693017e-01  2.27965444e-01
   3.70892942e-01  2.65423536e-01  9.69179720e-03  5.54290414e-01
   3.15535158e-01  1.11438580e-01  8.27242993e-03 -1.55813351e-01
  -4.10786778e-01  3.75540666e-02  2.99402565e-01  1.39144748e-01
  -3.71482491e-01 -6.67349398e-02  3.83288741e-01 -5.42146377e-02
  -2.17340559e-01 -3.86690706e-01 -5.19706666e-01  3.42667013e-01
  -2.84524262e-01 -1.21390194e-01 -7.58850724e-02 -1.65352240e-01
  -2.03516826e-01 -2.83194482e-01  5.12166440e-01  5.82058914e-03
   1.36345318e-02 -9.25327018e-02 -3.54401767e-01  3.11260730e-01
  -3.41097474e-01 -1.34754419e-01  1.49034098e-01  4.40922946e-01
  -2.34526038e-01 -7.32032299e-01 -9.84996259e-02 -1.

In [50]:
from transformers import BertModel, BertTokenizer
import torch

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

#going for each table
table_embeddings = []
for _, rows in cleaned_tables:
    # converting table to text
    table_text = ''
    for row in rows:
        row_text = ' '.join(map(str, row))
        table_text += row_text + '\n'  #sep rows with a \n
    table_text = table_text.strip()  #making sure it contains the same data as before
    
    tokens = tokenizer(table_text, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**tokens)
        table_embedding = torch.mean(outputs.last_hidden_state, dim=1) 
        
    table_embeddings.append(table_embedding)

In [51]:
for i, embedding in enumerate(table_embeddings[:3]):  
    #print embeddings of the first 3 tables
    print(f"Embeddings for Table {i + 1}:")
    print(embedding)
    print("=" * 50)

Embeddings for Table 1:
tensor([[ 3.1180e-02, -1.4063e-01,  5.1959e-01, -7.2274e-02,  1.9908e-01,
         -1.5527e-01, -2.4035e-01,  2.1062e-01,  1.0493e-01,  8.1251e-02,
         -2.3862e-01, -2.2008e-01, -1.0513e-01,  1.0368e-01,  9.3253e-02,
          3.2974e-01,  3.5266e-01, -8.6672e-02, -9.8328e-02,  1.2571e-01,
          3.2196e-01,  2.0867e-01,  8.9617e-02,  3.7519e-01,  3.0931e-01,
          5.6261e-02,  1.9300e-01, -8.0898e-02, -2.2278e-01, -3.1007e-02,
          4.1305e-01,  1.7784e-01, -2.9058e-01, -1.7033e-01,  5.9131e-02,
          7.8784e-02, -2.9055e-01, -2.6244e-01,  2.3801e-02,  1.4785e-01,
         -2.6540e-01, -1.5977e-01, -7.2590e-02, -1.1354e-01, -8.5289e-02,
         -3.7278e-01,  5.1166e-01,  1.7606e-02, -1.4451e-01, -1.9347e-01,
         -5.2503e-01,  9.0042e-02, -4.2300e-01, -1.7727e-01,  1.5193e-01,
          4.8874e-01, -1.0382e-01, -4.3900e-01, -2.7155e-01, -2.9850e-01,
          3.7763e-01, -2.1003e-01,  1.4492e-01, -1.3841e-01,  4.0490e-02,
         -5.05

In [52]:
# storing the both embedings into one single dict
combined_embeddings = {'chunks': all_embeddings, 'tables': table_embeddings}
chroma = combined_embeddings

In [53]:
print("Number of chunk embeddings:", len(chroma['chunks']))
print("Number of table embeddings:", len(chroma['tables']))

Number of chunk embeddings: 1250
Number of table embeddings: 22


In [55]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core import VectorStore  #having an issue with importing vectorestore

llm = Ollama(model="llama3", request_timeout=120.0)

# Set the llm in the Settings
Settings.llm = llm

docs_embeddings = chroma['chunks'] + chroma['tables'] 
vector_store = VectorStore(docs_embeddings)

# Create VectorStoreIndex
index = VectorStoreIndex.from_vector_store(vector_store)

# Create QueryEngine
query_engine = index.as_query_engine(streaming=True, similarity_top_k=4)


ImportError: cannot import name 'VectorStore' from 'llama_index.core' (C:\Users\21264\OneDrive\Bureau\app\.venv\lib\site-packages\llama_index\core\__init__.py)