In [1]:
# Importazione delle librerie necessarie
from langchain_unstructured import UnstructuredLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
from pathlib import Path
import os
from rag_chatbot.config import Config as cfg

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [None]:
# Classe VectorStoreManager
class VectorStoreManager:
    def __init__(self, documents_path=cfg.DOCUMENTS_PATH, vector_store_path=cfg.VECTOR_STORE_PATH):
        self.documents_path = documents_path
        self.vector_store_path = vector_store_path
        self.docs = []
        self.chunks = []
        self.vector_store = None

    def load_documents(self):
        # Carica tutti i file PDF nella directory
        pdf_files = list(Path(self.documents_path).glob('*.pdf'))
        print(f'Found {len(pdf_files)} PDF files in {self.documents_path}')
        loader = UnstructuredLoader(
            file_path=pdf_files,
            strategy='hi_res',
            partition_via_api=True,
            api_key=cfg.UNSTRUCTURED_API_KEY,
        )
        for doc in loader.lazy_load():
            self.docs.append(doc)
        print(f'Loaded {len(self.docs)} pages.')

    def split_documents(self):
        # Suddivide i documenti in chunk
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=cfg.CHUNK_SIZE, chunk_overlap=cfg.CHUNK_OVERLAP)
        self.chunks = text_splitter.split_documents(self.docs)
        print(f'Before split: {len(self.docs)} pages, after split: {len(self.chunks)} chunks.')

        for i, chunk in enumerate(self.chunks[:5]):
            print(f"Chunk {i}: {chunk.page_content[:300]}")

    def load_or_generate_vector_store(self):
        # Carica o genera il vector store
        if Path(self.vector_store_path).exists():
            print(f'Loading existing vector store from {self.vector_store_path}')
            self.vector_store = FAISS.load_local(self.vector_store_path, HuggingFaceBgeEmbeddings(
                model_name=cfg.HUGGINGFACE_MODEL_NAME,
                encode_kwargs={'normalize_embeddings': True},
            ),
            allow_dangerous_deserialization=True
            )
        else:
            print('Vector store not found. Generating new one...')
            self._generate_vector_store()

    def _generate_vector_store(self):
        # Crea un nuovo vector store
        embedding_model = HuggingFaceBgeEmbeddings(
            model_name=cfg.HUGGINGFACE_MODEL_NAME,
            encode_kwargs={'normalize_embeddings': True}
        )
        self.vector_store = FAISS.from_documents(self.chunks, embedding_model)
        self.vector_store.save_local(self.vector_store_path)
        print(f'Vector store saved to {self.vector_store_path}')


In [3]:
# Funzione per testare la ricerca nel vector store
def test_search(manager, query):
    if manager.vector_store:
        docs = manager.vector_store.similarity_search(query, top_k=3)
        for doc in docs:
            print(f'Page {doc.metadata["filename"]}: {doc.page_content[:300]}')
            # print(f"Metadata: {doc.metadata}")  # Visualizza i metadati
    else:
        print('Vector store not loaded properly.')


In [4]:
# Creazione dell'oggetto VectorStoreManager e caricamento dei documenti
manager = VectorStoreManager()


In [5]:
# Caricamento dei documenti
manager.load_documents()

Found 21 PDF files in data/documents


INFO: NumExpr defaulting to 10 threads.
  from .autonotebook import tqdm as notebook_tqdm
INFO: pikepdf C++ to Python logger bridge initialized
INFO: Reading PDF for file: data/documents/4. Text Classification.pdf ...
INFO: Reading PDF for file: data/documents/10. Transformers II.pdf ...
INFO: Reading PDF for file: data/documents/3. Math with Words.pdf ...


KeyboardInterrupt: 

In [41]:
# Suddividi i documenti in chunk
manager.split_documents()

Before split: 4713 pages, after split: 4576 chunks.
Chunk 0: Natural Corso di TIC @ D s 2
Chunk 1: Natural Language Processing and Large Language Models
Chunk 2: Corso di Laurea Magistrale in Ingegneria Informatica
Chunk 3: Lesson 4
Chunk 4: Text Classification


In [35]:
first_page_docs = [doc for doc in manager.docs if doc.metadata.get("page_number") == 1]

for doc in first_page_docs:
    print(doc.page_content)

# Print the content of the first 5 chunks
for i, chunk in enumerate(manager.chunks[:5]):
    print(f"Chunk {i}: {chunk.page_content[:300]}")

Natural Corso di TIC @ D s 2
Natural Language Processing and Large Language Models
Corso di Laurea Magistrale in Ingegneria Informatica
Lesson 4
Text Classification
Nicola Capuano and Antonio Greco DIEM – University of Salerno
DIEM
® Text ® ®
Outline
Text Classification
• Topic Labelling Example
• Sentiment Analysis Exercise
w2 — _
Natural Corso di & D & 5 2
Natural Language Processing and Large Language Models
Corso di Laurea Magistrale in Ingegneria Informatica
Lesson 10 Transformers II
Nicola Capuano and Antonio Greco DIEM – University of Salerno

DIEM
Natural Corso di TIC @ D
Natural Language Processing and Large Language Models
Corso di Laurea Magistrale in Ingegneria Informatica
Lesson 3 Math with Words
Nicola Capuano and Antonio Greco DIEM – University of Salerno
DIEM
• TF-IDF 
Outline
Term Frequency
Vector Space Model
• Building a Search Engine
w2 — _
Natural Corso di & D ¢ 0
Natural Language Processing and Large Language Models
Corso di Laurea Magistrale in Ingegneria Informat

In [36]:
# Carica o genera il vector store
manager.load_or_generate_vector_store()

INFO: Use pytorch device_name: mps
INFO: Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


Loading existing vector store from data/vector_store


In [37]:
# Esegui la ricerca per un termine di esempio
query = 'prompt engineering'
test_search(manager, query)

Page 0. Course Introduction.pdf: Prompt Engineering
Page 18. Prompt Engineering.pdf: Prompt Engineering
Page 18. Prompt Engineering.pdf: Prompt Engineering
Page 18. Prompt Engineering.pdf: Introduction to Prompt Engineering


In [None]:
print(len(manager.docs))

4713
<class 'langchain_core.documents.base.Document'>
