In [20]:
import os
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore, create_kv_docstore
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from files_to_database import main_to_database
from sentence_transformers import SentenceTransformer
import chromadb

# Embedding models

In [21]:
# Pick the fastest device that is actually present: MPS (Apple's Metal Performance Shaders)
# on Apple Silicon, otherwise CUDA, otherwise CPU. The cell previously hard-coded MPS and
# therefore failed on any other machine.
import torch

if torch.backends.mps.is_available():
    # set device to MPS
    device = torch.device("mps")

    # empty cache and set memory fraction (these calls only make sense on MPS)
    torch.mps.empty_cache()
    torch.mps.set_per_process_memory_fraction(0.9)
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# choose embeddings model
multilingual = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# local embedding model, download to cache folder; `device=` already places it on the chosen device
embedding_model = SentenceTransformer(multilingual, cache_folder="../Data/sentence_transformers", device=device)

# Same model wrapped for LangChain; this is the object handed to the Chroma collections below,
# so it is given the same device explicitly instead of relying on the library default.
embeddings_retrieve = HuggingFaceEmbeddings(
    model_name=multilingual,
    cache_folder="../Data/sentence_transformers",
    model_kwargs={"device": device.type},
)

# last expression of the cell: display the model summary
embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [7]:
# # choose embeddings model
# multilingual = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"  


In [8]:
# # local embedding model, download to cache folder (using cuda instead of cpu is 3x faster)
# embedding_model = SentenceTransformer(
#     model_name_or_path=multilingual, 
#     cache_folder="../Data/sentence_transformers"
# )
# 
# embeddings_retrieve = HuggingFaceEmbeddings(
#     model_name=multilingual,
#     cache_folder="../Data/sentence_transformers"
# )

In [22]:
# print characteristics of the model
transformer_config = embedding_model[0].auto_model.config  # config of the underlying transformer module

print(f"Model: {multilingual}")
print(f"Embedding size: {embedding_model.get_sentence_embedding_dimension()}")
print(f"Number of layers: {len(embedding_model[0].auto_model.encoder.layer)}")
print(f"Number of attention heads: {transformer_config.num_attention_heads}")
print(f"Number of hidden units: {transformer_config.hidden_size}")
print(f"Number of hidden layers: {transformer_config.num_hidden_layers}")
# max_seq_length is measured in tokens, not characters; the old message mixed the two units.
print(f"Number of tokens allowed in the sentence transformer: {embedding_model.max_seq_length}. Extra tokens are truncated.")

Model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Embedding size: 384
Number of layers: 12
Number of attention heads: 12
Number of hidden units: 384
Number of hidden layers: 12
Number of characters allowed in the sentence transformer: 128. Extra tokens are truncated.


# Chroma client

In [23]:
# Directory that holds the Chroma database on disk.
database_path = "../Data/my_vectordb"

# Open the client on that directory; every collection defined further down is created through it.
chroma_client = chromadb.PersistentClient(
    path=database_path,
)

# Retrievers

In [24]:
# define the retrievers, parent and child splitters, MAKE SURE TO CHANGE ALSO IN files_to_database.py
def _build_splitter(chunk_size):
    """Return a splitter with the notebook's shared settings; only the chunk size varies."""
    return RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", "!", "?"],
        keep_separator=True,
        chunk_size=chunk_size,
        chunk_overlap=0,
    )


# Parent chunks are the larger pieces, child chunks the smaller ones.
parent_splitter = _build_splitter(1000)
child_splitter = _build_splitter(128)

# Binnenlands Bestuur

In [11]:
# define collection
# `persist_directory` reuses `database_path`, the directory `chroma_client` was opened on,
# so the two cannot drift apart the way hard-coded copies of the path can.
binnenlands_bestuur_db = Chroma(
    collection_name="binnenlands_bestuur",
    client=chroma_client,
    persist_directory=database_path,
    embedding_function=embeddings_retrieve,
)

# define file path
csv_path = "../Data/BinnenlandsBestuur/binnenlandsbestuur_articles.csv"

# define columns to be embedded and columns to be added to the metadata
columns_to_embed = ["content"]
columns_to_metadata = ["title", "subtitle", "author", "date", "url"]

In [12]:
# path for parent splits, derived from `database_path` so it always sits inside the database directory
full_path = os.path.abspath(os.path.join(database_path, "full_documents", "binnenlands_bestuur"))
fs = LocalFileStore(full_path)
store = create_kv_docstore(fs)

# The keyword is `child_metadata_fields` (plural). The singular spelling used before is not a
# field of ParentDocumentRetriever, so the intended metadata filter was not being applied.
# NOTE(review): with the filter active, every listed column is presumably expected to be present
# in each document's metadata - confirm that main_to_database always sets all of them.
binnenlands_bestuur_db_retriever = ParentDocumentRetriever(
    vectorstore=binnenlands_bestuur_db,
    docstore=store,
    child_splitter=child_splitter,
    child_metadata_fields=columns_to_metadata,
    parent_splitter=parent_splitter,
)

In [13]:
# Load the Binnenlands Bestuur CSV into its collection through the parent-document retriever.
main_to_database(
    # source file and the columns taken from it
    file_path=csv_path,
    columns_to_embed=columns_to_embed,
    columns_to_metadata=columns_to_metadata,
    unique_identifier="url",
    # destination
    collection=binnenlands_bestuur_db,
    retriever=binnenlands_bestuur_db_retriever,
    # documents handled per chunk
    chunk_size=10,
)

Getting existing documents from the database...
Chunk 1: processing 10 new documents...
Chunk 2: processing 10 new documents...
Chunk 3: processing 10 new documents...
Chunk 4: processing 10 new documents...
Chunk 5: processing 10 new documents...
Chunk 6: processing 10 new documents...
Chunk 7: processing 9 new documents...
Chunk 8: processing 8 new documents...
Chunk 9: processing 6 new documents...
Chunk 10: processing 9 new documents...
Chunk 11: processing 7 new documents...
Chunk 12: processing 8 new documents...
Chunk 13: processing 10 new documents...
Chunk 14: processing 7 new documents...
Chunk 15: processing 8 new documents...
Chunk 16: processing 9 new documents...
Chunk 17: processing 10 new documents...
Chunk 18: processing 9 new documents...
Chunk 19: processing 8 new documents...
Chunk 20: processing 8 new documents...
Chunk 21: processing 9 new documents...
Chunk 22: processing 7 new documents...
Chunk 23: processing 8 new documents...
Chunk 24: processing 4 new docume

# iBestuur

In [14]:
# define collection
# `persist_directory` previously pointed at "../../Data/my_vectordb", which is not the directory
# `chroma_client` was opened on; reuse `database_path` so both refer to the same database.
ibestuur_db = Chroma(
    collection_name="ibestuur",
    client=chroma_client,
    persist_directory=database_path,
    embedding_function=embeddings_retrieve,
)

# define file path
csv_path = "../Data/iBestuur/ibestuur_articles.csv"

# define columns to be embedded and columns to be added to the metadata
columns_to_embed = ["content"]
columns_to_metadata = ["url", "title"]

In [15]:
# path for parent splits, derived from `database_path` so it always sits inside the database directory
full_path = os.path.abspath(os.path.join(database_path, "full_documents", "ibestuur"))
fs = LocalFileStore(full_path)
store = create_kv_docstore(fs)

# The keyword is `child_metadata_fields` (plural). The singular spelling used before is not a
# field of ParentDocumentRetriever, so the intended metadata filter was not being applied.
# NOTE(review): with the filter active, every listed column is presumably expected to be present
# in each document's metadata - confirm that main_to_database always sets all of them.
ibestuur_db_retriever = ParentDocumentRetriever(
    vectorstore=ibestuur_db,
    docstore=store,
    child_splitter=child_splitter,
    child_metadata_fields=columns_to_metadata,
    parent_splitter=parent_splitter,
)

In [16]:
# Load the iBestuur CSV into its collection through the parent-document retriever.
main_to_database(
    # source file and the columns taken from it
    file_path=csv_path,
    columns_to_embed=columns_to_embed,
    columns_to_metadata=columns_to_metadata,
    unique_identifier="url",
    # destination
    collection=ibestuur_db,
    retriever=ibestuur_db_retriever,
    # documents handled per chunk
    chunk_size=100,
)

Getting existing documents from the database...
Chunk 1: processing 100 new documents...
Chunk 2: processing 100 new documents...
Chunk 3: processing 100 new documents...
Chunk 4: processing 100 new documents...
Chunk 5: processing 100 new documents...
Chunk 6: processing 100 new documents...
Chunk 7: processing 100 new documents...
Chunk 8: processing 100 new documents...
Chunk 9: processing 100 new documents...
Chunk 10: processing 100 new documents...
Chunk 11: processing 100 new documents...
Chunk 12: processing 100 new documents...
Chunk 13: processing 100 new documents...
Chunk 14: processing 100 new documents...
Chunk 15: processing 100 new documents...
Chunk 16: processing 100 new documents...
Chunk 17: processing 100 new documents...
Chunk 18: processing 100 new documents...
Chunk 19: processing 100 new documents...
Chunk 20: processing 100 new documents...
Chunk 21: processing 100 new documents...
Chunk 22: processing 100 new documents...
Chunk 23: processing 100 new document

# Rijksoverheid

In [17]:
# define collection
# `persist_directory` previously pointed at "../../Data/my_vectordb", which is not the directory
# `chroma_client` was opened on; reuse `database_path` so both refer to the same database.
rijksoverheid_db = Chroma(
    collection_name="rijksoverheid",
    client=chroma_client,
    persist_directory=database_path,
    embedding_function=embeddings_retrieve,
)

# define file path
csv_path = "../Data/Rijksoverheid/documents_ict_20200101.csv"

# define columns to be embedded and columns to be added to the metadata
columns_to_embed = ["content"]
columns_to_metadata = ["id", "type", "title", "canonical", "introduction", "lastmodified", "available", "initialdate"]

In [18]:
# path for parent splits, derived from `database_path` so it always sits inside the database directory
full_path = os.path.abspath(os.path.join(database_path, "full_documents", "rijksoverheid"))
fs = LocalFileStore(full_path)
store = create_kv_docstore(fs)

# The keyword is `child_metadata_fields` (plural). The singular spelling used before is not a
# field of ParentDocumentRetriever, so the intended metadata filter was not being applied.
# NOTE(review): with the filter active, every listed column is presumably expected to be present
# in each document's metadata - confirm that main_to_database always sets all of them.
rijksoverheid_db_retriever = ParentDocumentRetriever(
    vectorstore=rijksoverheid_db,
    docstore=store,
    child_splitter=child_splitter,
    child_metadata_fields=columns_to_metadata,
    parent_splitter=parent_splitter,
)

In [19]:
# Load the Rijksoverheid CSV into its collection through the parent-document retriever.
main_to_database(
    # source file and the columns taken from it
    file_path=csv_path,
    columns_to_embed=columns_to_embed,
    columns_to_metadata=columns_to_metadata,
    unique_identifier="id",
    # destination
    collection=rijksoverheid_db,
    retriever=rijksoverheid_db_retriever,
    # documents handled per chunk
    chunk_size=10,
)

Getting existing documents from the database...
Chunk 1: processing 10 new documents...
Chunk 2: processing 10 new documents...
Chunk 3: processing 10 new documents...
Chunk 4: processing 10 new documents...
Chunk 5: processing 10 new documents...
Chunk 6: processing 10 new documents...
Chunk 7: processing 10 new documents...
Chunk 8: processing 10 new documents...
Chunk 9: processing 10 new documents...
Chunk 10: processing 10 new documents...
Chunk 11: processing 10 new documents...
Chunk 12: processing 10 new documents...
Chunk 13: processing 10 new documents...
Chunk 14: processing 10 new documents...
Chunk 15: processing 10 new documents...
Chunk 16: processing 10 new documents...
Chunk 17: processing 10 new documents...
Chunk 18: processing 10 new documents...
Chunk 19: processing 10 new documents...
Chunk 20: processing 10 new documents...
Chunk 21: only duplicates, skipping to next chunk...
Chunk 22: only duplicates, skipping to next chunk...
Chunk 23: only duplicates, skipping

# TenderNed

In [25]:
# define collection
# `persist_directory` reuses `database_path`, the directory `chroma_client` was opened on,
# so the two cannot drift apart the way hard-coded copies of the path can.
tenderned_db = Chroma(
    collection_name="tenderned",
    client=chroma_client,
    persist_directory=database_path,
    embedding_function=embeddings_retrieve,
)

# define file path
csv_path = "../Data/TenderNed/TenderNed_2023.csv"

# define columns to be embedded and columns to be added to the metadata
columns_to_embed = ["tender_description"]
columns_to_metadata = [
    "ocid",
    "date",
    "initiationType",
    "tender_title",
    "tender_documents",
    "tender_nationalOrEuropean",
    "tender_noticeTypeDetails",
    "tender_mainProcurementCategory",
    "tender_awardCriteriaDetails",
    "tender_tenderPeriod.endDate",
    "tender_contractPeriod.endDate",
    "tender_classification.description",
    "tender_value.amount",
    "tender_lots",
]

In [26]:
# path for parent splits, derived from `database_path` so it always sits inside the database directory
full_path = os.path.abspath(os.path.join(database_path, "full_documents", "tenderned"))
fs = LocalFileStore(full_path)
store = create_kv_docstore(fs)

# The keyword is `child_metadata_fields` (plural). The singular spelling used before is not a
# field of ParentDocumentRetriever, so the intended metadata filter was not being applied.
# NOTE(review): with the filter active, every listed column is presumably expected to be present
# in each document's metadata - confirm that main_to_database always sets all of them.
tenderned_db_retriever = ParentDocumentRetriever(
    vectorstore=tenderned_db,
    docstore=store,
    child_splitter=child_splitter,
    child_metadata_fields=columns_to_metadata,
    parent_splitter=parent_splitter,
)

In [27]:
# Load the TenderNed CSV into its collection through the parent-document retriever.
main_to_database(
    # source file and the columns taken from it
    file_path=csv_path,
    columns_to_embed=columns_to_embed,
    columns_to_metadata=columns_to_metadata,
    unique_identifier="ocid",
    # destination
    collection=tenderned_db,
    retriever=tenderned_db_retriever,
    # documents handled per chunk
    chunk_size=1000,
)

Getting existing documents from the database...
Chunk 1: processing 1000 new documents...
Chunk 2: processing 1000 new documents...
Chunk 3: processing 1000 new documents...
Chunk 4: processing 1000 new documents...
Chunk 5: processing 1000 new documents...
Chunk 6: processing 1000 new documents...
Chunk 7: processing 1000 new documents...
Chunk 8: processing 1000 new documents...
Chunk 9: processing 1000 new documents...
Chunk 10: processing 1000 new documents...
Chunk 11: processing 1000 new documents...
Chunk 12: processing 998 new documents...
Chunk 13: processing 999 new documents...
Chunk 14: processing 1000 new documents...
Chunk 15: processing 1000 new documents...
Chunk 16: processing 1000 new documents...
Chunk 17: processing 998 new documents...
Chunk 18: processing 999 new documents...
Chunk 19: processing 1000 new documents...
Chunk 20: processing 999 new documents...
Chunk 21: processing 998 new documents...
Chunk 22: processing 1000 new documents...
Chunk 23: processing 

# Woogle

In [24]:
# # define collection
# collection_woogle = chroma_client.get_or_create_collection(name="woogle")
# 
# # define file path
# csv_path = "../Data/Woogle/woo_bodytext_2k.csv.gz"
# 
# # define columns to be embedded and columns to be added to the metadata
# columns_to_embed = ["foi_bodyTextOCR"]
# columns_to_metadata = ["foi_documentId","foi_dossierType", "foi_pageNumber"]

In [25]:
# main_to_database(collection=collection_woogle, 
#                  file_path="../Data/Woogle/woo_bodytext_2k.csv.gz", 
#                  columns_to_embed=columns_to_embed, 
#                  columns_to_metadata=columns_to_metadata, 
#                  embedding_model=embedding_model,
#                  unique_identifier="foi_documentId", 
#                  chunk_size=100000)

In [26]:
# # woogle restructure test
# # load df
# import pandas as pd
# df = pd.read_csv("../Data/Woogle/woo_bodytext_2k.csv.gz")