# Document preprocessing

In [2]:
import os

# Corpus directory: source documents are read from it and the generated
# artifacts (chunk CSV, embeddings) are written back into it.
TOPIC = 'crypto'

# Extensions of the documents that get preprocessed.
SOURCE_EXTENSIONS = (".pdf", ".txt")

# os.listdir returns entries in arbitrary order; sorting makes the chunk order
# (and with it the row order of the saved CSV and embeddings) reproducible
# across runs and machines. str.endswith accepts a tuple of suffixes.
files = sorted(name for name in os.listdir(TOPIC) if name.endswith(SOURCE_EXTENSIONS))

In [3]:
from helpers.text_preprocess import preprocess
import pandas as pd
from tqdm.auto import tqdm

# One entry per processed file; flattened into a single list after the loop.
chunk_list = []
total_chunks = 0

with tqdm(total=len(files), desc="Processing files", unit="file") as pbar:
    for file in files:
        file_chunks = preprocess(f"{TOPIC}/{file}")
        chunk_count = len(file_chunks)
        chunk_list.append(file_chunks)
        total_chunks += chunk_count

        # Advance the bar, then report the file that was just finished.
        pbar.update(1)
        pbar.set_postfix(file=file, chunks=chunk_count, total_chunks=total_chunks)

print(f"Total chunks: {total_chunks}")

# Flatten the per-file results into one list, keeping file order.
pages_and_chunks = []
for file_chunks in chunk_list:
    pages_and_chunks.extend(file_chunks)




Processing files:   0%|          | 0/1386 [00:00<?, ?file/s]

Total chunks: 109475


In [4]:
# Persist the flattened chunks inside the corpus directory. Backslash is set
# as the CSV escape character for the writer.
chunks_frame = pd.DataFrame(pages_and_chunks)
chunks_frame.to_csv(f"{TOPIC}/text_chunks.csv", escapechar="\\")

# Embeddings

In [5]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
# To force CPU execution, assign device = 'cpu' after this block.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
# Instantiate the sentence-embedding model on the selected device.
embedding_model = SentenceTransformer(
    model_name_or_path='all-mpnet-base-v2',
    device=device,
)

In [8]:
from helpers.embeddings import embed

# Run the embedding helper over the chunks with the loaded model.
embeddings = embed(embedding_model, pages_and_chunks, device)

# Store the result in the corpus directory so a later session can reload it
# instead of recomputing.
embeddings_path = f"{TOPIC}/embeddings.pt"
torch.save(embeddings, embeddings_path)

Embedding:   0%|          | 0/109475 [00:00<?, ?chunk/s]

Embedding: 100%|██████████| 109475/109475 [11:35<00:00, 157.32chunk/s]


# RAG (retrieval-augmented generation)

In [1]:
import torch 
import pandas as pd
import random
from helpers.rag import ask

# Corpus directory holding the chunk CSV and the saved embeddings.
TOPIC = 'crypto'

device = "cuda" if torch.cuda.is_available() else "cpu"

# The CSV is written with escapechar="\\", so it must be read with the same
# escape character; otherwise escaped backslashes stay doubled in the chunk
# text that is later handed to the model as context.
text_chunks = pd.read_csv(f"{TOPIC}/text_chunks.csv", escapechar="\\")

# One dict per CSV row, in file order.
pages_and_chunks = text_chunks.to_dict(orient="records")

# map_location remaps stored tensors onto the current device, so a file saved
# from a CUDA session still loads on a CPU-only machine.
embeddings = torch.load(f"{TOPIC}/embeddings.pt", map_location=device)
# as_tensor reuses the payload when it already is a tensor (torch.tensor would
# copy-construct it and emit a warning) and still converts array-like payloads.
embeddings = torch.as_tensor(embeddings).to(device)



In [10]:
# Question to ask; edit this value and re-run the cells below to ask another.
query: str = "what is a blockchain?"


In [11]:
print(f"Query: {query}")

# Answer the query with retrieved context and also return the context items.
# embedding_model_id is the same model name that was used to build the stored
# embeddings, so query vectors and chunk vectors come from the same model.
answer, context_items = ask(
    query=query,
    embeddings=embeddings,
    llm_model_id="google/gemma-2b-it",
    embedding_model_id="all-mpnet-base-v2",
    pages_and_chunks=pages_and_chunks,
    n_resources_to_return=6,
    temperature=0.90,
    max_new_tokens=2048,
    return_answer_only=False,
)

print("Answer:\n")
print(answer)
print("Context items:")
# Bare last expression so the notebook renders the returned context items.
context_items

Query: how do you maintain security in your blockchain


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Answer:

The passage does not provide any information on how to maintain security in a
blockchain, so I cannot answer this query from the context.
Context items:


[{'page_number': 52,
  'sentence_chunk': 'crypto/REBL.txt:the way to go if you have information that you do not wish to share with every random peer However work counterproductively for trust with other parties involved in the network or for example regulatory compliance With writing permissions kept central there is need for proof that the blockchain is not being tampered with The need for an additional third party to audit activity on the chain is therefore required However with traditional third party interaction a private blockchain can be expensive and slow to audit \\\\t\\\\t 4 1 3 2 REBL\\\\xe2\\\\x80\\\\x99s solution for centralised client blockchains The REBL chain',
  'score': tensor(0.4766)},
 {'page_number': 29,
  'sentence_chunk': 'crypto/InterValue.txt:of privacy protection in Blockchain there are several solutions such as ring signature  homomorphic encryption and zero knowledge proof  P2P communication The Blockchain system uses P2P network technology to –5– connect pee