In [1]:
pdf_path = "cn.pdf"

import fitz
from tqdm.auto import tqdm

def text_formatter(text) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_pdf(pdf_path) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_with_text = []
    for pgNum, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_with_text.append({
            "page_number": pgNum+1,
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text)/4,
            "text": text
        })
    return pages_with_text


In [2]:
pages_with_text = open_pdf(pdf_path)

0it [00:00, ?it/s]

In [3]:
import pandas as pd

df = pd.DataFrame(pages_with_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,0,1,1,0.0,
1,2,1753,250,27,438.25,Don't forget to check out the Online Learning ...
2,3,0,1,1,0.0,
3,4,34,4,1,8.5,DATA COMMUNICATIONS AND NETWORKING
4,5,175,22,2,43.75,McGraw-Hill Forouzan Networking Series Titles ...


In [4]:
df.describe().round(1)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1171.0,1171.0,1171.0,1171.0,1171.0
mean,586.0,1912.6,321.8,19.5,478.2
std,338.2,710.5,118.7,9.6,177.6
min,1.0,0.0,1.0,1.0,0.0
25%,293.5,1468.0,249.0,13.0,367.0
50%,586.0,1929.0,326.0,19.0,482.2
75%,878.5,2345.5,402.0,25.0,586.4
max,1171.0,4026.0,694.0,68.0,1006.5


In [5]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1ec3acb9c50>

In [6]:
for item in tqdm(pages_with_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1171 [00:00<?, ?it/s]

In [7]:
df = pd.DataFrame(pages_with_text)
df.describe().round(1)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1171.0,1171.0,1171.0,1171.0,1171.0,1171.0
mean,586.0,1912.6,321.8,19.5,478.2,18.9
std,338.2,710.5,118.7,9.6,177.6,9.8
min,1.0,0.0,1.0,1.0,0.0,0.0
25%,293.5,1468.0,249.0,13.0,367.0,13.0
50%,586.0,1929.0,326.0,19.0,482.2,18.0
75%,878.5,2345.5,402.0,25.0,586.4,25.0
max,1171.0,4026.0,694.0,68.0,1006.5,58.0


In [8]:
chunk_size = 10

def split_list(input_list,slice_size = chunk_size):
    return [input_list[i:i + slice_size] for i in range(0,len(input_list), slice_size)]

In [9]:
for item in tqdm(pages_with_text):
    item["sentence_chunks"] = split_list(item["sentences"])
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1171 [00:00<?, ?it/s]

In [10]:
df = pd.DataFrame(pages_with_text)
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1171.0,1171.0,1171.0,1171.0,1171.0,1171.0,1171.0
mean,586.0,1912.625961,321.845431,19.487617,478.15649,18.87532,2.345004
std,338.182889,710.513911,118.70437,9.615184,177.628478,9.755025,1.007077
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,293.5,1468.0,249.0,13.0,367.0,13.0,2.0
50%,586.0,1929.0,326.0,19.0,482.25,18.0,2.0
75%,878.5,2345.5,402.0,25.0,586.375,25.0,3.0
max,1171.0,4026.0,694.0,68.0,1006.5,58.0,6.0


In [11]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_with_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1171 [00:00<?, ?it/s]

2746

In [12]:
df = pd.DataFrame(pages_and_chunks)
df.describe()

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,2746.0,2746.0,2746.0,2746.0
mean,595.020393,813.82957,136.033139,203.457393
std,329.674913,452.823724,72.238827,113.205931
min,2.0,3.0,1.0,0.75
25%,310.25,490.25,84.0,122.5625
50%,606.5,844.0,142.0,211.0
75%,879.0,1070.75,179.0,267.6875
max,1171.0,4004.0,572.0,1001.0


In [13]:
min_token_length = 30
pages_and_chunks_useful = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")

In [14]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")



In [15]:
%%time

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_useful):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/2642 [00:00<?, ?it/s]

CPU times: total: 10min 25s
Wall time: 55.2 s


In [16]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_useful]

In [17]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: total: 2min 36s
Wall time: 32.6 s


tensor([[-0.0263, -0.0086, -0.0045,  ..., -0.0379,  0.0099, -0.0303],
        [-0.0134, -0.1077, -0.0024,  ..., -0.0145,  0.0182, -0.0258],
        [-0.0306, -0.0435, -0.0626,  ...,  0.0023, -0.0069, -0.0420],
        ...,
        [ 0.0060, -0.1129,  0.0275,  ..., -0.0322,  0.0018,  0.0284],
        [ 0.0127, -0.0957, -0.0211,  ..., -0.0301, -0.0308, -0.0251],
        [-0.0160, -0.0771,  0.0096,  ..., -0.0202, -0.0345, -0.0339]],
       device='cuda:0')

In [18]:
chunks_embeddings_df = pd.DataFrame(pages_and_chunks_useful)
chunks_embeddings_df.to_csv(f"{pdf_path.split('.',2)[0]}_chunks_embeddings_df.csv",index = False)

In [19]:
import random
import torch
import numpy as np
import pandas as pd

device = "cuda"

chunks_embeddings_df = pd.read_csv(f"{pdf_path.split('.',2)[0]}_chunks_embeddings_df.csv")
chunks_embeddings_df["embedding"] = chunks_embeddings_df["embedding"].apply(lambda x : np.fromstring(x.strip("[]"), sep = "  "))
embeddings = torch.tensor(np.stack(chunks_embeddings_df["embedding"].tolist(),axis=0),dtype=torch.float32).to(device)
pages_and_chunks = chunks_embeddings_df.to_dict(orient="records")

chunks_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,2,Don't forget to check out the Online Learning ...,815,109,203.75,"[-0.0262903962, -0.00862952601, -0.00454190047..."
1,2,Student Resources The student resources are av...,604,90,151.00,"[-0.0133625735, -0.107663088, -0.00236697868, ..."
2,2,Facilitate learning through practice and revie...,323,42,80.75,"[-0.0306028742, -0.0435124524, -0.0625793859, ..."
3,5,McGraw-Hill Forouzan Networking Series Titles ...,175,22,43.75,"[-0.0135988621, -0.0270737838, 0.00158670451, ..."
4,6,DATA COMMUNICATIONS AND NETWORKING Fourth Edit...,347,52,86.75,"[-0.0180227552, -0.00187136105, -0.0370334871,..."
...,...,...,...,...,...,...
2637,1169,"SeeTCP transmission impairment, 80,88 transmis...",380,51,95.00,"[-0.0141950902, -0.11565724, -0.00269556325, 0..."
2638,1170,"real-time traffic, 916 reassembly, 38 responsi...",3479,500,869.75,"[-0.0161629748, -0.0557909757, -0.0145120407, ..."
2639,1170,"SeeVDSL very low frequency. See VLF VHF,204 vi...",127,19,31.75,"[0.00600702642, -0.112868801, 0.027525818, -0...."
2640,1171,"1134 INDEX virtual circuit IntServ, 781 virtua...",1857,266,464.25,"[0.0126872491, -0.0957111642, -0.0211406052, 0..."


In [20]:
from sentence_transformers import SentenceTransformer, util
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)



In [21]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

Available GPU memory: 6 GB
GPU memory: 6 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model) 
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory 
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU 
    llm_model.to("cuda")

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

In [24]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

1515268096

In [25]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2106740736, 'model_mem_mb': 2009.14, 'model_mem_gb': 1.96}

In [26]:
from sentence_transformers import CrossEncoder
re_rank_model = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1")

def get_ref(query,embedding_model,re_rank_model):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(device)

    from time import perf_counter as timer
    
    start_time = timer()
    dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
    end_time = timer()    
    val, idx = torch.topk(dot_scores, k=5)
    refs = []
    for i in idx:
        refs.append(pages_and_chunks[i.item()]["sentence_chunk"])
    
    re_ranked = re_rank_model.rank(query,refs,return_documents=True,top_k=3)
    return re_ranked

In [60]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["text"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt, context

In [63]:
def ask(query,return_context=False):
    # %%time
    context_items = get_ref(query,embedding_model,re_rank_model)
    prompt, context = prompt_formatter(query,context_items)
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                                 do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                                 min_length=512, #minimum length of answer generated
                                max_new_tokens=1024) # how many new tokens to generate from prompt 
    
    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])
    output = output_text.replace(prompt, '').replace("<bos>","").replace("<eos>","")
    
    print(f"Query: {query}\n")
    if return_context:
        print(f"Context:\n {context}\n")
    print(f"RAG answer:\n{output}")

In [69]:
ask("Explain Bus and Ring topology", return_context=True)

Query: Explain Bus and Ring topology

Context:
 - In addition, a fault or break in the bus cable stops all transmission, even between devices on the same side of the problem. The damaged area reflects signals back in the direction of origin, creating noise in both directions. Bus topology was the one of the first topologies used in the design of early local- area networks. Ethernet LANs can use a bus topology, but they are less popular now for reasons we will discuss in Chapter 13. Ring Topology In a ring topology, each device has a dedicated point-to-point con- nection with only the two devices on either side of it. A signal is passed along the ring in one direction, from device to device, until it reaches its destination. Each device in the ring incorporates a repeater. When a device receives a signal intended for another device, its repeater regenerates the bits and passes them along (see Figure 1.8). Figure 1.8 A ring topology connecting six stations Repeater Repeater Repeater Repe