In [None]:
import os
import requests

pdf="Andrew S. tanenbaum - Modern Operating Systems.pdf"
if not os.path.exists(pdf):
    url = input("Enter the URL to download the PDF: ")
    if not url:
        print("No URL provided. Exiting.")
        exit(1)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to download the PDF. Status code: {response.status_code}")
        print("Exiting.")
        exit(1)
    with open(pdf, 'wb') as file:
        file.write(response.content)
    print(f"{pdf} has been downloaded successfully.")
else:
    print(f"{pdf} already exists. Skipping download.")

Andrew S. tanenbaum - Modern Operating Systems.pdf already exists. Skipping download.


In [32]:
import pymupdf
from tqdm.auto import tqdm
doc = pymupdf.open(pdf)
page_and_text=[]
def text_format(text):
    text = text.replace('\n', ' ').strip()
    return text
for pagenumber,text in tqdm(enumerate(doc), total=len(doc), desc="Processing pages"):
    text=text.get_text()
    text = text_format(text)
    if not text:
        continue
    page_and_text.append({"page": pagenumber + 1, "total char": len(text), "text": text,"total words": len(text.split()), "total lines without nlp": len(text.split('.')),"tokens":len(text)//4})
print(f"Total pages processed: {len(page_and_text)}")
with open("page_and_text.json", "w") as f:
    import json
    json.dump(page_and_text, f, indent=4)

Processing pages: 100%|██████████| 1137/1137 [00:01<00:00, 829.99it/s]

Total pages processed: 1136





In [33]:
import pandas as pd
df =pd.DataFrame(page_and_text)
df.describe()

Unnamed: 0,page,total char,total words,total lines without nlp,tokens
count,1136.0,1136.0,1136.0,1136.0,1136.0
mean,569.5,2577.054577,440.107394,30.855634,643.895246
std,328.079259,637.874562,113.262398,17.773274,159.471717
min,2.0,5.0,1.0,1.0,1.0
25%,285.75,2245.0,382.0,24.0,560.75
50%,569.5,2711.5,462.5,29.0,677.5
75%,853.25,3068.0,523.0,34.0,767.0
max,1137.0,3781.0,634.0,146.0,945.0


In [34]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("sentencizer")
for item in tqdm(page_and_text, desc="Processing sentences"):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["total sentences"] = len(item["sentences"])

Processing sentences: 100%|██████████| 1136/1136 [00:01<00:00, 696.30it/s]


In [35]:
df= pd.DataFrame(page_and_text)
df.describe()

Unnamed: 0,page,total char,total words,total lines without nlp,tokens,total sentences
count,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0
mean,569.5,2577.054577,440.107394,30.855634,643.895246,25.816021
std,328.079259,637.874562,113.262398,17.773274,159.471717,10.516467
min,2.0,5.0,1.0,1.0,1.0,1.0
25%,285.75,2245.0,382.0,24.0,560.75,21.0
50%,569.5,2711.5,462.5,29.0,677.5,26.0
75%,853.25,3068.0,523.0,34.0,767.0,31.0
max,1137.0,3781.0,634.0,146.0,945.0,69.0


In [36]:
def split_list(input_list: list,slice_size: int=5):
    return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]
for i in tqdm(page_and_text, desc="Splitting sentences into chunks"):
    i["sentence_chunks"] = split_list(i["sentences"],10)
    i["num_chunk"]=len(i["sentence_chunks"])
with open("page_and_text_with_sentences.json", "w") as f:
    json.dump(page_and_text, f, indent=4)

Splitting sentences into chunks: 100%|██████████| 1136/1136 [00:00<00:00, 282253.97it/s]


In [37]:
import random
random.sample(page_and_text, 1)

[{'page': 790,
  'total char': 1633,
  'text': 'SEC. 10.4 MEMORY MANAGEMENT IN LINUX 759 operations, and ZONE DMA32 marks this region. In addition, if the hardware, like older-generation i386, cannot directly map memory addresses above 896 MB, ZONE HIGHMEM corresponds to anything above this mark. ZONE NORMAL is anything in between them. Therefore, on 32-bit x86 platforms, the first 896 MB of the Linux address space are directly mapped, whereas the remaining 128 MB of the kernel address space are used to access high memory regions. On x86 64 ZONE HIGHMEM is not defined. The kernel maintains a zone structure for each of the three zones, and can perform memory allocations for the three zones separately. Main memory in Linux consists of three parts. The first two parts, the kernel and memory map, are pinned in memory (i.e., never paged out). The rest of mem- ory is divided into page frames, each of which can contain a text, data, or stack page, a page-table page, or be on the free list. Th

In [38]:
df=pd.DataFrame(page_and_text)
df.describe()

Unnamed: 0,page,total char,total words,total lines without nlp,tokens,total sentences,num_chunk
count,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0
mean,569.5,2577.054577,440.107394,30.855634,643.895246,25.816021,3.059859
std,328.079259,637.874562,113.262398,17.773274,159.471717,10.516467,1.029062
min,2.0,5.0,1.0,1.0,1.0,1.0,1.0
25%,285.75,2245.0,382.0,24.0,560.75,21.0,3.0
50%,569.5,2711.5,462.5,29.0,677.5,26.0,3.0
75%,853.25,3068.0,523.0,34.0,767.0,31.0,4.0
max,1137.0,3781.0,634.0,146.0,945.0,69.0,7.0


In [39]:
import re
page_and_chunk=[]
for i in tqdm(page_and_text, desc="Processing chunks"):
    for j in i["sentence_chunks"]:
        chunk_dict={}
        chunk_dict["page"] = i["page"]
        joined_sentences = " ".join(j).strip()
        joined_sentences=re.sub(r'\.([A-Za-z])', r'. \1', joined_sentences)
        chunk_dict["sentence_chunk"] = joined_sentences
        chunk_dict["total char"] = len(joined_sentences)
        chunk_dict["total words"] = len(joined_sentences.split())
        chunk_dict["tokens"] = len(joined_sentences)//4
        page_and_chunk.append(chunk_dict)
len(page_and_chunk)
with open("page_and_text.json", "w") as f:
    json.dump(page_and_chunk, f, indent=4)

Processing chunks: 100%|██████████| 1136/1136 [00:00<00:00, 25799.36it/s]


In [42]:
min_tokens = 30
for row in page_and_chunk:
    if row["tokens"] < min_tokens:
        print(row)
        print(f"Removing chunk with less than {min_tokens} tokens: {row}")

{'page': 2, 'sentence_chunk': 'MODERN OPERATING SYSTEMS FOURTH EDITION', 'total char': 39, 'total words': 5, 'tokens': 9}
Removing chunk with less than 30 tokens: {'page': 2, 'sentence_chunk': 'MODERN OPERATING SYSTEMS FOURTH EDITION', 'total char': 39, 'total words': 5, 'tokens': 9}
{'page': 3, 'sentence_chunk': 'Zilog and Z80 are registered trademarks of Zilog, Inc.', 'total char': 54, 'total words': 9, 'tokens': 13}
Removing chunk with less than 30 tokens: {'page': 3, 'sentence_chunk': 'Zilog and Z80 are registered trademarks of Zilog, Inc.', 'total char': 54, 'total words': 9, 'tokens': 13}
{'page': 7, 'sentence_chunk': 'This page intentionally left blank', 'total char': 34, 'total words': 5, 'tokens': 8}
Removing chunk with less than 30 tokens: {'page': 7, 'sentence_chunk': 'This page intentionally left blank', 'total char': 34, 'total words': 5, 'tokens': 8}
{'page': 29, 'sentence_chunk': 'This page intentionally left blank', 'total char': 34, 'total words': 5, 'tokens': 8}
Remov

In [43]:
page_and_chunk = [row for row in page_and_chunk if row["tokens"] >= min_tokens]
print(f"Total chunks after filtering: {len(page_and_chunk)}")

Total chunks after filtering: 3364


In [None]:

from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2', device='cuda')

Sentence: this is a test sentence
Embedding: [ 4.49862285e-03 -5.75653315e-02 -3.01529728e-02 -1.61244329e-02
 -5.00141270e-02  3.07816043e-02 -6.27592904e-03  2.57758368e-02
  4.59072255e-02  2.00993940e-02  5.28135039e-02 -1.74854770e-02
  4.07894747e-03 -5.07866889e-02  2.04986315e-02 -7.19274208e-03
  7.44992942e-02  1.08755426e-02 -4.64666374e-02  3.82890180e-02
 -2.11628787e-02  7.09292479e-03  4.49240347e-03 -3.46549600e-02
 -4.58333381e-02  7.24481477e-04 -1.27409892e-02 -3.60509008e-02
  1.83761865e-02 -1.23439590e-02  5.52061982e-02 -1.66890305e-02
 -1.09744733e-02 -8.61221850e-02  1.53252222e-06  1.07463980e-02
 -8.18831101e-03 -3.15868743e-02 -6.89344406e-02 -1.25579815e-03
 -3.70294857e-03  6.40714914e-02  5.97620383e-03  4.63443398e-02
 -3.11793443e-02  1.49158081e-02  4.11028825e-02  2.31718104e-02
 -5.83220422e-02  7.51837343e-02  9.31286311e-04  2.65826355e-03
 -1.04740914e-02 -3.85766178e-02  7.00188801e-02  3.39933224e-02
  1.08089941e-02  2.95037199e-02  4.62359516e

In [None]:
embedding_model.to("cuda")
for i in tqdm(page_and_chunk, desc="Generating embeddings"):
    i["embedding"] = embedding_model.encode(i["sentence_chunk"], device='cuda')
page_and_chunk = [i for i in page_and_chunk if "embedding" in i]

Generating embeddings: 100%|██████████| 3364/3364 [01:08<00:00, 49.47it/s]

CPU times: total: 8min 44s
Wall time: 1min 8s





In [47]:
df= pd.DataFrame(page_and_chunk)
df.describe()

Unnamed: 0,page,total char,total words,tokens
count,3364.0,3364.0,3364.0,3364.0
mean,574.015458,867.900416,148.732759,216.598692
std,319.303789,373.957163,61.636346,93.497286
min,3.0,121.0,17.0,30.0
25%,302.0,602.0,103.0,150.0
50%,568.0,878.0,153.0,219.0
75%,846.0,1099.25,191.0,274.25
max,1137.0,2767.0,466.0,691.0


In [48]:
random.sample(page_and_chunk, 1)

[{'page': 650,
  'sentence_chunk': 'This is the do- main of cryptography and the topic of the next section. 9.5 BASICS OF CRYPTOGRAPHY Cryptography plays an important role in security. Many people are familiar with newspaper cryptograms, which are little puzzles in which each letter has been systematically replaced by a different one. These have as much to do with modern cryptography as hot dogs have to do with haute cuisine. In this section we will give a bird’s-eye view of cryptography in the computer era. As mentioned earlier, oper- ating systems use cryptography in many places. For instance, some file systems can encrypt all the data on disk, protocols like IPSec may encrypt and/or sign all',
  'total char': 667,
  'total words': 112,
  'tokens': 166,
  'embedding': array([ 4.65770550e-02, -6.43462874e-03,  2.59988569e-02, -3.19567276e-03,
         -7.16832280e-02,  1.84131432e-02,  4.54807095e-02, -2.90711671e-02,
          1.69975229e-03, -1.01198088e-02,  5.52652255e-02,  4.6468

In [51]:
save_path = "text_chunk_embeddings.csv"
df = pd.DataFrame(page_and_chunk)
df.to_csv(save_path, index=False)
print(f"Embeddings saved to {save_path}")

Embeddings saved to text_chunk_embeddings.csv


In [2]:
import numpy as np
import torch 
import random
import pandas as pd
device="cuda"
text_embeddings=pd.read_csv("text_chunk_embeddings.csv")
text_embeddings["embedding"]=(text_embeddings["embedding"]).apply(lambda x : np.fromstring(x.strip("[]"), sep=" "))
embeddings=np.stack(text_embeddings["embedding"].to_list(),axis=0)
embeddings=torch.tensor(embeddings)
text_chunks=text_embeddings.to_dict(orient="records")


In [None]:
from sentence_transformers import util , SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2', device='cuda')

In [43]:
import textwrap
def wrapper(text,width=100):
    return "\n".join(textwrap.wrap(text, width=width))
def print_result(result, text_chunks, top_k=5):
    to_be_returned = []
    for i in range(top_k):
        index=result.indices[i].item()
        score=result.values[i].item()
        text=text_chunks[index]["sentence_chunk"]
        page=text_chunks[index]["page"]
        wrapped_text = wrapper(text, width=100)
        to_be_returned.append({
            "index": index,
            "score": score,
            "text": wrapped_text,
            "page": page
        })
    return to_be_returned
def retrive_answer(query,embeddings,model,results_wanted=3, device="cuda"):
    import time
    embedding_query=model.encode(query, convert_to_tensor=True, normalize_embeddings=True)
    embedding_query=embedding_query.to(dtype=torch.float32, device=device)
    embeddings=embeddings.to(dtype=torch.float32, device=device)
    time1=time.time()
    dot_scores = util.dot_score(embedding_query, embeddings)[0]
    result = torch.topk(dot_scores, k=results_wanted)
    time2=time.time()
    print(f"Time taken for dot score on {len(embeddings)} calculations: {time2 - time1} seconds")
    return print_result(result, text_chunks, top_k=results_wanted)


In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
device="cuda"
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v0.1")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v0.1").to(device)

In [29]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [44]:
def prompt_formatter(prompt,context_items):
    context = "\n".join([f"Page {item['page']}: {item['text']}" for item in context_items])
    formatted_prompt = (
        "You are a helpful assistant. Based on the following excerpts from a textbook, answer the question concisely.\n\n"
        f"{context}\n\n"
        f"Question: {prompt}\n"
        "Answer:"
    )
    return formatted_prompt
def generate_answer(input_query, context_items, model, tokenizer, device="cuda"):
    prompt = prompt_formatter(input_query, context_items)
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_text
query=input("Enter your query: ")
print(f"Query: {query}")
context_item=retrive_answer(query, embeddings, embedding_model, results_wanted=2, device=device)
answer = generate_answer(query, context_item, model, tokenizer, device=device)
print(f"Generated Answer: {answer}")


Query: what are deadlocks
Time taken for dot score on 3364 calculations: 0.0 seconds
Generated Answer: You are a helpful assistant. Based on the following excerpts from a textbook, answer the question concisely.

Page 491: A common arrangement is that process A sends a request message to process B, and then blocks until B
sends back a reply message. Suppose that the request message gets lost. A is blocked waiting for the
reply. B is blocked waiting for a request asking it to do something. We hav e a deadlock. This,
though, is not the classical resource deadlock. A does not have posses- sion of some resource B
wants, and vice versa. In fact, there are no resources at all in sight. But it is a deadlock
according to our formal definition since we have a set of (two) processes, each blocked waiting for
an event only the other one can cause. This situation is called a communication deadlock to contrast
it with the more common resource deadlock.
Page 467: 436 DEADLOCKS  CHAP. 6 Deadlocks can