In [None]:
import os

# Install the notebook's third-party packages, but only when the COLAB_GPU
# environment variable is present (presumably marks a Google Colab runtime).
if "COLAB_GPU" in os.environ:
    # Upgrade torch to the latest release available to pip.
    !pip install -U torch
    # PyMuPDF is the package that provides the `fitz` module used for PDF parsing.
    !pip install PyMuPDF
    !pip install tqdm
    !pip install sentence-transformers
    !pip install accelerate
    !pip install bitsandbytes
    # Built without build isolation, as requested by the flag below.
    !pip install flash-attn --no-build-isolation

In [None]:
from google.colab import files

# Open Colab's upload widget. Later cells use the keys of `uploaded` as file
# names and the values as the raw file content handed to fitz.open(stream=...).
uploaded = files.upload()
# Report only names and sizes: printing the dict itself would dump the whole
# file content into the cell output.
print({name: f"{len(content)} bytes" for name, content in uploaded.items()})

In [None]:
import fitz
from tqdm import tqdm

def text_formatter(text: str) -> str:
    """Flatten `text` onto a single line.

    Every newline becomes one space, then leading and trailing whitespace
    is stripped from the result.
    """
    return " ".join(text.split("\n")).strip()

def open_and_read_pdf(uploaded: dict, page_offset: int = 25) -> list[dict]:
    """Read the first uploaded PDF and collect per-page text statistics.

    Args:
        uploaded: Mapping of file name to file content, as produced by the
            upload cell. Only the first entry is read.
        page_offset: Value subtracted from the zero-based PDF page index to
            produce "page_number". The default of 25 keeps the numbering this
            notebook has always used (presumably the count of front-matter
            pages before the book's page 1 -- TODO confirm for other PDFs).

    Returns:
        One dict per page with the keys "page_number", "page_char_count",
        "page_word_count", "page_sentence_count_raw", "page_token_count"
        and "page text".

    Raises:
        ValueError: If `uploaded` is empty.
    """
    if not uploaded:
        raise ValueError("`uploaded` is empty: upload a PDF before calling open_and_read_pdf().")

    # Extract the file name from the uploaded dictionary (first entry only).
    filename = next(iter(uploaded))

    pages_and_texts = []
    # The context manager closes the document once every page has been read.
    with fitz.open(stream=uploaded[filename], filename=filename) as doc:
        # Passing the page count lets tqdm show a real progress bar.
        for page_index, page in tqdm(enumerate(doc), total=doc.page_count):
            text = text_formatter(page.get_text())
            pages_and_texts.append({"page_number": page_index - page_offset,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    # Rough count: number of pieces after splitting on ". ".
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    # Rough estimate: one token per 4 characters.
                                    "page_token_count": len(text) / 4,
                                    "page text": text})

    return pages_and_texts
# Parse the uploaded PDF into one record per page, then preview the first two.
pages_and_texts = open_and_read_pdf(uploaded=uploaded)
pages_and_texts[:2]

In [None]:
import random

# Spot-check three randomly chosen page records.
random.sample(pages_and_texts, k=3)

In [None]:
import pandas as pd

In [None]:
# Tabulate the per-page records (one row per page) and show the first rows.
df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
 # Show up to the first 100 page rows.
 # NOTE(review): this line carries a stray leading space; plain Python would
 # reject it, so it presumably relies on the notebook front end -- confirm.
 df.head(100)

In [None]:

# Summary statistics of the numeric per-page columns, rounded to 2 decimals.
df.describe().round(2)

In [None]:
from spacy.lang.en import English

# Create an English spaCy pipeline object.
nlp = English()


# Add the "sentencizer" pipe so that processed documents expose `.sents`.
nlp.add_pipe("sentencizer")


# Sanity check: a two-sentence string must be split into exactly two sentences.
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2


# Display the detected sentences.
list(doc.sents)

In [None]:
# Split every page's text into sentences and record how many were found.
for item in tqdm(pages_and_texts):
    # Each sentence object is converted to a plain string before it is stored.
    item["sentences"] = [str(sentence) for sentence in nlp(item["page text"]).sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
# Inspect one random page record, now including its sentence list.
random.sample(pages_and_texts, k=1)

In [None]:
# Rebuild the frame so the new sentence-count column appears in the summary.
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:
# Number of sentences grouped into one chunk.
chunk_size = 10


def split_list(input_list: list, chunk_size: int) -> list:
    """Cut `input_list` into consecutive slices of at most `chunk_size` items.

    The final slice holds whatever is left over and may be shorter. An empty
    input produces an empty list.
    """
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        chunks.append(input_list[start:start + chunk_size])
    return chunks


In [None]:
# Group each page's sentences into lists of at most `chunk_size` sentences.
for item in tqdm(pages_and_texts):
    item["sentences_chunks"] = split_list(item["sentences"], chunk_size=chunk_size)

In [None]:
# Inspect two random page records, now including their sentence chunks.
random.sample(pages_and_texts,k=2)

In [None]:

# Rebuild the frame from the updated page records and summarise it again.
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:
import re


# Flatten every page's sentence groups into one record per chunk.
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentences_chunks"]:
        # Glue the sentences together, collapse double spaces, then put a space
        # back after any full stop that is followed directly by a capital letter.
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1',
                                       "".join(sentence_chunk).replace("  ", " ").strip())

        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            # Rough estimate: one token per 4 characters.
            "chunk_token_count": len(joined_sentence_chunk) / 4,
        }
        pages_and_chunks.append(chunk_dict)


# Total number of chunk records produced.
len(pages_and_chunks)

In [None]:

# Inspect one random chunk record.
random.sample(pages_and_chunks, k=1)

In [None]:

# From here on `df` holds one row per chunk instead of one row per page.
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

In [None]:

# Chunks at or below this estimated token count are treated as too short.
min_token_length = 30

# Preview up to five of the short chunks. DataFrame.sample raises ValueError
# when asked for more rows than exist, so the sample size is capped.
short_chunks = df[df["chunk_token_count"] <= min_token_length]
for row_index, row in short_chunks.sample(n=min(5, len(short_chunks))).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')

In [None]:
# Keep only chunks above the minimum token estimate, as a list of dicts
# (one per row), and preview the first two.
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

In [None]:

from sentence_transformers import SentenceTransformer
# Load the "all-mpnet-base-v2" sentence-embedding model onto the CUDA device.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")


# Small demo corpus used only to show what encode() produces.
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]


# Encode the whole list in one call, then pair each sentence with its embedding.
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))


# Print every sentence followed by its embedding.
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

In [None]:
# encode() also accepts a single string; print the result and its shape.
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

In [None]:
%%time


# Make sure the model sits on the CUDA device before encoding.
embedding_model.to("cuda")


# Encode the chunks one at a time and attach each embedding to its record.
# These per-record embeddings are what the later CSV export writes out.
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [None]:

# Collect just the chunk texts so they can be encoded as one batch.
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [None]:
%%time


# Encode all chunk texts in batches of 32, requesting a tensor as the result.
# NOTE(review): nothing visible in this notebook reads `text_chunk_embeddings`
# afterwards; it appears to exist for the timing comparison -- confirm.
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)

text_chunk_embeddings

In [None]:

# Persist the chunk records (including their "embedding" entries) to a CSV
# file in the current working directory, without the index column.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:

# Read the CSV straight back in to check what was written.
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
import random

import torch
import numpy as np
import pandas as pd

# Prefer the GPU, but fall back to the CPU so this cell (and the embedding
# model created from `device` below) still works on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Reload the chunk records and embeddings written by the CSV export cell.
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


# The CSV holds each embedding as text; strip the surrounding brackets and
# parse the whitespace-separated numbers back into a NumPy array.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))


# One dict per chunk. This rebinds `pages_and_chunks` to the filtered,
# reloaded records, so its positions line up with the rows of `embeddings`.
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")


# Stack all embeddings into one float32 tensor on the chosen device.
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

In [None]:
# Preview the reloaded frame, whose "embedding" column now holds arrays.
text_chunks_and_embedding_df.head()

In [None]:
# Look at the first embedding vector.
embeddings[0]

In [None]:
from sentence_transformers import util, SentenceTransformer

# Create the embedding model again (same model name as when the chunks were
# embedded), this time placed on `device`.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)

In [None]:

# Example search query.
query = "what happens in switch case without break"
print(f"Query: {query}")


# Embed the query with the same model that embedded the chunks.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)


from time import perf_counter as timer

# Time only the scoring step: dot product of the query against every chunk
# embedding. [0] selects the score row for the single query.
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")


# Five highest scores together with their positions in `embeddings`.
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
# Random stand-in corpus with 100x as many rows as the real embeddings, each
# of width 768, used to see how the scoring time scales.
larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)
print(f"Embeddings shape: {larger_embeddings.shape}")

# Perform the dot product across the enlarged set (100x the real embedding count).
# Note: this overwrites `dot_scores` from the real search above.
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

In [None]:
import textwrap
import random
import torch
import numpy as np
import pandas as pd



def print_wrapped(text):
    """Print `text` re-flowed so that no output line is wider than 80 characters."""
    wrapped_lines = textwrap.wrap(text, width=80)
    print("\n".join(wrapped_lines))

print(f"Query: '{query}'\n")
print("Results:")

# top_results_dot_product[0] holds the scores and [1] the matching positions;
# walk both in step and show the chunk stored at each position.
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")

    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

In [None]:
import fitz



filename = list(uploaded.keys())[0]
file_content = uploaded[filename]

# Stored page numbers are (PDF page index - 25), so add the offset back to get
# the index that load_page() expects.
page_number_offset = 25
# Take the page of the top-ranked chunk instead of a hard-coded page, so the
# figure always matches the query named in its title.
top_chunk_index = int(top_results_dot_product[1][0])
most_relevant_page_number = int(pages_and_chunks[top_chunk_index]["page_number"])

# Render the page at 300 dpi; the context manager closes the document afterwards.
with fitz.open(stream=file_content, filename=filename) as doc:
    page = doc.load_page(most_relevant_page_number + page_number_offset)
    img = page.get_pixmap(dpi=300)


# View the pixmap's sample buffer as a (height, width, channels) uint8 array.
img_array = np.frombuffer(img.samples_mv,
                          dtype=np.uint8).reshape((img.h, img.w, img.n))


import matplotlib.pyplot as plt
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off')
plt.show()

In [None]:
import torch

def dot_product(vector1, vector2):
    """Return the dot product of two 1-D tensors as a scalar tensor."""
    return vector1.dot(vector2)

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two 1-D tensors as a scalar tensor.

    Computed as the dot product divided by the product of the two Euclidean
    norms.
    """
    numerator = vector1.dot(vector2)

    # Euclidean norm of each vector: square root of the sum of squares.
    magnitude1 = vector1.pow(2).sum().sqrt()
    magnitude2 = vector2.pow(2).sum().sqrt()

    return numerator / (magnitude1 * magnitude2)


# Demo vectors: vector2 equals vector1, vector3 has different values, and
# vector4 is vector1 negated.
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)


# Dot product of vector1 against each of the other vectors.
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))


# Cosine similarity for the same pairs (the dot product divided by both norms).
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Args:
        query: Text to search for.
        embeddings: Tensor with one row per candidate embedding.
        model: Model used to embed `query`. The default is whichever
            `embedding_model` existed when this function was defined.
        n_resources_to_return: Maximum number of results to return.
        print_time: When True, print how long the scoring step took.

    Returns:
        A `(scores, indices)` pair of tensors ordered from highest to lowest
        score. Fewer than `n_resources_to_return` entries are returned when
        `embeddings` has fewer rows than that.
    """

    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Only the dot-product scoring is timed, not the query embedding.
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    # torch.topk raises when k exceeds the number of scores, so cap k at the
    # number of available embeddings.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores,
                                 k=k)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Look up the best-matching chunks for `query` and print them, best first.

    Each entry of `pages_and_chunks` must provide the keys "sentence_chunk"
    and "page_number"; entries are addressed by the indices that
    retrieve_relevant_resources returns.
    """
    top_scores, top_indices = retrieve_relevant_resources(
        query=query,
        embeddings=embeddings,
        n_resources_to_return=n_resources_to_return,
    )

    print(f"Query: {query}\n")
    print("Results:")

    for rank in range(len(top_scores)):
        chunk = pages_and_chunks[top_indices[rank]]
        print(f"Score: {top_scores[rank]:.4f}")

        print_wrapped(chunk["sentence_chunk"])

        print(f"Page number: {chunk['page_number']}")
        print("\n")

In [None]:
# Try the retrieval helper on a new query and show the raw scores and indices.
query = "what are duplicate entry sets ?"


scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

In [None]:

# Same query, printed as readable text together with page numbers.
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

In [None]:

    # NOTE(review): these two assignments are indented although no enclosing
    # block is visible here, which plain Python rejects (IndentationError).
    # Presumably a surrounding condition was removed -- confirm and dedent.
    # False means the loading cell passes quantization_config=None.
    use_quantization_config = False
    # Model identifier handed to the tokenizer and model loaders.
    model_id = "google/gemma-2b-it"


print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

In [None]:
# Install the Hugging Face Hub client and start its interactive notebook login
# (presumably required to download the model loaded later -- confirm).
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available


from transformers import BitsAndBytesConfig

# 4-bit quantization settings with float16 compute. They are only handed to
# the model when use_quantization_config is True.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Use Flash Attention 2 only when the package is available and GPU 0 reports a
# compute capability major version of at least 8; otherwise use "sdpa".
attn_implementation = (
    "flash_attention_2"
    if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8)
    else "sdpa"
)
print(f"[INFO] Using attention implementation: {attn_implementation}")

# model_id comes from the configuration cell above.
print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

# Only the non-quantized model is moved to the GPU explicitly.
if not use_quantization_config:
    llm_model.to("cuda")

In [None]:
# Display the loaded model object.
llm_model

In [None]:
# Plain question sent to the model without any retrieved context.
input_text = "what is multithreading"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model: a single user turn
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template; add_generation_prompt=True asks the tokenizer to
# append the marker after which the model's reply starts
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs based on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
# outputs[0] is the token sequence generated for the single input prompt
print(f"Model output (tokens):\n{outputs[0]}\n")

In [None]:
# Decode the output tokens back to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

In [None]:

# Sample questions (generated with GPT-4, judging by the variable name).
gpt4_questions = [
    "What is the difference between JDK, JRE, and JVM, and what roles do they play in Java development?",
    "How does Java achieve platform independence through the concept of bytecode?",
    "Explain the concept of object-oriented programming (OOP) in Java, with examples of its key principles.",
    "What are Java exceptions, and how does the try-catch mechanism work to handle them?",
    "Describe the purpose and usage of Java Collections Framework. What are the differences between List, Set, and Map?"
]


# Sample questions (written by hand, judging by the variable name).
manual_questions = [
    "How do Java interfaces differ from abstract classes?",
    "What is the significance of the 'final' keyword in Java?",
    "How does garbage collection work in Java, and why is it important?",
    "What are the key differences between ArrayList and LinkedList in Java?",
    "Explain the purpose of the 'volatile' keyword in multithreading in Java."
]

# Combined pool that later cells draw a random query from.
query_list = gpt4_questions + manual_questions


In [None]:
import random
# Pick one question at random; no seed is set, so each run may differ.
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.

    Args:
        query: The user's question.
        context_items: Retrieved chunk records; only each item's
            "sentence_chunk" value is used.

    Returns:
        The prompt as text, wrapped in the tokenizer's chat template with the
        generation marker appended.
    """
    # Join context items into one bulleted list ("- " before every chunk)
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, two worked examples show the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    # The Java code fence in Example 2 must be closed, otherwise everything
    # after it reads as part of the code sample. Literal braces are doubled
    # because the string goes through str.format() below.
    base_prompt = """Based on the following context items, please answer the query.
Provide answers specific to Java programming concepts, syntax, and practices.
Use examples and explanations where applicable to make your answer comprehensive.
Here are examples for reference:

Example 1:
Query: What are Java's access modifiers?
Answer: Java has four access modifiers: `public`, `protected`, `default` (no modifier), and `private`.
- `public` makes the member accessible from any other class.
- `protected` allows access within the same package and by subclasses.
- `default` (no modifier) restricts access to within the same package.
- `private` restricts access to within the class itself. These modifiers allow developers to enforce encapsulation and manage visibility effectively.

Example 2:
Query: What is the purpose of the `final` keyword in Java?
Answer: The `final` keyword in Java is used to restrict modifications:
- When applied to a variable, it makes the variable a constant, meaning its value cannot be changed once assigned.
- When applied to a method, it prevents the method from being overridden by subclasses.
- When applied to a class, it prevents the class from being extended (subclassed). For example:
```java
final int MAX_SPEED = 120; // Constant variable
final class Constants {{ }} // Class that cannot be extended
```

\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [None]:
# Build and show a full augmented prompt for one random question.
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

# Create a list of context items (the chunk records at the returned indices)
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The question to answer.
        temperature: Sampling temperature passed to generate().
        max_new_tokens: Upper bound on the number of generated tokens.
        format_answer_text: When True, strip the prompt, the "<bos>"/"<eos>"
            markers and a stock lead-in sentence from the decoded output.
        return_answer_only: When True return just the answer text, otherwise
            return `(answer_text, context_items)`.

    Returns:
        The answer text, or a tuple of the answer text and the context items
        used (each carrying a "score" entry).
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Build the context items from shallow copies, so attaching a score does
    # not leave stale "score" entries on the shared pages_and_chunks records.
    context_items = []
    for score, index in zip(scores, indices):
        context_item = dict(pages_and_chunks[index])
        context_item["score"] = score.cpu() # return score back to CPU
        context_items.append(context_item)

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt and place it on whichever device holds the model,
    # instead of assuming "cuda"
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate an output of tokens (sampling is on, so answers vary between calls)
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [None]:
# End-to-end run: pick a random question, answer it, and show both the answer
# and the context items that were put into the prompt.
query = random.choice(query_list)
print(f"Query: {query}")


# return_answer_only=False makes ask() return (answer, context_items).
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items